123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373 |
- from __future__ import absolute_import
- from dejavu.database import Database
- import sys
- if sys.version_info >= (3, 0):
- from itertools import zip_longest as izip_longest
- import queue as Queue
- else:
- from itertools import izip_longest
- import Queue
- PY3 = sys.version_info >= (3, 0)
- class MEMDatabase(Database):
- """
- Queries:
- 1) Find duplicates (shouldn't be any, though):
- select `hash`, `song_id`, `offset`, count(*) cnt
- from fingerprints
- group by `hash`, `song_id`, `offset`
- having cnt > 1
- order by cnt asc;
- 2) Get number of hashes by song:
- select song_id, song_name, count(song_id) as num
- from fingerprints
- natural join songs
- group by song_id
- order by count(song_id) desc;
- 3) get hashes with highest number of collisions
- select
- hash,
- count(distinct song_id) as n
- from fingerprints
- group by `hash`
- order by n DESC;
- => 26 different songs with same fingerprint (392 times):
- select songs.song_name, fingerprints.offset
- from fingerprints natural join songs
- where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73";
- """
- type = "mem"
- # tables
- FINGERPRINTS_TABLENAME = "fingerprints"
- SONGS_TABLENAME = "ads"
- # fields
- FIELD_FINGERPRINTED = "fingerprinted"
- # creates
- CREATE_FINGERPRINTS_TABLE = """
- CREATE TABLE IF NOT EXISTS `%s` (
- `%s` binary(10) not null,
- `%s` int unsigned not null,
- `%s` int unsigned not null,
- INDEX (%s),
- UNIQUE KEY `unique_constraint` (%s, %s, %s),
- FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
- ) ENGINE=INNODB;""" % (
- FINGERPRINTS_TABLENAME, Database.FIELD_HASH,
- Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
- Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
- Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID
- )
- CREATE_SONGS_TABLE = """
- CREATE TABLE IF NOT EXISTS `%s` (
- `%s` int unsigned not null auto_increment,
- `%s` varchar(250) not null,
- `%s` bigint default 0,
- `%s` tinyint default 0,
- `%s` binary(20) not null,
- PRIMARY KEY (`%s`),
- UNIQUE KEY `%s` (`%s`)
- ) ENGINE=INNODB;""" % (
- SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_LENGTH, FIELD_FINGERPRINTED,
- Database.FIELD_FILE_SHA1,
- Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID,
- )
- # inserts (ignores duplicates)
- INSERT_FINGERPRINT = """
- INSERT IGNORE INTO %s (%s, %s, %s) values
- (UNHEX(%%s), %%s, %%s);
- """ % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
- INSERT_SONG = "INSERT INTO %s (%s, %s, %s) values (%%s, %%s, UNHEX(%%s));" % (
- SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_LENGTH, Database.FIELD_FILE_SHA1)
- # selects
- SELECT = """
- SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
- """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
- SELECT_MULTIPLE = """
- SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
- """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
- FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
- SELECT_MULTIPLE_FILTER = """
- SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s) AND id IN (%%s);
- """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
- FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
- SELECT_ALL = """
- SELECT %s, %s FROM %s;
- """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME)
- SELECT_SONG = """
- SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s;
- """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID)
- SELECT_NUM_FINGERPRINTS = """
- SELECT COUNT(*) as n FROM %s
- """ % (FINGERPRINTS_TABLENAME)
- SELECT_UNIQUE_SONG_IDS = """
- SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
- """ % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
- SELECT_SONGS = """
- SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1;
- """ % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1,
- SONGS_TABLENAME, FIELD_FINGERPRINTED)
- # drops
- DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
- DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME
- # update
- UPDATE_SONG_FINGERPRINTED = """
- UPDATE %s SET %s = 1 WHERE %s = %%s
- """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID)
- # delete
- DELETE_UNFINGERPRINTED = """
- DELETE FROM %s WHERE %s = 0;
- """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED)
- def __init__(self, **options):
- super(MEMDatabase, self).__init__()
- self._options = options
-
- def after_fork(self):
- raise Exception("not implemented")
- # Clear the cursor cache, we don't want any stale connections from
- # the previous process.
- Cursor.clear_cache()
- def setup(self):
- """
- Creates any non-existing tables required for dejavu to function.
- This also removes all songs that have been added but have no
- fingerprints associated with them.
- """
- self.songs = {}
- self.fingerprints = {}
- #with self.cursor() as cur:
- # cur.execute(self.CREATE_SONGS_TABLE)
- # cur.execute(self.CREATE_FINGERPRINTS_TABLE)
- # cur.execute(self.DELETE_UNFINGERPRINTED)
- def empty(self):
- raise Exception("not implemented")
- """
- Drops tables created by dejavu and then creates them again
- by calling `SQLDatabase.setup`.
- .. warning:
- This will result in a loss of data
- """
- with self.cursor() as cur:
- cur.execute(self.DROP_FINGERPRINTS)
- cur.execute(self.DROP_SONGS)
- self.setup()
- def delete_unfingerprinted_songs(self):
- raise Exception("not implemented")
- """
- Removes all songs that have no fingerprints associated with them.
- """
- with self.cursor() as cur:
- cur.execute(self.DELETE_UNFINGERPRINTED)
- def get_num_songs(self):
- raise Exception("not implemented")
- """
- Returns number of songs the database has fingerprinted.
- """
- with self.cursor() as cur:
- cur.execute(self.SELECT_UNIQUE_SONG_IDS)
- for count, in cur:
- return count
- return 0
- def get_num_fingerprints(self):
- raise Exception("not implemented")
- """
- Returns number of fingerprints the database has fingerprinted.
- """
- with self.cursor() as cur:
- cur.execute(self.SELECT_NUM_FINGERPRINTS)
- for count, in cur:
- return count
- return 0
- def set_song_fingerprinted(self, sid):
- """
- Set the fingerprinted flag to TRUE (1) once a song has been completely
- fingerprinted in the database.
- """
- self.songs[sid]["fingerprinted"] = 1
- #with self.cursor() as cur:
- # cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,))
- def get_songs(self):
- """
- Return songs that have the fingerprinted flag set TRUE (1).
- """
- if PY3:
- for _, song in self.songs.items():
- yield song
- else:
- for _, song in self.songs.iteritems():
- yield song
- #with self.cursor(cursor_type=DictCursor) as cur:
- # cur.execute(self.SELECT_SONGS)
- # for row in cur:
- # yield row
- def get_song_by_id(self, sid):
- """
- Returns song by its ID.
- """
- return self.songs.get(sid, None)
- #with self.cursor(cursor_type=DictCursor) as cur:
- # cur.execute(self.SELECT_SONG, (sid,))
- # return cur.fetchone()
- def insert(self, hash, sid, offset):
- raise Exception("not implemented")
- """
- Insert a (sha1, song_id, offset) row into database.
- """
- with self.cursor() as cur:
- cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
- def insert_song(self, songname, length, file_hash):
- """
- Inserts song in the database and returns the ID of the inserted record.
- """
- self.songs[file_hash] = {
- Database.FIELD_SONG_ID: file_hash,
- Database.FIELD_SONGNAME: songname,
- Database.FIELD_LENGTH: length,
- "fingerprinted": 0,
- Database.FIELD_FILE_SHA1: file_hash
- }
- return file_hash
- #with self.cursor() as cur:
- # cur.execute(self.INSERT_SONG, (songname, length, file_hash))
- # return cur.lastrowid
- def query(self, hash):
- raise Exception("not implemented")
- """
- Return all tuples associated with hash.
- If hash is None, returns all entries in the
- database (be careful with that one!).
- """
- # select all if no key
- query = self.SELECT_ALL if hash is None else self.SELECT
- with self.cursor() as cur:
- cur.execute(query)
- for sid, offset in cur:
- yield (sid, offset)
- def get_iterable_kv_pairs(self):
- raise Exception("not implemented")
- """
- Returns all tuples in database.
- """
- return self.query(None)
- def insert_hashes(self, sid, hashes):
- """
- Insert series of hash => song_id, offset
- values into the database.
- """
- #values = {}
- for hash, offset in hashes:
- self.fingerprints[hash] = {"sid": sid, "offset": offset}
- #self.fingerprints[sid] = values
- #values = []
- #for hash, offset in hashes:
- # values.append((hash, sid, offset))
- #with self.cursor() as cur:
- # for split_values in grouper(values, 1000):
- # cur.executemany(self.INSERT_FINGERPRINT, split_values)
- def return_matches(self, hashes, ads_filter=None):
- """
- Return the (song_id, offset_diff) tuples associated with
- a list of (sha1, sample_offset) values.
- """
- for hash, offset in hashes:
- el = self.fingerprints.get(hash, None)
- if el is not None:
- yield (el["sid"], el["offset"] - offset, el["offset"])
- return
- # Create a dictionary of hash => offset pairs for later lookups
- mapper = {}
- for hash, offset in hashes:
- mapper[hash.upper()] = offset
- # Get an iteratable of all the hashes we need
- values = mapper.keys()
- with self.cursor() as cur:
- for split_values in grouper(values, 1000):
- # Create our IN part of the query
- if ads_filter is None:
- query = self.SELECT_MULTIPLE
- query = query % ', '.join(['UNHEX(%s)'] * len(split_values))
- else:
- query = self.SELECT_MULTIPLE_FILTER
- query = query % (', '.join(['UNHEX(%s)'] * len(split_values)), ",".join(ads_filter))
- #if ads_filter:
- #query = query + " and id in (" + ",".join(ads_filter) + ")"
- cur.execute(query, split_values)
- for hash, sid, offset in cur:
- # (sid, db_offset - song_sampled_offset)
- yield (sid, offset - mapper[hash], offset)
- def __getstate__(self):
- raise Exception("not implemented")
- return (self._options,)
- def __setstate__(self, state):
- raise Exception("not implemented")
- self._options, = state
- self.cursor = cursor_factory(**self._options)
- def grouper(iterable, n, fillvalue=None):
- args = [iter(iterable)] * n
- return (filter(None, values) for values
- in izip_longest(fillvalue=fillvalue, *args))
|