|
@@ -0,0 +1,372 @@
|
|
|
+from __future__ import absolute_import
|
|
|
+from dejavu.database import Database
|
|
|
+import sys
|
|
|
+
|
|
|
+if sys.version_info >= (3, 0):
|
|
|
+ from itertools import zip_longest as izip_longest
|
|
|
+ import queue as Queue
|
|
|
+else:
|
|
|
+ from itertools import izip_longest
|
|
|
+ import Queue
|
|
|
+
|
|
|
+PY3 = sys.version_info >= (3, 0)
|
|
|
+
|
|
|
+
|
|
|
+class MEMDatabase(Database):
|
|
|
+ """
|
|
|
+ Queries:
|
|
|
+
|
|
|
+ 1) Find duplicates (shouldn't be any, though):
|
|
|
+
|
|
|
+ select `hash`, `song_id`, `offset`, count(*) cnt
|
|
|
+ from fingerprints
|
|
|
+ group by `hash`, `song_id`, `offset`
|
|
|
+ having cnt > 1
|
|
|
+ order by cnt asc;
|
|
|
+
|
|
|
+ 2) Get number of hashes by song:
|
|
|
+
|
|
|
+ select song_id, song_name, count(song_id) as num
|
|
|
+ from fingerprints
|
|
|
+ natural join songs
|
|
|
+ group by song_id
|
|
|
+ order by count(song_id) desc;
|
|
|
+
|
|
|
+ 3) get hashes with highest number of collisions
|
|
|
+
|
|
|
+ select
|
|
|
+ hash,
|
|
|
+ count(distinct song_id) as n
|
|
|
+ from fingerprints
|
|
|
+ group by `hash`
|
|
|
+ order by n DESC;
|
|
|
+
|
|
|
+ => 26 different songs with same fingerprint (392 times):
|
|
|
+
|
|
|
+ select songs.song_name, fingerprints.offset
|
|
|
+ from fingerprints natural join songs
|
|
|
+ where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73";
|
|
|
+ """
|
|
|
+
|
|
|
+ type = "mem"
|
|
|
+
|
|
|
+ # tables
|
|
|
+ FINGERPRINTS_TABLENAME = "fingerprints"
|
|
|
+ SONGS_TABLENAME = "ads"
|
|
|
+
|
|
|
+ # fields
|
|
|
+ FIELD_FINGERPRINTED = "fingerprinted"
|
|
|
+
|
|
|
+ # creates
|
|
|
+ CREATE_FINGERPRINTS_TABLE = """
|
|
|
+ CREATE TABLE IF NOT EXISTS `%s` (
|
|
|
+ `%s` binary(10) not null,
|
|
|
+ `%s` int unsigned not null,
|
|
|
+ `%s` int unsigned not null,
|
|
|
+ INDEX (%s),
|
|
|
+ UNIQUE KEY `unique_constraint` (%s, %s, %s),
|
|
|
+ FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
|
|
|
+ ) ENGINE=INNODB;""" % (
|
|
|
+ FINGERPRINTS_TABLENAME, Database.FIELD_HASH,
|
|
|
+ Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
|
|
|
+ Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
|
|
|
+ Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID
|
|
|
+ )
|
|
|
+
|
|
|
+ CREATE_SONGS_TABLE = """
|
|
|
+ CREATE TABLE IF NOT EXISTS `%s` (
|
|
|
+ `%s` int unsigned not null auto_increment,
|
|
|
+ `%s` varchar(250) not null,
|
|
|
+ `%s` bigint default 0,
|
|
|
+ `%s` tinyint default 0,
|
|
|
+ `%s` binary(20) not null,
|
|
|
+ PRIMARY KEY (`%s`),
|
|
|
+ UNIQUE KEY `%s` (`%s`)
|
|
|
+ ) ENGINE=INNODB;""" % (
|
|
|
+ SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_LENGTH, FIELD_FINGERPRINTED,
|
|
|
+ Database.FIELD_FILE_SHA1,
|
|
|
+ Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID,
|
|
|
+ )
|
|
|
+
|
|
|
+ # inserts (ignores duplicates)
|
|
|
+ INSERT_FINGERPRINT = """
|
|
|
+ INSERT IGNORE INTO %s (%s, %s, %s) values
|
|
|
+ (UNHEX(%%s), %%s, %%s);
|
|
|
+ """ % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
|
|
|
+
|
|
|
+ INSERT_SONG = "INSERT INTO %s (%s, %s, %s) values (%%s, %%s, UNHEX(%%s));" % (
|
|
|
+ SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_LENGTH, Database.FIELD_FILE_SHA1)
|
|
|
+
|
|
|
+ # selects
|
|
|
+ SELECT = """
|
|
|
+ SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
|
|
|
+ """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
|
|
|
+
|
|
|
+ SELECT_MULTIPLE = """
|
|
|
+ SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
|
|
|
+ """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
|
|
|
+ FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
|
|
|
+
|
|
|
+ SELECT_MULTIPLE_FILTER = """
|
|
|
+ SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s) AND id IN (%%s);
|
|
|
+ """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
|
|
|
+ FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
|
|
|
+
|
|
|
+ SELECT_ALL = """
|
|
|
+ SELECT %s, %s FROM %s;
|
|
|
+ """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME)
|
|
|
+
|
|
|
+ SELECT_SONG = """
|
|
|
+ SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s;
|
|
|
+ """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID)
|
|
|
+
|
|
|
+ SELECT_NUM_FINGERPRINTS = """
|
|
|
+ SELECT COUNT(*) as n FROM %s
|
|
|
+ """ % (FINGERPRINTS_TABLENAME)
|
|
|
+
|
|
|
+ SELECT_UNIQUE_SONG_IDS = """
|
|
|
+ SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
|
|
|
+ """ % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
|
|
+
|
|
|
+ SELECT_SONGS = """
|
|
|
+ SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1;
|
|
|
+ """ % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1,
|
|
|
+ SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
|
|
+
|
|
|
+ # drops
|
|
|
+ DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
|
|
|
+ DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME
|
|
|
+
|
|
|
+ # update
|
|
|
+ UPDATE_SONG_FINGERPRINTED = """
|
|
|
+ UPDATE %s SET %s = 1 WHERE %s = %%s
|
|
|
+ """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID)
|
|
|
+
|
|
|
+ # delete
|
|
|
+ DELETE_UNFINGERPRINTED = """
|
|
|
+ DELETE FROM %s WHERE %s = 0;
|
|
|
+ """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED)
|
|
|
+
|
|
|
+ def __init__(self, **options):
|
|
|
+ super(MEMDatabase, self).__init__()
|
|
|
+ self._options = options
|
|
|
+
|
|
|
+
|
|
|
+ def after_fork(self):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ # Clear the cursor cache, we don't want any stale connections from
|
|
|
+ # the previous process.
|
|
|
+ Cursor.clear_cache()
|
|
|
+
|
|
|
+ def setup(self):
|
|
|
+ """
|
|
|
+ Creates any non-existing tables required for dejavu to function.
|
|
|
+
|
|
|
+ This also removes all songs that have been added but have no
|
|
|
+ fingerprints associated with them.
|
|
|
+ """
|
|
|
+ self.songs = {}
|
|
|
+ self.fingerprints = {}
|
|
|
+ #with self.cursor() as cur:
|
|
|
+ # cur.execute(self.CREATE_SONGS_TABLE)
|
|
|
+ # cur.execute(self.CREATE_FINGERPRINTS_TABLE)
|
|
|
+ # cur.execute(self.DELETE_UNFINGERPRINTED)
|
|
|
+
|
|
|
+ def empty(self):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ """
|
|
|
+ Drops tables created by dejavu and then creates them again
|
|
|
+ by calling `SQLDatabase.setup`.
|
|
|
+
|
|
|
+ .. warning:
|
|
|
+ This will result in a loss of data
|
|
|
+ """
|
|
|
+ with self.cursor() as cur:
|
|
|
+ cur.execute(self.DROP_FINGERPRINTS)
|
|
|
+ cur.execute(self.DROP_SONGS)
|
|
|
+
|
|
|
+ self.setup()
|
|
|
+
|
|
|
+ def delete_unfingerprinted_songs(self):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ """
|
|
|
+ Removes all songs that have no fingerprints associated with them.
|
|
|
+ """
|
|
|
+ with self.cursor() as cur:
|
|
|
+ cur.execute(self.DELETE_UNFINGERPRINTED)
|
|
|
+
|
|
|
+ def get_num_songs(self):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ """
|
|
|
+ Returns number of songs the database has fingerprinted.
|
|
|
+ """
|
|
|
+ with self.cursor() as cur:
|
|
|
+ cur.execute(self.SELECT_UNIQUE_SONG_IDS)
|
|
|
+
|
|
|
+ for count, in cur:
|
|
|
+ return count
|
|
|
+ return 0
|
|
|
+
|
|
|
+ def get_num_fingerprints(self):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ """
|
|
|
+ Returns number of fingerprints the database has fingerprinted.
|
|
|
+ """
|
|
|
+ with self.cursor() as cur:
|
|
|
+ cur.execute(self.SELECT_NUM_FINGERPRINTS)
|
|
|
+
|
|
|
+ for count, in cur:
|
|
|
+ return count
|
|
|
+ return 0
|
|
|
+
|
|
|
+ def set_song_fingerprinted(self, sid):
|
|
|
+ """
|
|
|
+ Set the fingerprinted flag to TRUE (1) once a song has been completely
|
|
|
+ fingerprinted in the database.
|
|
|
+ """
|
|
|
+ self.songs[sid]["fingerprinted"] = 1
|
|
|
+ #with self.cursor() as cur:
|
|
|
+ # cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,))
|
|
|
+
|
|
|
+ def get_songs(self):
|
|
|
+ """
|
|
|
+ Return songs that have the fingerprinted flag set TRUE (1).
|
|
|
+ """
|
|
|
+ if PY3:
|
|
|
+ for _, song in self.songs.items():
|
|
|
+ yield song
|
|
|
+ else:
|
|
|
+ for _, song in self.songs.iteritems():
|
|
|
+ yield song
|
|
|
+ #with self.cursor(cursor_type=DictCursor) as cur:
|
|
|
+ # cur.execute(self.SELECT_SONGS)
|
|
|
+ # for row in cur:
|
|
|
+ # yield row
|
|
|
+
|
|
|
+ def get_song_by_id(self, sid):
|
|
|
+ """
|
|
|
+ Returns song by its ID.
|
|
|
+ """
|
|
|
+ return self.songs.get(sid, None)
|
|
|
+ #with self.cursor(cursor_type=DictCursor) as cur:
|
|
|
+ # cur.execute(self.SELECT_SONG, (sid,))
|
|
|
+ # return cur.fetchone()
|
|
|
+
|
|
|
+ def insert(self, hash, sid, offset):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ """
|
|
|
+ Insert a (sha1, song_id, offset) row into database.
|
|
|
+ """
|
|
|
+ with self.cursor() as cur:
|
|
|
+ cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
|
|
|
+
|
|
|
+ def insert_song(self, songname, length, file_hash):
|
|
|
+ """
|
|
|
+ Inserts song in the database and returns the ID of the inserted record.
|
|
|
+ """
|
|
|
+ self.songs[file_hash] = {
|
|
|
+ Database.FIELD_SONG_ID: file_hash,
|
|
|
+ Database.FIELD_SONGNAME: songname,
|
|
|
+ Database.FIELD_LENGTH: length,
|
|
|
+ "fingerprinted": 0,
|
|
|
+ Database.FIELD_FILE_SHA1: file_hash
|
|
|
+ }
|
|
|
+ return file_hash
|
|
|
+ #with self.cursor() as cur:
|
|
|
+ # cur.execute(self.INSERT_SONG, (songname, length, file_hash))
|
|
|
+ # return cur.lastrowid
|
|
|
+
|
|
|
+ def query(self, hash):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ """
|
|
|
+ Return all tuples associated with hash.
|
|
|
+
|
|
|
+ If hash is None, returns all entries in the
|
|
|
+ database (be careful with that one!).
|
|
|
+ """
|
|
|
+ # select all if no key
|
|
|
+ query = self.SELECT_ALL if hash is None else self.SELECT
|
|
|
+
|
|
|
+ with self.cursor() as cur:
|
|
|
+ cur.execute(query)
|
|
|
+ for sid, offset in cur:
|
|
|
+ yield (sid, offset)
|
|
|
+
|
|
|
+ def get_iterable_kv_pairs(self):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ """
|
|
|
+ Returns all tuples in database.
|
|
|
+ """
|
|
|
+ return self.query(None)
|
|
|
+
|
|
|
+ def insert_hashes(self, sid, hashes):
|
|
|
+ """
|
|
|
+ Insert series of hash => song_id, offset
|
|
|
+ values into the database.
|
|
|
+ """
|
|
|
+ #values = {}
|
|
|
+ for hash, offset in hashes:
|
|
|
+ self.fingerprints[hash] = {"sid": sid, "offset": offset}
|
|
|
+
|
|
|
+ #self.fingerprints[sid] = values
|
|
|
+
|
|
|
+ #values = []
|
|
|
+ #for hash, offset in hashes:
|
|
|
+ # values.append((hash, sid, offset))
|
|
|
+
|
|
|
+ #with self.cursor() as cur:
|
|
|
+ # for split_values in grouper(values, 1000):
|
|
|
+ # cur.executemany(self.INSERT_FINGERPRINT, split_values)
|
|
|
+
|
|
|
+ def return_matches(self, hashes, ads_filter=None):
|
|
|
+ """
|
|
|
+ Return the (song_id, offset_diff) tuples associated with
|
|
|
+ a list of (sha1, sample_offset) values.
|
|
|
+ """
|
|
|
+ for hash, offset in hashes:
|
|
|
+ el = self.fingerprints.get(hash, None)
|
|
|
+ if el is not None:
|
|
|
+ yield (el["sid"], el["offset"] - offset, el["offset"])
|
|
|
+
|
|
|
+ return
|
|
|
+ # Create a dictionary of hash => offset pairs for later lookups
|
|
|
+ mapper = {}
|
|
|
+ for hash, offset in hashes:
|
|
|
+ mapper[hash.upper()] = offset
|
|
|
+
|
|
|
+ # Get an iteratable of all the hashes we need
|
|
|
+ values = mapper.keys()
|
|
|
+
|
|
|
+ with self.cursor() as cur:
|
|
|
+ for split_values in grouper(values, 1000):
|
|
|
+ # Create our IN part of the query
|
|
|
+ if ads_filter is None:
|
|
|
+ query = self.SELECT_MULTIPLE
|
|
|
+ query = query % ', '.join(['UNHEX(%s)'] * len(split_values))
|
|
|
+ else:
|
|
|
+ query = self.SELECT_MULTIPLE_FILTER
|
|
|
+ query = query % (', '.join(['UNHEX(%s)'] * len(split_values)), ",".join(ads_filter))
|
|
|
+
|
|
|
+ #if ads_filter:
|
|
|
+ #query = query + " and id in (" + ",".join(ads_filter) + ")"
|
|
|
+
|
|
|
+ cur.execute(query, split_values)
|
|
|
+
|
|
|
+ for hash, sid, offset in cur:
|
|
|
+ # (sid, db_offset - song_sampled_offset)
|
|
|
+ yield (sid, offset - mapper[hash], offset)
|
|
|
+
|
|
|
+ def __getstate__(self):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ return (self._options,)
|
|
|
+
|
|
|
+ def __setstate__(self, state):
|
|
|
+ raise Exception("not implemented")
|
|
|
+ self._options, = state
|
|
|
+ self.cursor = cursor_factory(**self._options)
|
|
|
+
|
|
|
+
|
|
|
+def grouper(iterable, n, fillvalue=None):
|
|
|
+ args = [iter(iterable)] * n
|
|
|
+ return (filter(None, values) for values
|
|
|
+ in izip_longest(fillvalue=fillvalue, *args))
|