from __future__ import absolute_import from dejavu.database import Database import sys if sys.version_info >= (3, 0): from itertools import zip_longest as izip_longest import queue as Queue else: from itertools import izip_longest import Queue PY3 = sys.version_info >= (3, 0) class MEMDatabase(Database): """ Queries: 1) Find duplicates (shouldn't be any, though): select `hash`, `song_id`, `offset`, count(*) cnt from fingerprints group by `hash`, `song_id`, `offset` having cnt > 1 order by cnt asc; 2) Get number of hashes by song: select song_id, song_name, count(song_id) as num from fingerprints natural join songs group by song_id order by count(song_id) desc; 3) get hashes with highest number of collisions select hash, count(distinct song_id) as n from fingerprints group by `hash` order by n DESC; => 26 different songs with same fingerprint (392 times): select songs.song_name, fingerprints.offset from fingerprints natural join songs where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73"; """ type = "mem" # tables FINGERPRINTS_TABLENAME = "fingerprints" SONGS_TABLENAME = "ads" # fields FIELD_FINGERPRINTED = "fingerprinted" # creates CREATE_FINGERPRINTS_TABLE = """ CREATE TABLE IF NOT EXISTS `%s` ( `%s` binary(10) not null, `%s` int unsigned not null, `%s` int unsigned not null, INDEX (%s), UNIQUE KEY `unique_constraint` (%s, %s, %s), FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE ) ENGINE=INNODB;""" % ( FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH, Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID ) CREATE_SONGS_TABLE = """ CREATE TABLE IF NOT EXISTS `%s` ( `%s` int unsigned not null auto_increment, `%s` varchar(250) not null, `%s` bigint default 0, `%s` tinyint default 0, `%s` binary(20) not null, PRIMARY KEY (`%s`), UNIQUE KEY `%s` (`%s`) ) ENGINE=INNODB;""" % ( SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_LENGTH, FIELD_FINGERPRINTED, Database.FIELD_FILE_SHA1, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, ) # inserts (ignores duplicates) INSERT_FINGERPRINT = """ INSERT IGNORE INTO %s (%s, %s, %s) values (UNHEX(%%s), %%s, %%s); """ % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET) INSERT_SONG = "INSERT INTO %s (%s, %s, %s) values (%%s, %%s, UNHEX(%%s));" % ( SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_LENGTH, Database.FIELD_FILE_SHA1) # selects SELECT = """ SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s); """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH) SELECT_MULTIPLE = """ SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s); """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH) SELECT_MULTIPLE_FILTER = """ SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s) AND id IN (%%s); """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH) SELECT_ALL = """ SELECT %s, %s FROM %s; """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME) SELECT_SONG = """ SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s; """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID) SELECT_NUM_FINGERPRINTS = """ SELECT COUNT(*) as n FROM %s """ % (FINGERPRINTS_TABLENAME) SELECT_UNIQUE_SONG_IDS = """ SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1; """ % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED) SELECT_SONGS = """ SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1; """ % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, FIELD_FINGERPRINTED) # drops DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME # update UPDATE_SONG_FINGERPRINTED = """ UPDATE %s SET %s = 1 WHERE %s = %%s """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID) # delete DELETE_UNFINGERPRINTED = """ DELETE FROM %s WHERE %s = 0; """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED) def __init__(self, **options): super(MEMDatabase, self).__init__() self._options = options def after_fork(self): raise Exception("not implemented") # Clear the cursor cache, we don't want any stale connections from # the previous process. Cursor.clear_cache() def setup(self): """ Creates any non-existing tables required for dejavu to function. This also removes all songs that have been added but have no fingerprints associated with them. """ self.songs = {} self.fingerprints = {} #with self.cursor() as cur: # cur.execute(self.CREATE_SONGS_TABLE) # cur.execute(self.CREATE_FINGERPRINTS_TABLE) # cur.execute(self.DELETE_UNFINGERPRINTED) def empty(self): raise Exception("not implemented") """ Drops tables created by dejavu and then creates them again by calling `SQLDatabase.setup`. .. warning: This will result in a loss of data """ with self.cursor() as cur: cur.execute(self.DROP_FINGERPRINTS) cur.execute(self.DROP_SONGS) self.setup() def delete_unfingerprinted_songs(self): raise Exception("not implemented") """ Removes all songs that have no fingerprints associated with them. """ with self.cursor() as cur: cur.execute(self.DELETE_UNFINGERPRINTED) def get_num_songs(self): raise Exception("not implemented") """ Returns number of songs the database has fingerprinted. """ with self.cursor() as cur: cur.execute(self.SELECT_UNIQUE_SONG_IDS) for count, in cur: return count return 0 def get_num_fingerprints(self): raise Exception("not implemented") """ Returns number of fingerprints the database has fingerprinted. """ with self.cursor() as cur: cur.execute(self.SELECT_NUM_FINGERPRINTS) for count, in cur: return count return 0 def set_song_fingerprinted(self, sid): """ Set the fingerprinted flag to TRUE (1) once a song has been completely fingerprinted in the database. """ self.songs[sid]["fingerprinted"] = 1 #with self.cursor() as cur: # cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,)) def get_songs(self): """ Return songs that have the fingerprinted flag set TRUE (1). """ if PY3: for _, song in self.songs.items(): yield song else: for _, song in self.songs.iteritems(): yield song #with self.cursor(cursor_type=DictCursor) as cur: # cur.execute(self.SELECT_SONGS) # for row in cur: # yield row def get_song_by_id(self, sid): """ Returns song by its ID. """ return self.songs.get(sid, None) #with self.cursor(cursor_type=DictCursor) as cur: # cur.execute(self.SELECT_SONG, (sid,)) # return cur.fetchone() def insert(self, hash, sid, offset): raise Exception("not implemented") """ Insert a (sha1, song_id, offset) row into database. """ with self.cursor() as cur: cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset)) def insert_song(self, songname, length, file_hash): """ Inserts song in the database and returns the ID of the inserted record. """ self.songs[file_hash] = { Database.FIELD_SONG_ID: file_hash, Database.FIELD_SONGNAME: songname, Database.FIELD_LENGTH: length, "fingerprinted": 0, Database.FIELD_FILE_SHA1: file_hash } return file_hash #with self.cursor() as cur: # cur.execute(self.INSERT_SONG, (songname, length, file_hash)) # return cur.lastrowid def query(self, hash): raise Exception("not implemented") """ Return all tuples associated with hash. If hash is None, returns all entries in the database (be careful with that one!). """ # select all if no key query = self.SELECT_ALL if hash is None else self.SELECT with self.cursor() as cur: cur.execute(query) for sid, offset in cur: yield (sid, offset) def get_iterable_kv_pairs(self): raise Exception("not implemented") """ Returns all tuples in database. """ return self.query(None) def insert_hashes(self, sid, hashes): """ Insert series of hash => song_id, offset values into the database. """ #values = {} for hash, offset in hashes: self.fingerprints[hash] = {"sid": sid, "offset": offset} #self.fingerprints[sid] = values #values = [] #for hash, offset in hashes: # values.append((hash, sid, offset)) #with self.cursor() as cur: # for split_values in grouper(values, 1000): # cur.executemany(self.INSERT_FINGERPRINT, split_values) def return_matches(self, hashes, ads_filter=None): """ Return the (song_id, offset_diff) tuples associated with a list of (sha1, sample_offset) values. """ for hash, offset in hashes: el = self.fingerprints.get(hash, None) if el is not None: yield (el["sid"], el["offset"] - offset, el["offset"]) return # Create a dictionary of hash => offset pairs for later lookups mapper = {} for hash, offset in hashes: mapper[hash.upper()] = offset # Get an iteratable of all the hashes we need values = mapper.keys() with self.cursor() as cur: for split_values in grouper(values, 1000): # Create our IN part of the query if ads_filter is None: query = self.SELECT_MULTIPLE query = query % ', '.join(['UNHEX(%s)'] * len(split_values)) else: query = self.SELECT_MULTIPLE_FILTER query = query % (', '.join(['UNHEX(%s)'] * len(split_values)), ",".join(ads_filter)) #if ads_filter: #query = query + " and id in (" + ",".join(ads_filter) + ")" cur.execute(query, split_values) for hash, sid, offset in cur: # (sid, db_offset - song_sampled_offset) yield (sid, offset - mapper[hash], offset) def __getstate__(self): raise Exception("not implemented") return (self._options,) def __setstate__(self, state): raise Exception("not implemented") self._options, = state self.cursor = cursor_factory(**self._options) def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * n return (filter(None, values) for values in izip_longest(fillvalue=fillvalue, *args))