from __future__ import absolute_import
from dejavu.database import get_database, Database
from dejavu import decoder
from dejavu import fingerprint
from pydub.exceptions import CouldntDecodeError

import multiprocessing
import os
import sys
import traceback


class Dejavu(object):

    SONG_ID = "id"
    SONG_NAME = "name"
    CONFIDENCE = "confidence"
    MATCH_TIME = "match_time"
    OFFSET = "offset"
    OFFSET_SECS = "offset_seconds"

    def __init__(self, config):
        super(Dejavu, self).__init__()

        self.config = config

        # initialize db
        db_cls = get_database(config.get("database_type", None))

        self.db = db_cls(**config.get("database", {}))
        self.db.setup()

        # if we should limit seconds fingerprinted,
        # None|-1 means use the entire track
        self.limit = self.config.get("fingerprint_limit", None)
        if self.limit == -1:  # for JSON compatibility
            self.limit = None
        self.get_fingerprinted_songs()

    def get_fingerprinted_songs(self):
        # get songs previously indexed
        self.songs = self.db.get_songs()
        self.songhashes_set = set()  # to know which ones we've computed before
        self.songs_dict = {}
        for song in self.songs:
            song_hash = song[Database.FIELD_FILE_SHA1]
            self.songhashes_set.add(song_hash)
            self.songs_dict[song_hash] = song

    def fingerprint_directory(self, path, extensions, nprocesses=None):
        """
        Fingerprint every not-yet-indexed file under `path` matching
        `extensions`, using a pool of `nprocesses` worker processes.
        """
        # Try to use the maximum number of processes if not given.
        try:
            nprocesses = nprocesses or multiprocessing.cpu_count()
        except NotImplementedError:
            nprocesses = 1
        else:
            nprocesses = 1 if nprocesses <= 0 else nprocesses

        pool = multiprocessing.Pool(nprocesses)

        filenames_to_fingerprint = []
        for filename, _ in decoder.find_files(path, extensions):

            # don't refingerprint already fingerprinted files
            if decoder.unique_hash(filename) in self.songhashes_set:
                print("%s already fingerprinted, continuing..." % filename)
                continue

            filenames_to_fingerprint.append(filename)

        # Prepare _fingerprint_worker input
        worker_input = zip(filenames_to_fingerprint,
                           [self.limit] * len(filenames_to_fingerprint))

        # Send off our tasks
        iterator = pool.imap_unordered(_fingerprint_worker, worker_input)

        # Loop until we have all of them
        while True:
            try:
                song_name, length, hashes, file_hash = next(iterator)
            except multiprocessing.TimeoutError:
                continue
            except StopIteration:
                break
            except Exception:
                print("Failed fingerprinting")
                # Print traceback because we can't reraise it here
                traceback.print_exc(file=sys.stdout)
            else:
                sid = self.db.insert_song(song_name, length, file_hash)

                self.db.insert_hashes(sid, hashes)
                self.db.set_song_fingerprinted(sid)
                self.get_fingerprinted_songs()

        pool.close()
        pool.join()

    def fingerprint_file(self, filepath, song_name=None):
        """
        Fingerprint a single file and store its hashes.

        Returns (song_id, length, hash_count, "") on success, or
        (-1, -1, -1, name) if the file was already fingerprinted.
        """
        songname = decoder.path_to_songname(filepath)
        song_hash = decoder.unique_hash(filepath)
        song_name = song_name or songname

        # don't refingerprint already fingerprinted files
        if song_hash in self.songhashes_set:
            song = self.songs_dict[song_hash]
            return -1, -1, -1, song["name"]  # song already fingerprinted
        else:
            song_name, length, hashes, file_hash = _fingerprint_worker(
                filepath,
                self.limit,
                song_name=song_name
            )
            sid = self.db.insert_song(song_name, length, file_hash)

            self.db.insert_hashes(sid, hashes)
            self.db.set_song_fingerprinted(sid)
            self.get_fingerprinted_songs()
            return sid, length, len(hashes), ""

    def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS, ads_filter=None):
        """Fingerprint raw samples and look the hashes up in the database.
        `ads_filter` is passed through to the database layer."""
        hashes = fingerprint.fingerprint(samples, Fs=Fs)
        return self.db.return_matches(hashes, ads_filter)

    def align_matches(self, matches):
        """
        Finds hash matches that align in time with other matches and finds
        consensus about which hashes are "true" signal from the audio.

        Returns a dictionary with match information.
        """
        # align by diffs
        diff_counter = {}
        largest = 0
        largest_offset = 0
        largest_count = 0
        song_id = -1
        for tup in matches:
            sid, diff, offset = tup
            if diff not in diff_counter:
                diff_counter[diff] = {}
            if sid not in diff_counter[diff]:
                diff_counter[diff][sid] = 0
            diff_counter[diff][sid] += 1

            if diff_counter[diff][sid] > largest_count:
                largest = diff
                largest_offset = offset
                largest_count = diff_counter[diff][sid]
                song_id = sid

        # extract identification
        song = self.db.get_song_by_id(song_id)
        if song:
            # TODO: Clarify what `get_song_by_id` should return.
            songname = song.get(Dejavu.SONG_NAME, None)
        else:
            return None

        # return match info
        nseconds = round(float(largest) / fingerprint.DEFAULT_FS *
                         fingerprint.DEFAULT_WINDOW_SIZE *
                         (1 - fingerprint.DEFAULT_OVERLAP_RATIO), 5)
        song = {
            Dejavu.SONG_ID: song_id,
            Dejavu.SONG_NAME: songname,
            Dejavu.CONFIDENCE: largest_count,
            Dejavu.OFFSET: int(largest),
            Dejavu.OFFSET_SECS: nseconds,
            'position': largest_offset,
            Database.FIELD_FILE_SHA1: song.get(Database.FIELD_FILE_SHA1, None),
        }
        return song

    def recognize(self, recognizer, *options, **kwoptions):
        """Instantiate `recognizer` with this Dejavu instance and run it."""
        r = recognizer(self)
        return r.recognize(*options, **kwoptions)


def _fingerprint_worker(filename, limit=None, song_name=None):
    """
    Decode `filename` and fingerprint every audio channel.

    Returns (song_name, length, hashes, file_hash).
    """
    # Pool.imap sends arguments as tuples so we have to unpack
    # them ourselves.
    try:
        filename, limit = filename
    except ValueError:
        pass

    songname, extension = os.path.splitext(os.path.basename(filename))
    song_name = song_name or songname
    channels, Fs, file_hash, length = decoder.read(filename, limit)
    result = set()
    channel_amount = len(channels)

    for channeln, channel in enumerate(channels):
        # TODO: Remove prints or change them into optional logging.
        # print("Fingerprinting channel %d/%d for %s" % (channeln + 1,
        #                                                channel_amount,
        #                                                filename))
        hashes = fingerprint.fingerprint(channel, Fs=Fs)
        # print("Finished channel %d/%d for %s" % (channeln + 1,
        #                                          channel_amount,
        #                                          filename))
        result |= set(hashes)

    return song_name, length, result, file_hash


def chunkify(lst, n):
    """
    Splits a list into roughly n equal parts.
    http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
    """
    return [lst[i::n] for i in range(n)]
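

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, guarded so importing this module is not
# affected). The "database_type", "database" and "fingerprint_limit" keys are
# the ones read by Dejavu.__init__ above; the backend name "mysql" and the
# connection settings (host/user/passwd/db) are hypothetical placeholders that
# depend on whatever get_database() returns in this fork. Recognition would go
# through Dejavu.recognize(SomeRecognizer, ...) with a recognizer class from a
# separate module, which is not shown here.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    example_config = {
        "database_type": "mysql",   # hypothetical backend name for get_database()
        "database": {               # kwargs forwarded to the backend class
            "host": "127.0.0.1",
            "user": "root",
            "passwd": "",           # placeholder credentials
            "db": "dejavu",
        },
        "fingerprint_limit": 10,    # seconds per track; -1/None = whole track
    }

    djv = Dejavu(example_config)

    # Fingerprint every .mp3 under ./mp3 using all available CPU cores,
    # skipping files whose hash is already in the database.
    djv.fingerprint_directory("mp3", [".mp3"])

    # Fingerprint a single file; returns (song_id, length, hash_count, "").
    sid, length, hash_count, _ = djv.fingerprint_file("mp3/example.mp3")
    print("Inserted song %s with %d hashes" % (sid, hash_count))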