__init__.py 7.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212
  1. from __future__ import absolute_import
  2. from dejavu.database import get_database, Database
  3. from dejavu import decoder
  4. from dejavu import fingerprint
  5. from pydub.exceptions import CouldntDecodeError
  6. import multiprocessing
  7. import os
  8. import traceback
  9. import sys
  10. class Dejavu(object):
  11. SONG_ID = "id"
  12. SONG_NAME = 'name'
  13. CONFIDENCE = 'confidence'
  14. MATCH_TIME = 'match_time'
  15. OFFSET = 'offset'
  16. OFFSET_SECS = 'offset_seconds'
  17. def __init__(self, config):
  18. super(Dejavu, self).__init__()
  19. self.config = config
  20. # initialize db
  21. db_cls = get_database(config.get("database_type", None))
  22. self.db = db_cls(**config.get("database", {}))
  23. self.db.setup()
  24. # if we should limit seconds fingerprinted,
  25. # None|-1 means use entire track
  26. self.limit = self.config.get("fingerprint_limit", None)
  27. if self.limit == -1: # for JSON compatibility
  28. self.limit = None
  29. self.get_fingerprinted_songs()
  30. def get_fingerprinted_songs(self):
  31. # get songs previously indexed
  32. self.songs = self.db.get_songs()
  33. self.songhashes_set = set() # to know which ones we've computed before
  34. self.songs_dict = {}
  35. for song in self.songs:
  36. song_hash = song[Database.FIELD_FILE_SHA1]
  37. self.songhashes_set.add(song_hash)
  38. self.songs_dict[song_hash] = song
  39. def fingerprint_directory(self, path, extensions, nprocesses=None):
  40. # Try to use the maximum amount of processes if not given.
  41. try:
  42. nprocesses = nprocesses or multiprocessing.cpu_count()
  43. except NotImplementedError:
  44. nprocesses = 1
  45. else:
  46. nprocesses = 1 if nprocesses <= 0 else nprocesses
  47. pool = multiprocessing.Pool(nprocesses)
  48. filenames_to_fingerprint = []
  49. for filename, _ in decoder.find_files(path, extensions):
  50. # don't refingerprint already fingerprinted files
  51. if decoder.unique_hash(filename) in self.songhashes_set:
  52. print("%s already fingerprinted, continuing..." % filename)
  53. continue
  54. filenames_to_fingerprint.append(filename)
  55. # Prepare _fingerprint_worker input
  56. worker_input = zip(filenames_to_fingerprint,
  57. [self.limit] * len(filenames_to_fingerprint))
  58. # Send off our tasks
  59. iterator = pool.imap_unordered(_fingerprint_worker,
  60. worker_input)
  61. # Loop till we have all of them
  62. while True:
  63. try:
  64. song_name, hashes, file_hash = iterator.next()
  65. except multiprocessing.TimeoutError:
  66. continue
  67. except StopIteration:
  68. break
  69. except:
  70. print("Failed fingerprinting")
  71. # Print traceback because we can't reraise it here
  72. traceback.print_exc(file=sys.stdout)
  73. else:
  74. sid = self.db.insert_song(song_name, file_hash)
  75. self.db.insert_hashes(sid, hashes)
  76. self.db.set_song_fingerprinted(sid)
  77. self.get_fingerprinted_songs()
  78. pool.close()
  79. pool.join()
  80. def fingerprint_file(self, filepath, song_name=None):
  81. songname = decoder.path_to_songname(filepath)
  82. song_hash = decoder.unique_hash(filepath)
  83. song_name = song_name or songname
  84. # don't refingerprint already fingerprinted files
  85. if song_hash in self.songhashes_set:
  86. song = self.songs_dict[song_hash]
  87. return -1,-1,-1, song["name"] #song already fingerprinted
  88. else:
  89. song_name, length, hashes, file_hash = _fingerprint_worker(
  90. filepath,
  91. self.limit,
  92. song_name=song_name
  93. )
  94. sid = self.db.insert_song(song_name, length, file_hash)
  95. self.db.insert_hashes(sid, hashes)
  96. self.db.set_song_fingerprinted(sid)
  97. self.get_fingerprinted_songs()
  98. return sid, length, len(hashes), ""
  99. def find_matches(self, samples, Fs=fingerprint.DEFAULT_FS, ads_filter=None):
  100. hashes = fingerprint.fingerprint(samples, Fs=Fs)
  101. return self.db.return_matches(hashes, ads_filter)
  102. def align_matches(self, matches):
  103. """
  104. Finds hash matches that align in time with other matches and finds
  105. consensus about which hashes are "true" signal from the audio.
  106. Returns a dictionary with match information.
  107. """
  108. # align by diffs
  109. diff_counter = {}
  110. largest = 0
  111. largest_offset = 0
  112. largest_count = 0
  113. song_id = -1
  114. for tup in matches:
  115. sid, diff, offset = tup
  116. if diff not in diff_counter:
  117. diff_counter[diff] = {}
  118. if sid not in diff_counter[diff]:
  119. diff_counter[diff][sid] = 0
  120. diff_counter[diff][sid] += 1
  121. if diff_counter[diff][sid] > largest_count:
  122. largest = diff
  123. largest_offset = offset
  124. largest_count = diff_counter[diff][sid]
  125. song_id = sid
  126. # extract idenfication
  127. song = self.db.get_song_by_id(song_id)
  128. if song:
  129. # TODO: Clarify what `get_song_by_id` should return.
  130. songname = song.get(Dejavu.SONG_NAME, None)
  131. else:
  132. return None
  133. # return match info
  134. nseconds = round(float(largest) / fingerprint.DEFAULT_FS *
  135. fingerprint.DEFAULT_WINDOW_SIZE *
  136. (1 - fingerprint.DEFAULT_OVERLAP_RATIO), 5)
  137. song = {
  138. Dejavu.SONG_ID : song_id,
  139. Dejavu.SONG_NAME : songname,
  140. Dejavu.CONFIDENCE : largest_count,
  141. Dejavu.OFFSET : int(largest),
  142. Dejavu.OFFSET_SECS : nseconds,
  143. 'position': largest_offset,
  144. Database.FIELD_FILE_SHA1 : song.get(Database.FIELD_FILE_SHA1, None),}
  145. return song
  146. def recognize(self, recognizer, *options, **kwoptions):
  147. r = recognizer(self)
  148. return r.recognize(*options, **kwoptions)
  149. def _fingerprint_worker(filename, limit=None, song_name=None):
  150. # Pool.imap sends arguments as tuples so we have to unpack
  151. # them ourself.
  152. try:
  153. filename, limit = filename
  154. except ValueError:
  155. pass
  156. songname, extension = os.path.splitext(os.path.basename(filename))
  157. song_name = song_name or songname
  158. channels, Fs, file_hash, length = decoder.read(filename, limit)
  159. result = set()
  160. channel_amount = len(channels)
  161. for channeln, channel in enumerate(channels):
  162. # TODO: Remove prints or change them into optional logging.
  163. #print("Fingerprinting channel %d/%d for %s" % (channeln + 1,
  164. # channel_amount,
  165. # filename))
  166. hashes = fingerprint.fingerprint(channel, Fs=Fs)
  167. #print("Finished channel %d/%d for %s" % (channeln + 1, channel_amount,
  168. # filename))
  169. result |= set(hashes)
  170. return song_name, length, result, file_hash
  171. def chunkify(lst, n):
  172. """
  173. Splits a list into roughly n equal parts.
  174. http://stackoverflow.com/questions/2130016/splitting-a-list-of-arbitrary-size-into-only-roughly-n-equal-parts
  175. """
  176. return [lst[i::n] for i in xrange(n)]