database_mem.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. from __future__ import absolute_import
  2. from dejavu.database import Database
  3. import sys
  4. if sys.version_info >= (3, 0):
  5. from itertools import zip_longest as izip_longest
  6. import queue as Queue
  7. else:
  8. from itertools import izip_longest
  9. import Queue
  10. PY3 = sys.version_info >= (3, 0)
  11. class MEMDatabase(Database):
  12. """
  13. Queries:
  14. 1) Find duplicates (shouldn't be any, though):
  15. select `hash`, `song_id`, `offset`, count(*) cnt
  16. from fingerprints
  17. group by `hash`, `song_id`, `offset`
  18. having cnt > 1
  19. order by cnt asc;
  20. 2) Get number of hashes by song:
  21. select song_id, song_name, count(song_id) as num
  22. from fingerprints
  23. natural join songs
  24. group by song_id
  25. order by count(song_id) desc;
  26. 3) get hashes with highest number of collisions
  27. select
  28. hash,
  29. count(distinct song_id) as n
  30. from fingerprints
  31. group by `hash`
  32. order by n DESC;
  33. => 26 different songs with same fingerprint (392 times):
  34. select songs.song_name, fingerprints.offset
  35. from fingerprints natural join songs
  36. where fingerprints.hash = "08d3c833b71c60a7b620322ac0c0aba7bf5a3e73";
  37. """
  38. type = "mem"
  39. # tables
  40. FINGERPRINTS_TABLENAME = "fingerprints"
  41. SONGS_TABLENAME = "ads"
  42. # fields
  43. FIELD_FINGERPRINTED = "fingerprinted"
  44. # creates
  45. CREATE_FINGERPRINTS_TABLE = """
  46. CREATE TABLE IF NOT EXISTS `%s` (
  47. `%s` binary(10) not null,
  48. `%s` int unsigned not null,
  49. `%s` int unsigned not null,
  50. INDEX (%s),
  51. UNIQUE KEY `unique_constraint` (%s, %s, %s),
  52. FOREIGN KEY (%s) REFERENCES %s(%s) ON DELETE CASCADE
  53. ) ENGINE=INNODB;""" % (
  54. FINGERPRINTS_TABLENAME, Database.FIELD_HASH,
  55. Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
  56. Database.FIELD_SONG_ID, Database.FIELD_OFFSET, Database.FIELD_HASH,
  57. Database.FIELD_SONG_ID, SONGS_TABLENAME, Database.FIELD_SONG_ID
  58. )
  59. CREATE_SONGS_TABLE = """
  60. CREATE TABLE IF NOT EXISTS `%s` (
  61. `%s` int unsigned not null auto_increment,
  62. `%s` varchar(250) not null,
  63. `%s` bigint default 0,
  64. `%s` tinyint default 0,
  65. `%s` binary(20) not null,
  66. PRIMARY KEY (`%s`),
  67. UNIQUE KEY `%s` (`%s`)
  68. ) ENGINE=INNODB;""" % (
  69. SONGS_TABLENAME, Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_LENGTH, FIELD_FINGERPRINTED,
  70. Database.FIELD_FILE_SHA1,
  71. Database.FIELD_SONG_ID, Database.FIELD_SONG_ID, Database.FIELD_SONG_ID,
  72. )
  73. # inserts (ignores duplicates)
  74. INSERT_FINGERPRINT = """
  75. INSERT IGNORE INTO %s (%s, %s, %s) values
  76. (UNHEX(%%s), %%s, %%s);
  77. """ % (FINGERPRINTS_TABLENAME, Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET)
  78. INSERT_SONG = "INSERT INTO %s (%s, %s, %s) values (%%s, %%s, UNHEX(%%s));" % (
  79. SONGS_TABLENAME, Database.FIELD_SONGNAME, Database.FIELD_LENGTH, Database.FIELD_FILE_SHA1)
  80. # selects
  81. SELECT = """
  82. SELECT %s, %s FROM %s WHERE %s = UNHEX(%%s);
  83. """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
  84. SELECT_MULTIPLE = """
  85. SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s);
  86. """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
  87. FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
  88. SELECT_MULTIPLE_FILTER = """
  89. SELECT HEX(%s), %s, %s FROM %s WHERE %s IN (%%s) AND id IN (%%s);
  90. """ % (Database.FIELD_HASH, Database.FIELD_SONG_ID, Database.FIELD_OFFSET,
  91. FINGERPRINTS_TABLENAME, Database.FIELD_HASH)
  92. SELECT_ALL = """
  93. SELECT %s, %s FROM %s;
  94. """ % (Database.FIELD_SONG_ID, Database.FIELD_OFFSET, FINGERPRINTS_TABLENAME)
  95. SELECT_SONG = """
  96. SELECT %s, HEX(%s) as %s FROM %s WHERE %s = %%s;
  97. """ % (Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1, SONGS_TABLENAME, Database.FIELD_SONG_ID)
  98. SELECT_NUM_FINGERPRINTS = """
  99. SELECT COUNT(*) as n FROM %s
  100. """ % (FINGERPRINTS_TABLENAME)
  101. SELECT_UNIQUE_SONG_IDS = """
  102. SELECT COUNT(DISTINCT %s) as n FROM %s WHERE %s = 1;
  103. """ % (Database.FIELD_SONG_ID, SONGS_TABLENAME, FIELD_FINGERPRINTED)
  104. SELECT_SONGS = """
  105. SELECT %s, %s, HEX(%s) as %s FROM %s WHERE %s = 1;
  106. """ % (Database.FIELD_SONG_ID, Database.FIELD_SONGNAME, Database.FIELD_FILE_SHA1, Database.FIELD_FILE_SHA1,
  107. SONGS_TABLENAME, FIELD_FINGERPRINTED)
  108. # drops
  109. DROP_FINGERPRINTS = "DROP TABLE IF EXISTS %s;" % FINGERPRINTS_TABLENAME
  110. DROP_SONGS = "DROP TABLE IF EXISTS %s;" % SONGS_TABLENAME
  111. # update
  112. UPDATE_SONG_FINGERPRINTED = """
  113. UPDATE %s SET %s = 1 WHERE %s = %%s
  114. """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED, Database.FIELD_SONG_ID)
  115. # delete
  116. DELETE_UNFINGERPRINTED = """
  117. DELETE FROM %s WHERE %s = 0;
  118. """ % (SONGS_TABLENAME, FIELD_FINGERPRINTED)
  119. def __init__(self, **options):
  120. super(MEMDatabase, self).__init__()
  121. self._options = options
  122. def after_fork(self):
  123. raise Exception("not implemented")
  124. # Clear the cursor cache, we don't want any stale connections from
  125. # the previous process.
  126. Cursor.clear_cache()
  127. def setup(self):
  128. """
  129. Creates any non-existing tables required for dejavu to function.
  130. This also removes all songs that have been added but have no
  131. fingerprints associated with them.
  132. """
  133. self.songs = {}
  134. self.fingerprints = {}
  135. #with self.cursor() as cur:
  136. # cur.execute(self.CREATE_SONGS_TABLE)
  137. # cur.execute(self.CREATE_FINGERPRINTS_TABLE)
  138. # cur.execute(self.DELETE_UNFINGERPRINTED)
  139. def empty(self):
  140. raise Exception("not implemented")
  141. """
  142. Drops tables created by dejavu and then creates them again
  143. by calling `SQLDatabase.setup`.
  144. .. warning:
  145. This will result in a loss of data
  146. """
  147. with self.cursor() as cur:
  148. cur.execute(self.DROP_FINGERPRINTS)
  149. cur.execute(self.DROP_SONGS)
  150. self.setup()
  151. def delete_unfingerprinted_songs(self):
  152. raise Exception("not implemented")
  153. """
  154. Removes all songs that have no fingerprints associated with them.
  155. """
  156. with self.cursor() as cur:
  157. cur.execute(self.DELETE_UNFINGERPRINTED)
  158. def get_num_songs(self):
  159. raise Exception("not implemented")
  160. """
  161. Returns number of songs the database has fingerprinted.
  162. """
  163. with self.cursor() as cur:
  164. cur.execute(self.SELECT_UNIQUE_SONG_IDS)
  165. for count, in cur:
  166. return count
  167. return 0
  168. def get_num_fingerprints(self):
  169. raise Exception("not implemented")
  170. """
  171. Returns number of fingerprints the database has fingerprinted.
  172. """
  173. with self.cursor() as cur:
  174. cur.execute(self.SELECT_NUM_FINGERPRINTS)
  175. for count, in cur:
  176. return count
  177. return 0
  178. def set_song_fingerprinted(self, sid):
  179. """
  180. Set the fingerprinted flag to TRUE (1) once a song has been completely
  181. fingerprinted in the database.
  182. """
  183. self.songs[sid]["fingerprinted"] = 1
  184. #with self.cursor() as cur:
  185. # cur.execute(self.UPDATE_SONG_FINGERPRINTED, (sid,))
  186. def get_songs(self):
  187. """
  188. Return songs that have the fingerprinted flag set TRUE (1).
  189. """
  190. if PY3:
  191. for _, song in self.songs.items():
  192. yield song
  193. else:
  194. for _, song in self.songs.iteritems():
  195. yield song
  196. #with self.cursor(cursor_type=DictCursor) as cur:
  197. # cur.execute(self.SELECT_SONGS)
  198. # for row in cur:
  199. # yield row
  200. def get_song_by_id(self, sid):
  201. """
  202. Returns song by its ID.
  203. """
  204. return self.songs.get(sid, None)
  205. #with self.cursor(cursor_type=DictCursor) as cur:
  206. # cur.execute(self.SELECT_SONG, (sid,))
  207. # return cur.fetchone()
  208. def insert(self, hash, sid, offset):
  209. raise Exception("not implemented")
  210. """
  211. Insert a (sha1, song_id, offset) row into database.
  212. """
  213. with self.cursor() as cur:
  214. cur.execute(self.INSERT_FINGERPRINT, (hash, sid, offset))
  215. def insert_song(self, songname, length, file_hash):
  216. """
  217. Inserts song in the database and returns the ID of the inserted record.
  218. """
  219. self.songs[file_hash] = {
  220. Database.FIELD_SONG_ID: file_hash,
  221. Database.FIELD_SONGNAME: songname,
  222. Database.FIELD_LENGTH: length,
  223. "fingerprinted": 0,
  224. Database.FIELD_FILE_SHA1: file_hash
  225. }
  226. return file_hash
  227. #with self.cursor() as cur:
  228. # cur.execute(self.INSERT_SONG, (songname, length, file_hash))
  229. # return cur.lastrowid
  230. def query(self, hash):
  231. raise Exception("not implemented")
  232. """
  233. Return all tuples associated with hash.
  234. If hash is None, returns all entries in the
  235. database (be careful with that one!).
  236. """
  237. # select all if no key
  238. query = self.SELECT_ALL if hash is None else self.SELECT
  239. with self.cursor() as cur:
  240. cur.execute(query)
  241. for sid, offset in cur:
  242. yield (sid, offset)
  243. def get_iterable_kv_pairs(self):
  244. raise Exception("not implemented")
  245. """
  246. Returns all tuples in database.
  247. """
  248. return self.query(None)
  249. def insert_hashes(self, sid, hashes):
  250. """
  251. Insert series of hash => song_id, offset
  252. values into the database.
  253. """
  254. #values = {}
  255. for hash, offset in hashes:
  256. self.fingerprints[hash] = {"sid": sid, "offset": offset}
  257. #self.fingerprints[sid] = values
  258. #values = []
  259. #for hash, offset in hashes:
  260. # values.append((hash, sid, offset))
  261. #with self.cursor() as cur:
  262. # for split_values in grouper(values, 1000):
  263. # cur.executemany(self.INSERT_FINGERPRINT, split_values)
  264. def return_matches(self, hashes, ads_filter=None):
  265. """
  266. Return the (song_id, offset_diff) tuples associated with
  267. a list of (sha1, sample_offset) values.
  268. """
  269. for hash, offset in hashes:
  270. el = self.fingerprints.get(hash, None)
  271. if el is not None:
  272. yield (el["sid"], el["offset"] - offset, el["offset"])
  273. return
  274. # Create a dictionary of hash => offset pairs for later lookups
  275. mapper = {}
  276. for hash, offset in hashes:
  277. mapper[hash.upper()] = offset
  278. # Get an iteratable of all the hashes we need
  279. values = mapper.keys()
  280. with self.cursor() as cur:
  281. for split_values in grouper(values, 1000):
  282. # Create our IN part of the query
  283. if ads_filter is None:
  284. query = self.SELECT_MULTIPLE
  285. query = query % ', '.join(['UNHEX(%s)'] * len(split_values))
  286. else:
  287. query = self.SELECT_MULTIPLE_FILTER
  288. query = query % (', '.join(['UNHEX(%s)'] * len(split_values)), ",".join(ads_filter))
  289. #if ads_filter:
  290. #query = query + " and id in (" + ",".join(ads_filter) + ")"
  291. cur.execute(query, split_values)
  292. for hash, sid, offset in cur:
  293. # (sid, db_offset - song_sampled_offset)
  294. yield (sid, offset - mapper[hash], offset)
  295. def __getstate__(self):
  296. raise Exception("not implemented")
  297. return (self._options,)
  298. def __setstate__(self, state):
  299. raise Exception("not implemented")
  300. self._options, = state
  301. self.cursor = cursor_factory(**self._options)
  302. def grouper(iterable, n, fillvalue=None):
  303. args = [iter(iterable)] * n
  304. return (filter(None, values) for values
  305. in izip_longest(fillvalue=fillvalue, *args))