# service_streamlit.py (9.1 KB)
  1. # -*- coding: utf8 -*-
  2. from __future__ import print_function, absolute_import
  3. from tornado.ioloop import IOLoop
  4. from client import Client, ConnectionError
  5. from boxconfig import parse_config
  6. from dejavu.recognize import FilePerSecondRecognizer
  7. from endpoint import setup_endpoint
  8. from calibration import Calibrations
  9. from dejavu import Dejavu, CouldntDecodeError
  10. from multiprocessing import Process
  11. import logging as log
  12. import mutagen.mp3
  13. import math
  14. import sys
  15. import os
  16. from datetime import datetime
  17. import streamlit as st
  18. if sys.version_info >= (3, 0):
  19. from queue import Queue, Empty
  20. else:
  21. from Queue import Queue, Empty
  22. log.basicConfig(format='[%(asctime)s] [%(module)s] %(message)s', level=log.INFO)
  23. AUDIOS_PATH = '/tmp'
  24. AHEAD_TIME_AUDIO_TOLERANCE = 2 # second
  25. MAX_SEGMENT_THREADS = 4
  26. THRESHOLD = 10
  27. SEGMENTS_TOLERANCE_RATE = 0.6
  28. FALL_TOLERANCE_SEGMENTS = 1
  29. # THRESHOLD
  30. THRESHOLD_FIXED = 1
  31. THRESHOLD_AVERAGE = 2
  32. # Modos de procesamiento de queue
  33. # - QUEQUE_SINGLE: procesa solo un segmento a la vez
  34. # - QUEUE_THREAD: inicia un hilo para cada segmento
  35. # Por default se usará el threaded.
  36. # TODO: hacerlo configurable por medio de argumentos
  37. # de ejecución.
  38. QUEUE_SINGLE = 1
  39. QUEUE_THREAD = 2
  40. # Se pueden usar diferentes API'se
  41. # la de threading y la de multiprocessing.
  42. MultiAPI = Process
  43. config = parse_config()
  44. queue = Queue()
  45. client = Client(config['device_id'],
  46. config['apiSecret'])
  47. cloud_base_url = 'https://storage.googleapis.com/{}' \
  48. .format(config['bucket'])
  49. base_path = config.get("basepath", "/var/fourier")
  50. device_id = config['device_id']
  51. device_path = os.path.join(base_path, device_id)
  52. recognizer = FilePerSecondRecognizer
  53. # settings
  54. queue_mode = QUEUE_SINGLE
  55. threshold_mode = THRESHOLD_FIXED
  56. db_path = config.get('localDatabase', os.path.join(device_path, 'files.db'))
  57. #db = sqlite3.connect(db_path)
  58. cloud_cache = {}
  59. def process_segment(anuncio, audio_busqueda, audios=None, calibration=None):
  60. """ Procesa una hora de audio """
  61. print(anuncio +" y "+ audio_busqueda)
  62. #date = dateutil.parser.parse(item['fecha'], ignoretz=True)
  63. segment_size = 5
  64. audio_length = 0
  65. # 1.1 Calcular el número de segmentos requeridos
  66. # de acuerdo a la duración total del audio.
  67. try:
  68. #filename = "/tmp/anuncios/RDF2112020ORIGINAL.MP3"
  69. filename = anuncio
  70. audio = mutagen.mp3.MP3(filename)
  71. audio_length = audio.info.length
  72. if segment_size == 'integer':
  73. segment_size = int(audio_length)
  74. elif segment_size == 'ceil':
  75. segment_size = int(math.ceil(audio_length / 5)) * 5
  76. segments_needed = int(round(float(audio_length) / float(segment_size)))
  77. segments_needed = int(round(segments_needed * 0.8))
  78. except Exception as ex:
  79. #log.error('[process_segment] file {} is not an mp3'.format(filename))
  80. log.error(str(ex))
  81. return
  82. dejavu = Dejavu({"database_type": "mem"})
  83. try:
  84. dejavu.fingerprint_file(filename)
  85. except Exception as ex:
  86. log.error('[process_segment] cannot fingerprint: {}'.format(ex))
  87. # 2. Read the list of files from local database
  88. audios_counter = 0
  89. results = []
  90. v = []
  91. #audios_iterable = [("/tmp/anuncios/RDF2112020GDL115271020NOESTAENREPORTEPEROESCORRECTO.mp3", "RDF2112020GDL115271020NOESTAENREPORTEPEROESCORRECTO.mp3", 1000)]
  92. audios_iterable = [(audio_busqueda, audio_busqueda, 1000)]
  93. for path, name, ts in audios_iterable:
  94. audios_counter += os.path.isfile(path)
  95. values = []
  96. try:
  97. for match in dejavu.recognize(recognizer, path, segment_size):
  98. name = None
  99. ad = None
  100. results.append({
  101. 'ad': ad,
  102. 'confidence': match['confidence'],
  103. 'timestamp': ts,
  104. 'offset': match['offset'],
  105. 'name': name
  106. })
  107. values.append(str(match['confidence']))
  108. ts += match['length'] / 1000
  109. v.append(','.join(values))
  110. log.info('[process_segment] {0}) {1}'.format(
  111. os.path.split(path)[-1],
  112. ','.join(values),
  113. ))
  114. st.text('[process_segment] {0}) {1}'.format(
  115. os.path.split(path)[-1],
  116. ','.join(values),
  117. ))
  118. except CouldntDecodeError as ex:
  119. log.error('[process_segment] {}'.format(ex))
  120. try:
  121. encontrados = {}
  122. item_ids = []
  123. for i in item_ids:
  124. r = [result for result in results if result["name"] == i]
  125. encontrados[i] = find_repetitions(r, segments_needed=segments_needed, calibration=calibration,)
  126. #for id in encontrados:
  127. # for e in encontrados[id]:
  128. # for i in item['elementos']:
  129. # if i['id'] == id and i['anuncio'] == e['ad']:
  130. # if 'encontrados' not in i:
  131. # i['encontrados'] = []
  132. # i['encontrados'].append(e)
  133. # break
  134. #item["archivos_perdidos"] = (12 - audios_counter) if audios_counter < 12 else 0
  135. except ConnectionError as ex:
  136. log.error('[process_segment] {}'.format(str(ex)))
  137. except UserWarning as warn:
  138. log.warning(str(warn))
  139. def find_repetitions(results, segments_needed=2, calibration=None):
  140. found_counter = 0
  141. found_down_counter = 0
  142. found_index = None
  143. expect_space = False
  144. expect_recover = False
  145. last_value_in_threshold_index = -1
  146. fall_tolerance = calibration['fallTolerance']
  147. found = []
  148. high = 100 # Obtener este valor desde un parámetro
  149. middle_high = 50 # Obtener este valor desde un parámetro
  150. segment_middle_needed = 2 # Obtener este valor desde un parámetro
  151. found_high = None
  152. found_middle_high = []
  153. if threshold_mode == THRESHOLD_FIXED:
  154. threshold = calibration['threshold']
  155. elif threshold_mode == THRESHOLD_AVERAGE:
  156. values = [x['confidence'] for x in results]
  157. threshold = math.ceil(float(sum(values)) / float(len(values)))
  158. if segments_needed < 1:
  159. segments_needed = 1
  160. for index, result in enumerate(results):
  161. #if result['confidence'] >= high:
  162. # if found_high is None:
  163. # found_high = index
  164. # elif result['confidence'] > results[found_high]['confidence']:
  165. # found_high = index
  166. #elif result['confidence'] >= middle_high:
  167. # found_middle_high.append(index)
  168. if not expect_space:
  169. if result['confidence'] >= threshold:
  170. found_counter += 1
  171. last_value_in_threshold_index = index
  172. if found_index is None:
  173. found_index = index
  174. if expect_recover:
  175. found_counter += found_down_counter
  176. expect_recover = False
  177. elif fall_tolerance:
  178. if not expect_recover:
  179. if last_value_in_threshold_index != -1:
  180. """ Solo cuando ya haya entrado por lo menos
  181. un valor en el rango del threshold, es cuando
  182. se podrá esperar un valor bajo """
  183. expect_recover = True
  184. found_down_counter += 1
  185. else:
  186. pass
  187. else:
  188. """ Si después de haber pasado tolerado 1 elemento
  189. vuelve a salir otro fuera del threshold continuo,
  190. entonces ya se da por perdido """
  191. found_counter = 0
  192. found_down_counter = 0
  193. found_index = None
  194. expect_recover = False
  195. else:
  196. found_counter = 0
  197. found_down_counter = 0
  198. found_index = None
  199. expect_recover = False
  200. # Aquí veremos si hay un valor alto
  201. #if found_high is not None:
  202. # found_row = results[found_high]
  203. # found.append(found_row)
  204. #elif len(found_middle_high) >= segment_middle_needed:
  205. # found_row = results[found_middle_high[0]]
  206. # found.append(found_row)
  207. #found_high = None
  208. #found_middle_high = []
  209. else:
  210. if result['confidence'] <= threshold:
  211. expect_space = False
  212. if found_counter >= segments_needed:
  213. found_row = results[found_index]
  214. found.append(found_row)
  215. found_counter = 0
  216. expect_space = True
  217. #found_high = None
  218. #found_middle_high = []
  219. return found
  220. def WebService():
  221. anuncio_file = st.file_uploader("Selecciona el anuncio")
  222. if anuncio_file is not None:
  223. name = "anuncio"
  224. with open(name, "wb") as anuncio:
  225. anuncio.write(anuncio_file.getvalue())
  226. audio_file = st.file_uploader("Selecciona el audio")
  227. if audio_file is not None:
  228. with open("audio", "wb") as audio:
  229. audio.write(audio_file.getvalue())
  230. if st.button("Comparar"):
  231. process_segment("anuncio", "audio")
  232. WebService()