AudioValid
/
fourier-ondemand


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325
							# -*- coding: utf8 -*-
from __future__ import print_function, absolute_import
from tornado.ioloop import IOLoop
from tornado.web import Application
from fourier.api.client import Client, ConnectionError
from fourier.boxconfig import parse_config
from fourier.dejavu.recognize import FilePerSecondRecognizer
from datetime import datetime, timedelta
from ondemand.endpoint import setup_endpoint
from fourier.dejavu import Dejavu
from Queue import Queue, Empty
from firebase_admin import credentials
from firebase_admin import db as fbdb
from binascii import hexlify
from base64 import b64decode
from threading import Thread
from multiprocessing import Process
from argparse import ArgumentParser
import logging as log
import firebase_admin
import mutagen.mp3
import requests
import dateutil
import sqlite3
import time
import sys
import os

# Module-level setup: everything below runs at import time and builds the
# shared state (config, API client, work queue, local DB connection) that
# the functions in this file read as globals.
log.basicConfig(format='[%(asctime)s] %(message)s', level=log.INFO)

# Directory where downloaded ad audio files are written (see cloud_download).
AUDIOS_PATH = '/tmp'
# Seconds subtracted from the start of the search window in iterate_audios.
AHEAD_TIME_AUDIO_TOLERANCE = 2 # seconds
# Number of worker slots used by process_queue_with_threads.
MAX_SEGMENT_THREADS = 4

# Queue processing modes
#  - QUEUE_SINGLE: processes only one segment at a time
#  - QUEUE_THREAD: starts one worker for each segment
# The threaded mode is used by default.
# TODO: make it configurable through command-line
#       arguments.
QUEUE_SINGLE = 1
QUEUE_THREAD = 2

# Different worker APIs can be used here:
# the threading one and the multiprocessing one.
MultiAPI = Process

config = parse_config()
# In-memory queue of pending schedule items; filled by feed_queue and also
# handed to setup_endpoint further down in this file.
queue = Queue()
client = Client(config['device_id'],
                config['apiSecret'])
# Base URL from which ad audio files are downloaded.
cloud_base_url = 'https://storage.googleapis.com/{}'\
                .format(config['bucket'])
base_path = config.get("basepath", "/var/fourier")
fb_credentials = credentials.Certificate('/etc/Fourier-key.json')
firebase_admin.initialize_app(fb_credentials, config['firebase'])
dejavu = Dejavu({"database_type":"mem"})
device_id = config['device_id']
device_path = os.path.join(base_path, device_id)
# Recognizer class passed to dejavu.recognize in process_segment.
recognizer = FilePerSecondRecognizer
queue_mode = QUEUE_THREAD

# Local SQLite database queried by iterate_audios for recorded audio files.
db_path = config.get('localDatabase', os.path.join(device_path, 'files.db'))
db = sqlite3.connect(db_path)
# Maps an ad key to the (local file path, md5 hex digest) tuple of an
# already-downloaded ad; maintained by cloud_download.
cloud_cache = {}

def feed_queue():
    """Pull pending schedule items from the server into the memory queue.

    Once the items are queued, the next step is scheduled on the IOLoop:
    the queue processor matching ``queue_mode`` when there is work to do,
    or another ``feed_queue`` run in 30 seconds when the queue is empty.
    A connection failure retries in 15 seconds and any other error in 60
    seconds.
    """
    try:
        pending = client.get_schedule_pending()['items']
        new_items = len(pending)
        for entry in pending:
            queue.put(entry)

        if new_items:
            log.info(
                '[feed_queue] {} new pending schedule items.'
                .format(new_items)
            )

        if queue.qsize() <= 0:
            # Nothing to process: poll the server again later.
            loop.add_timeout(time.time() + 30, feed_queue)
        elif queue_mode == QUEUE_THREAD:
            loop.add_callback(process_queue_with_threads)
        else:
            loop.add_callback(process_queue)

    except ConnectionError as err:
        log.error('[feed_queue] cannot feed: {}, retryig later'.format(err))
        loop.add_timeout(time.time() + 15, feed_queue)

    except Exception as err:
        # Unknown errors
        log.error('[feed_queue] {}'.format(err))
        loop.add_timeout(time.time() + 60, feed_queue)

def process_queue():
    """Process the next queued item, one at a time.

    Takes one item from the queue without blocking and processes it.
    After an item is handled (successfully or with a logged error) this
    function is scheduled again; when the queue is empty, control goes
    back to ``feed_queue``.
    """
    try:
        pending = queue.get(block=False)
        process_segment(pending)
        loop.add_callback(process_queue)
    except Empty:
        # Queue exhausted: go back to asking the server for more work.
        loop.add_callback(feed_queue)
    except Exception as err:
        log.error(err)
        loop.add_callback(process_queue)

def process_queue_with_threads():
    """Drain the queue using up to MAX_SEGMENT_THREADS concurrent workers.

    Each free slot takes an item from the queue and starts a ``MultiAPI``
    worker running ``process_segment`` on it. The list of audio files is
    resolved here, before the worker starts, and handed over through the
    ``audios`` keyword argument. The function returns once the queue has
    been seen empty and every slot is free again, and then schedules
    ``feed_queue`` on the IOLoop.

    An item that cannot be dispatched (missing key, unparsable date,
    worker failing to start) is logged and dropped so that the remaining
    items are still processed and ``feed_queue`` is always rescheduled.
    """
    # Function-scope import: a bare ``import dateutil`` does not guarantee
    # that the ``parser`` submodule has been loaded.
    import dateutil.parser

    workers = [None] * MAX_SEGMENT_THREADS
    is_drained = False

    log.info('Starting thread processing')

    while True:
        for index, worker in enumerate(workers):
            if not worker:
                try:
                    item = queue.get(False)
                except Empty:
                    is_drained = True
                    continue

                try:
                    station = item['station']
                    date = dateutil.parser.parse(item['date'])
                    worker = MultiAPI(
                        target=process_segment,
                        args=(item,),
                        kwargs={
                            'audios': list(iterate_audios(date, station))
                        }
                    )
                    worker.start()
                    # Occupy the slot only once the worker really started.
                    workers[index] = worker
                except Exception as ex:
                    log.error(
                        'cannot start processing of {}: {}'.format(item, ex)
                    )

            elif not worker.is_alive():
                workers[index] = None

        if is_drained and workers.count(None) == MAX_SEGMENT_THREADS:
            break

        # Pause briefly between polls instead of spinning at full CPU
        # while the workers run.
        time.sleep(0.1)

    log.info('Finished thread processing')
    loop.add_callback(feed_queue)

def process_segment(item, audios=None):
    """Process one hour of station audio looking for a scheduled ad.

    The ad audio referenced by ``item['ad']`` is downloaded and
    fingerprinted, every audio file of the segment is run through the
    recognizer, and the detections are reported to the API with
    ``client.put_schedule_results``.

    Parameters:
        item: dict describing the work; the keys read here are
            'station', 'date', 'ad', 'schedule' and 'id'.
        audios: optional list of ``(path, name, timestamp)`` tuples. When
            empty or omitted, the files are looked up with
            ``iterate_audios``.

    Returns None. Download, format and connection problems are logged
    and end the processing of this item.
    """
    # Function-scope import: a bare ``import dateutil`` does not guarantee
    # that the ``parser`` submodule has been loaded.
    import dateutil.parser

    station = item['station']
    date = dateutil.parser.parse(item['date'])

    log.info('processing segment: {}'.format(item))

    # 1. Get the ad audio from the cloud and compute its fingerprint.
    #    A falsy result (failed download) is treated as "file missing"
    #    instead of crashing while unpacking it.
    downloaded = cloud_download(ad_key=item['ad'])
    filename, md5hash = downloaded if downloaded else (None, None)
    if not filename:
        log.info('ad file missing')
        return

    # 1.1 Compute the number of segments required according to the
    # total duration of the ad audio.
    try:
        audio = mutagen.mp3.MP3(filename)
        segments_needed = int(round(float(audio.info.length) / float(5)))
    except Exception as ex:
        # Log the file name: ``audio`` is unbound when MP3() itself fails.
        log.error('file {} is not an mp3'.format(filename))
        log.error(str(ex))
        return

    try:
        dejavu.fingerprint_file(filename)
    except Exception as ex:
        # Best effort: recognition is still attempted below.
        log.error('cannot fingerprint: {}'.format(ex))

    # There are two possible scenarios when obtaining the audios:
    #   a. They come through the "audios" parameter as a list.
    #   b. They are read directly from the local database.
    audios_iterable = audios if audios \
        else iterate_audios(date, station)

    # 2. Run the recognizer over every audio file of the segment.
    audios_counter = 0
    results = []
    for path, name, ts in audios_iterable:
        log.info('file: {}'.format(path))
        # Only files that exist on disk are counted.
        audios_counter += os.path.isfile(path)
        for match in dejavu.recognize(recognizer, path, 5,
                                      ads_filter=[md5hash]):
            try:
                results.append({
                    'confidence': match['confidence'],
                    'timestamp': ts,
                    'offset': match['offset']
                })
                log.info("{} {}".format(ts, match['confidence']))
            except KeyError as ex:
                log.error(str(ex))

            # Advance the timestamp by the length of this match
            # (presumably reported in milliseconds; confirm with dejavu).
            ts += match['length'] / 1000

    # 3. Report detections and the number of missing files.
    # NOTE(review): 12 looks like the expected number of files per hour;
    # confirm against the recorder configuration.
    try:
        response = client.put_schedule_results(
            item['schedule'],
            item['id'],
            None, # TODO: send results again
            found=find_repetitions(results,
                segments_needed=segments_needed
            ),
            missing_files=max(0, 12 - audios_counter)
        )
        log.info('API response: {}'.format(response))
    except ConnectionError as ex:
        log.error(str(ex))
    except UserWarning as warn:
        log.warning(str(warn))

def find_repetitions(results, segments_needed=2):
    """Return the timestamps at which a detection starts.

    A detection is a run of ``segments_needed`` consecutive results whose
    'confidence' is above the threshold (20). Its reported timestamp is
    that of the first result in the run. After a detection, further
    results are ignored until one at or below the threshold is seen, so a
    long uninterrupted match is reported only once.

    Parameters:
        results: sequence of dicts with 'confidence' and 'timestamp' keys.
        segments_needed: consecutive results required for a detection;
            values below 1 are treated as 1.

    Returns a list of timestamps, in the order they were detected.
    """
    threshold = 20
    if segments_needed < 1:
        segments_needed = 1

    found = []
    run_length = 0
    run_start = None
    expect_space = False

    for index, result in enumerate(results):
        confident = result['confidence'] > threshold

        if expect_space:
            # Waiting for the gap that separates two detections.
            if not confident:
                expect_space = False
            continue

        if not confident:
            run_length = 0
            run_start = None
            continue

        if run_start is None:
            run_start = index
        run_length += 1

        if run_length >= segments_needed:
            found.append(results[run_start]['timestamp'])
            run_length = 0
            # Forget the start of this run so the next detection reports
            # its own timestamp rather than repeating this one.
            run_start = None
            expect_space = True

    return found

def iterate_audios(dt, station):
    """Yield the recorded audio files of a station for a time window.

    The window starts AHEAD_TIME_AUDIO_TOLERANCE seconds before ``dt``
    and ends 3599 seconds after ``dt``. Matching rows are read from the
    "file" table of the local database ordered by timestamp, and each
    one is yielded as a ``(path, file name, timestamp)`` tuple.

    This is a generator: the query only runs on first iteration.
    """
    window_start = time.mktime(dt.timetuple()) - AHEAD_TIME_AUDIO_TOLERANCE
    window_end = window_start + 3599 + AHEAD_TIME_AUDIO_TOLERANCE

    log.info('from {} to {}'.format(int(window_start), int(window_end)))

    query = (
        'select "filename", "timestamp" '
        'from "file" '
        'where "timestamp" between ? and ? '
            'and "station" = ? '
        'order by "timestamp" asc'
    )
    cursor = db.cursor()
    cursor.execute(query, (window_start, window_end, station))
    # Read every row up front so the cursor can be closed before yielding.
    rows = cursor.fetchall()
    cursor.close()

    for mp3path, ts in rows:
        yield (mp3path, os.path.basename(mp3path), ts)


def cloud_download(ad_key=None):
    """Download the audio file of an ad into AUDIOS_PATH.

    The ad record is read from Firebase under ``ads/<ad_key>`` and its
    'path' is fetched from the cloud bucket. The MD5 of the file is taken
    from the ``x-goog-hash`` response header. Successful downloads are
    remembered in ``cloud_cache`` and reused while the file still exists
    on disk.

    Returns a ``(local file path, md5 hex digest)`` tuple, or
    ``(None, None)`` when the ad record does not exist or the download
    does not answer with HTTP 200, so callers can always unpack the
    result into two values.
    """
    # Reuse a previous download as long as the file is still on disk.
    cached = cloud_cache.get(ad_key)
    if cached is not None and os.path.isfile(cached[0]):
        return cached

    ad = fbdb.reference('ads/{}'.format(ad_key)).get()
    if not ad:
        log.error('[cloud_download] ad {} not found'.format(ad_key))
        return None, None

    filename = os.path.basename(ad['path'])
    out_file = os.path.join(AUDIOS_PATH, filename)
    url = '{}/{}'.format(cloud_base_url, ad['path'])
    response = requests.get(url)

    if response.status_code != 200:
        log.error('[cloud_download] {} answered with status {}'.format(
            url, response.status_code))
        return None, None

    # The header is a comma-separated list of "<algorithm>=<base64 digest>"
    # pairs; digests are converted to hexadecimal.
    pairs = [h.split('=', 1) for h in response.headers['x-goog-hash'].split(',')]
    hashes = {h[0].strip(): hexlify(b64decode(h[1])) for h in pairs}
    md5sum = hashes['md5']

    with open(out_file, "wb") as fp:
        fp.write(response.content)

    result = (out_file, md5sum,)
    cloud_cache[ad_key] = result
    return result

# Build the HTTP endpoint, giving it the shared work queue.
# NOTE(review): `app` is not referenced again in this file; presumably
# setup_endpoint also starts listening. Confirm in ondemand.endpoint.
app = setup_endpoint(queue=queue)
loop = IOLoop.current()
# The first feed_queue run is scheduled at import time; it only executes
# once the loop is started below.
loop.add_callback(feed_queue)

if __name__ == '__main__':
    try:
        log.info('Starting ondemand service')
        # Blocks here running the IOLoop until interrupted.
        loop.start()
    except KeyboardInterrupt:
        log.error('Process killed')