123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150 |
- from __future__ import absolute_import
- import os
- import sys
- import fnmatch
- import numpy as np
- from pydub import AudioSegment
- from pydub.utils import audioop
- from dejavu import wavio
- from hashlib import sha1
# Sample rate (Hz) that decoded audio is resampled to: half of 44.1 kHz.
DEFAULT_FS = 44100 // 2

# Python 3 has no ``xrange``; alias it to ``range`` so code written against
# the Python 2 name keeps working on both major versions.
if sys.version_info[0] >= 3:
    xrange = range
def unique_hash(filepath, blocksize=2**20):
    """ Return the upper-case hexadecimal SHA-1 digest of a file's contents,
    usable as a unique identifier for that file.
    Inspired by the MD5 version here:
    http://stackoverflow.com/a/1131255/712997
    The file is read `blocksize` bytes at a time, so large files are
    hashed without being loaded into memory at once.
    """
    digest = sha1()
    with open(filepath, "rb") as handle:
        # Two-argument iter() keeps calling read() until it returns the
        # empty-bytes sentinel, i.e. until end of file.
        for block in iter(lambda: handle.read(blocksize), b""):
            digest.update(block)
    return digest.hexdigest().upper()
def find_files(path, extensions):
    """
    Walk `path` recursively and yield a `(filepath, extension)` tuple for
    every file whose name matches "*.<extension>" for one of `extensions`.

    Extensions may be given with or without the leading dot (".mp3" or
    "mp3"). The yielded extension is the normalised form without the
    leading dot. Files are matched with `fnmatch.filter`.
    """
    # Strip only *leading* dots: removing every dot would turn a compound
    # extension such as ".tar.gz" into "targz", which can never match.
    # Duplicates (e.g. both ".mp3" and "mp3") are dropped, keeping the
    # caller's order, so the same file is not yielded twice.
    normalized = []
    for raw in extensions:
        ext = raw.lstrip(".")
        if ext not in normalized:
            normalized.append(ext)

    for dirpath, _dirnames, files in os.walk(path):
        for extension in normalized:
            for name in fnmatch.filter(files, "*.%s" % extension):
                yield (os.path.join(dirpath, name), extension)
def read_chunks(filename, chunk_size=1, start=0, fmt=None):
    """
    Decode `filename` and lazily yield it in consecutive chunks.

    The file is loaded with pydub (ffmpeg) and resampled to DEFAULT_FS. If
    pydub raises `audioop.error` while loading, wavio is used as a backup;
    that path is not resampled, so the sample rate reported is the one
    wavio returns for the file.

    filename:   path of the audio file.
    chunk_size: length of each chunk, in seconds.
    start:      position, in seconds, at which the first chunk begins.
    fmt:        format hint forwarded to `AudioSegment.from_file`.

    yields: (channels, samplerate, file_hash, duration_ms) where `channels`
    is a list with one int16 numpy array per channel and `duration_ms` is
    the length of the yielded chunk in milliseconds. Iteration stops at the
    first empty chunk.
    """
    start_ms = start * 1000
    chunk_ms = chunk_size * 1000
    filename_hash = unique_hash(filename)

    # Only loading/resampling is guarded: that is where pydub goes through
    # audioop. The fallback must not run on top of a half-consumed segment.
    try:
        audiofile = AudioSegment.from_file(filename, fmt)
        if audiofile.frame_rate != DEFAULT_FS:
            audiofile = audiofile.set_frame_rate(DEFAULT_FS)
    except audioop.error:
        audiofile = None

    if audiofile is not None:
        while True:
            end_ms = start_ms + chunk_ms
            # AudioSegment slices are expressed in milliseconds.
            audio_chunk = audiofile[start_ms:end_ms]
            if len(audio_chunk) == 0:
                return
            # np.fromstring is deprecated for binary input; frombuffer is
            # the replacement. copy() keeps the result writable, as
            # fromstring's output was.
            data = np.frombuffer(audio_chunk._data, dtype=np.int16).copy()
            n_channels = audio_chunk.channels
            # Samples are interleaved; a strided slice picks one channel.
            channels = [data[chn::n_channels] for chn in range(n_channels)]
            yield channels, audio_chunk.frame_rate, filename_hash, len(audio_chunk)
            start_ms = end_ms

    # Fallback: wavio hands back a plain numpy array, not an AudioSegment, so
    # it has no frame_rate/set_frame_rate and is indexed by sample, not by ms.
    # NOTE(review): assumes readwav returns (rate, sampwidth, array) with the
    # array shaped (frames, channels) -- confirm against dejavu.wavio.
    fs, _, samples = wavio.readwav(filename)
    if samples.ndim == 1:
        samples = samples.reshape(-1, 1)

    while True:
        end_ms = start_ms + chunk_ms
        # Convert the millisecond window into sample indices.
        first = int(start_ms * fs // 1000)
        last = int(end_ms * fs // 1000)
        frames = samples[first:last]
        if len(frames) == 0:
            return
        # One row per channel after the transpose.
        channels = list(frames.T.astype(np.int16))
        duration_ms = int(round(len(frames) * 1000.0 / fs))
        yield channels, fs, filename_hash, duration_ms
        start_ms = end_ms
def read(filename, limit=None, fmt=None, offset=0):
    """
    Reads any file supported by pydub (ffmpeg) and returns the data contained
    within. If pydub raises `audioop.error` while loading (e.g. the input is
    a 24-bit wav file), wavio is used as a backup.

    Audio loaded through pydub is resampled to DEFAULT_FS; the wavio backup
    is not resampled, so the returned sample rate is the one wavio reports.

    filename: path of the audio file.
    limit:    optional number of seconds to keep, counted from `offset`.
              A falsy value (None or 0) means "until the end of the file".
    fmt:      format hint forwarded to `AudioSegment.from_file`.
    offset:   number of seconds to skip at the start of the file. It is
              honoured whether or not `limit` is given.

    returns: (channels, samplerate, file_hash, duration_ms) where `channels`
    is a list with one int16 numpy array per channel and `duration_ms` is
    the length of the returned audio in milliseconds.
    """
    offset_ms = (offset or 0) * 1000
    end_ms = offset_ms + limit * 1000 if limit else None

    # pydub does not support 24-bit wav files, use wavio when this occurs.
    # Only loading/resampling is guarded, since that is where audioop is used.
    try:
        audiofile = AudioSegment.from_file(filename, fmt)
        if audiofile.frame_rate != DEFAULT_FS:
            audiofile = audiofile.set_frame_rate(DEFAULT_FS)
    except audioop.error:
        audiofile = None

    if audiofile is not None:
        if offset_ms or end_ms is not None:
            # AudioSegment slices are in milliseconds; a None stop means
            # "to the end".
            audiofile = audiofile[offset_ms:end_ms]
        # np.fromstring is deprecated for binary input; frombuffer is the
        # replacement. copy() keeps the result writable, as before.
        data = np.frombuffer(audiofile._data, dtype=np.int16).copy()
        n_channels = audiofile.channels
        # Samples are interleaved; a strided slice picks one channel.
        channels = [data[chn::n_channels] for chn in range(n_channels)]
        return channels, audiofile.frame_rate, unique_hash(filename), len(audiofile)

    # Fallback: wavio hands back a plain numpy array, not an AudioSegment, so
    # it has no frame_rate/set_frame_rate and is indexed by sample, not by ms.
    # NOTE(review): assumes readwav returns (rate, sampwidth, array) with the
    # array shaped (frames, channels) -- confirm against dejavu.wavio.
    fs, _, samples = wavio.readwav(filename)
    if samples.ndim == 1:
        samples = samples.reshape(-1, 1)

    # Convert the millisecond window into sample indices.
    first = int(offset_ms * fs // 1000)
    last = None if end_ms is None else int(end_ms * fs // 1000)
    if first or last is not None:
        samples = samples[first:last]

    # One row per channel after the transpose.
    channels = list(samples.T.astype(np.int16))
    duration_ms = int(round(len(samples) * 1000.0 / fs))
    return channels, fs, unique_hash(filename), duration_ms
def path_to_songname(path):
    """
    Returns the song name for a filepath: its final path component with
    the last extension removed. Used to identify which songs have already
    been fingerprinted on disk.
    """
    filename = os.path.basename(path)
    stem, _extension = os.path.splitext(filename)
    return stem
|