Source code for dvt.aggregate.audio

# -*- coding: utf-8 -*-
"""Audio annotation objects.

This module provides audio aggregators. We generally use frame numbers
as the reference point, with the frames per second and audio sampling rate
values making it possible to translate from the audio samples to visual frames.
"""

from os.path import join

from matplotlib.pyplot import (
    close, pcolormesh, savefig, plot, xlabel, ylabel, ylim
)
from matplotlib import use
from numpy import (
    arange, int64, log10, mean as np_mean, sqrt, transpose, vstack
)
from scipy.signal import spectrogram

from ..abstract import Aggregator
from ..utils import _check_data_exists, _check_out_dir


class SpectrogramAggregator(Aggregator):
    """Compute a spectrogram on the audio input.

    A spectrogram shows how the spectrum of frequencies varies with time.
    This aggregator optionally produces two types of output: PNG files
    visualizing the audio track, and an array of numbers describing the
    spectrogram.

    Attributes:
        breaks (list): An increasing list of break points given by frame
            numbers. When given N+1 breaks, the aggregator will produce N
            outputs.
        spectrogram (bool): Should the numeric spectrogram be returned.
            Defaults to False.
        output_dir (str): Directory in which to store the output PNG files.
            Set to None (the default) to suppress the creation of output
            PNG files.
        name (str): A description of the aggregator. Used as a key in the
            output data.
    """

    name = "spectrogram"

    def __init__(self, **kwargs):
        """Store the configuration; ``breaks`` is a required keyword."""
        self.breaks = kwargs['breaks']
        self.spectrogram = kwargs.get('spectrogram', False)
        self.output_dir = kwargs.get('output_dir', None)

        super().__init__(**kwargs)

    def aggregate(self, ldframe, **kwargs):
        """Compute a spectrogram for each chunk of audio between breaks.

        If output_dir is not None, writes one PNG file per chunk, named
        ``frame-<chunk index>.png``, to that directory. If spectrogram is
        set to True, returns the numeric spectrograms; otherwise returns
        None.

        Args:
            ldframe (dict): Must contain the keys "meta", "audio" and
                "audiometa". The audio samples are read from
                ``ldframe['audio']['data']``, the sampling rate from
                ``ldframe['audiometa']['rate']`` and the frame rate from
                ``ldframe['meta']['fps']``.

        Returns:
            None, or (when ``self.spectrogram`` is true) a dictionary with
            the keys 'times', a list of the time in seconds of every
            spectrogram segment, and 'spectrogram', the transposed
            per-chunk spectrograms stacked vertically.
        """
        _check_data_exists(ldframe, ["meta", "audio", "audiometa"])

        if self.output_dir is not None:
            _check_out_dir(self.output_dir)
            # Switch matplotlib to the "template" backend before plotting.
            use("template")

        dta = ldframe['audio']['data'].values
        rate = ldframe['audiometa']['rate'].values[0]

        saved_times = []
        saved_specs = []
        for stime, audio, i in _audio_chunks(
            self.breaks, dta, rate, ldframe['meta']['fps']
        ):
            frequencies, times, spec = spectrogram(audio, fs=rate)

            if self.output_dir is not None:
                opath = join(self.output_dir, "frame-{0:06d}.png".format(i))
                # Offset by the exact chunk start so the x axis agrees with
                # the times returned below; truncating the start to whole
                # seconds mislabels chunks that begin mid-second.
                pcolormesh(times + stime, frequencies, 10 * log10(spec))
                xlabel("Time (seconds)")
                ylabel("Frequency")
                savefig(opath)
                close()

            if self.spectrogram:
                saved_times.extend(times + stime)
                # Transposed so the chunks can be stacked along the time
                # axis by vstack.
                saved_specs.append(transpose(spec))

        if self.spectrogram:
            return {
                'times': saved_times,
                'spectrogram': vstack(saved_specs)
            }

        return None


class PowerToneAggregator(Aggregator):
    """Measure loudness per chunk and optionally draw the waveform.

    For every chunk of audio between two consecutive breaks, the root mean
    square (RMS) of the samples is computed as a rough measure of how loud
    the chunk is. Optionally, a PNG plot of the sound wave over time is
    written for each chunk.

    Attributes:
        breaks (list): An increasing list of break points given by frame
            numbers. When given N+1 breaks, the aggregator will produce N
            outputs.
        output_dir (str): Directory in which to store the output PNG files.
            Set to None (the default) to suppress the creation of output
            PNG files.
        name (str): A description of the aggregator. Used as a key in the
            output data.
    """

    name = "power"

    def __init__(self, **kwargs):
        """Store the configuration; ``breaks`` is a required keyword."""
        self.breaks = kwargs['breaks']
        self.output_dir = kwargs.get('output_dir', None)

        super().__init__(**kwargs)

    def aggregate(self, ldframe, **kwargs):
        """Compute the RMS power of each chunk of audio between breaks.

        If output_dir is not None, also writes one PNG file per chunk,
        named ``frame-<chunk index>.png``, showing the amplitude over time.

        Returns:
            dict: Lists under the keys 'frame_start', 'frame_end' and
            'rms', with one entry per chunk.
        """
        _check_data_exists(ldframe, ["meta", "audio", "audiometa"])

        if self.output_dir is not None:
            _check_out_dir(self.output_dir)
            # Switch matplotlib to the "template" backend before plotting.
            use("template")

        samples = ldframe['audio']['data'].values
        sample_rate = ldframe['audiometa']['rate'].values[0]
        chunks = _audio_chunks(
            self.breaks, samples, sample_rate, ldframe['meta']['fps']
        )

        starts, ends, powers = [], [], []
        for offset, chunk, idx in chunks:
            if self.output_dir is not None:
                png_name = "frame-{0:06d}.png".format(idx)
                png_path = join(self.output_dir, png_name)

                # Position of every sample on the time axis, in seconds.
                seconds = arange(chunk.shape[0]) / sample_rate
                plot(seconds + offset, chunk, color='k')
                ylim([-32768, 32767])
                xlabel("Time (seconds)")
                ylabel("Amplitude")
                savefig(png_path)
                close()

            starts.append(self.breaks[idx])
            ends.append(self.breaks[idx + 1])

            # Convert to 64-bit integers before squaring.
            squared = int64(chunk) ** 2
            powers.append(sqrt(np_mean(squared)))

        return {'frame_start': starts, 'frame_end': ends, 'rms': powers}


def _audio_chunks(breaks, data, rate, fps): for i in range(len(breaks) - 1): time_start = float(breaks[i] / fps) time_end = float(breaks[i + 1] / fps) index_start = int(time_start * rate) index_end = int(time_end * rate) yield time_start, data[index_start:index_end], i