```
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Musical data augmentation example code accompanying the paper
Jan Schlüter and Thomas Grill: Exploring Data Augmentation for
Improved Singing Voice Detection with Neural Networks.
In Proceedings of the 16th International Society for Music
Information Retrieval Conference (ISMIR), Malaga, Spain, 2015.
PDF: http://ofai.at/~jan.schlueter/pubs/2015_ismir.pdf
BibTeX: http://ofai.at/~jan.schlueter/pubs/2015_ismir.bib
Contact: http://ofai.at/~jan.schlueter
This code demonstrates how to incorporate any combination of six of the
augmentation methods in the paper into an existing training loop.
When run as a script, it will generate and display randomly chosen minibatches
with random pitch-shifting (+/- 30%), time-stretching (+/- 30%) and frequency
filtering (+/- 10 dB). If one or more audio files are supplied on the command
line, it will use those, otherwise it will download two files from the web.
When imported, you can use any of the functions for your own data.
Requirements:
- Python 2.7 or 3.4+
- Numpy, Scipy, Matplotlib
- ffmpeg or avconv
Feel free to use the code under the terms of the following license:
The MIT License (MIT)
Copyright (c) 2015 Jan Schlüter
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""
import sys
import os
import subprocess
import numpy as np
import scipy.ndimage
import matplotlib.pyplot as plt
def main():
# 'parse' command line
audio_files = sys.argv[1:]
# download two files from the web if none were supplied
if not audio_files:
audio_files = download_audio_examples()
# extract linear spectrograms from the audio files
print("Precomputing linear spectra...")
spects = [extract_spect(audio_file) for audio_file in audio_files]
# invent frame-wise binary labels for them
print("Inventing labels...")
labels = [(np.random.rand(len(spect)) > .5) for spect in spects]
# define demo training step function that just displays the excerpts
# (in an actual experiment, this could instead be a neural network update
# function compiled with Lasagne: https://github.com/Lasagne/Lasagne)
print("Running demo loop...")
def plot_data(inputs, targets):
plt.figure()
for i in range(min(16, len(inputs))):
plt.subplot(4, 4, i+1)
plt.imshow(inputs[i].T, interpolation='nearest', origin='lower',
cmap='hot')
plt.show()
# run demo training loop
training_loop(spects, labels, train_fn=plot_data)
def training_loop(spects, labels, train_fn, steps=10, batchsize=16, frames=115,
sample_rate=22050):
"""
Demonstrates running a training loop with musical data augmentation. The
augmentation methods to use are hard-coded here (it's just a demo).
Parameters
----------
spects : list of ndarray
Linear magnitude spectrograms, one per input file.
labels : list of ndarray
Binary frame-wise labels, one array per input file.
train_fn : callable
Function performing a training step. Will be called with a mini-batch
of `batchsize` randomly augmented spectrogram excerpts of `frames`
frames and the accompanying binary label
steps : int
Numer of training steps to be performed.
batchsize : int
Number of excerpts and labels to call `train_fn` with
frames : int
Length of spectrogram excerpts passed to `train_fn`, in frames
sample_rate : float
Sample rate of the spectrograms in `spects`, needed for mel filtering.
"""
# We're first going to set up a generator for our mini-batches.
# As we want to apply random time-stretching (up to 30%), we request longer
# excerpts than we finally need to return.
max_stretch = .3
batches = grab_random_excerpts(spects, labels, batchsize=batchsize,
frames=int(frames / (1 - max_stretch)))
# We wrap the generator in another one that applies random time stretching
# and pitch shifting, keeping a given number of frames and bins only.
max_shift = .3
bin_nyquist = spects[0].shape[1]
bin_8k = bin_nyquist * 2 * 8000 // sample_rate # frequency bin of 8 kHz
batches = apply_random_stretch_shift(batches, max_stretch, max_shift,
keep_frames=frames, keep_bins=bin_8k)
# We transform the excerpts to logarithmic magnitude and mel frequency.
frame_len = (spects[0].shape[1] - 1) * 2
mel_bank = create_mel_filterbank(sample_rate, frame_len, 80, 27.5, 8000)
mel_bank = mel_bank[:bin_8k]
batches = apply_filterbank(batches, mel_bank)
batches = apply_logarithm(batches)
# We apply random frequency filters (we could have done that earlier, but
# it's a bit more efficient in logarithmic-magnitude mel space)
batches = apply_random_filters(batches, mel_bank, 8000, max_db=10)
# We apply dropout (comment in to enable)
# batches = apply_dropout(batches, p=.1)
# We apply Gaussian noise (comment in to enable)
# Note: In the paper, we did that after normalizing inputs to unit variance
# batches = apply_gaussian_noise(batches, sigma=.1)
# We start the mini-batch generator and augmenter in a background thread
# (not required, can be disabled just like anything else)
batches = generate_in_background(batches, num_cached=5)
# Blending with negative examples (the seventh augmentation method in the
# paper) is not shown here, as this requires a second generator yielding
# only negative training examples. It would be easy to blend with fully
# random examples, though.
# We can now loop over the generated batches and call the training function
for step, (inputs, targets) in enumerate(batches):
if step == steps:
break
train_fn(inputs, targets)
def download_audio_examples():
"""
Downloads two hard-coded files from the web, and returns the local file
names they were saved under.
"""
if sys.version_info[0] == 2:
from urllib import urlretrieve
else:
from urllib.request import urlretrieve
# just got archive.org URLs from the SALAMI dataset, should be long-lived
urls = ['http://www.archive.org/download/rachel2004-09-18.flac16/'
'rachel2004-09-18d1t6_vbr.mp3',
'http://www.archive.org/download/chingus2005-03-25.flac16/'
'chingus2005-03-25d1t05_vbr.mp3']
result = []
for url in urls:
target = url.rsplit('/', 1)[-1]
if not os.path.exists(target):
print("Downloading " + url)
try:
urlretrieve(url, target)
except:
print("Download failed.")
if os.path.exists(target):
result.append(target)
return result
def read_ffmpeg(infile, sample_rate, cmd='ffmpeg'):
"""
Decodes a given audio file using ffmpeg, resampled to a given sample rate,
downmixed to mono, and converted to float32 samples. Returns a numpy array.
"""
call = [cmd, "-v", "quiet", "-i", infile, "-f", "f32le",
"-ar", str(sample_rate), "-ac", "1", "pipe:1"]
samples = subprocess.check_output(call)
return np.frombuffer(samples, dtype=np.float32)
def extract_spect(filename, sample_rate=22050, frame_len=1024, fps=70):
"""
Extracts a magnitude spectrogram for a given audio file at a given sample
rate (in Hz), frame length (in samples) and frame rate (in Hz). Returns a
numpy array.
"""
try:
samples = read_ffmpeg(filename, sample_rate)
except Exception:
samples = read_ffmpeg(filename, sample_rate, cmd='avconv')
win = np.hanning(frame_len)
hopsize = sample_rate // fps
spect = np.vstack(np.abs(np.fft.rfft(samples[pos:pos + frame_len] * win))
for pos in range(0, len(samples) - frame_len + 1,
int(hopsize)))
return spect
def grab_random_excerpts(spects, labels, batchsize, frames):
"""
Exctracts random excerpts of `frames` frames from the spectrograms in
`spects` paired with the label from `labels` associated with the central
frame. Yields `batchsize` excerpts at a time. Draws without replacement
until exhausted, then shuffles anew and continues.
"""
# buffer to write each batch into
batch_spects = np.empty((batchsize, frames, spects[0].shape[1]),
dtype=spects[0].dtype)
batch_labels = np.empty((batchsize,) + labels[0].shape[1:],
dtype=labels[0].dtype)
# array of all possible (spect_idx, frame_idx) combinations
indices = np.vstack(np.vstack((np.ones(len(spect) - frames + 1,
dtype=np.int) * spect_idx,
np.arange(len(spect) - frames + 1,
dtype=np.int))).T
for spect_idx, spect in enumerate(spects))
# infinite loop
while True:
# shuffle all possible indices
np.random.shuffle(indices)
# draw without replacement until exhausted
b = 0
for spect_idx, frame_idx in indices:
batch_spects[b] = spects[spect_idx][frame_idx:frame_idx + frames]
batch_labels[b] = labels[spect_idx][frame_idx + frames//2]
b += 1
if b == batchsize:
yield batch_spects, batch_labels
b = 0
def apply_random_stretch_shift(batches, max_stretch, max_shift,
keep_frames, keep_bins):
"""
Apply random time stretching of up to +/- `max_stretch`, random pitch
shifting of up to +/- `max_shift`, keeping the central `keep_frames` frames
and the first `keep_bins` bins.
"""
for spects, labels in batches:
outputs = np.empty((len(spects), keep_frames, keep_bins),
dtype=spects.dtype)
randoms = (np.random.rand(len(spects), 2) - .5) * 2
for spect, output, random in zip(spects, outputs, randoms):
stretch = 1 + random[0] * max_stretch
shift = 1 + random[1] * max_shift
# We can do shifting/stretching and cropping in a single affine
# transform (including setting the upper bands to zero if we shift
# down the signal so far that we exceed the input's nyquist rate)
scipy.ndimage.affine_transform(
spect, (1 / stretch, 1 / shift),
output_shape=(keep_frames, keep_bins), output=output,
offset=(.5 * (len(spect) * stretch - keep_frames), 0),
mode='constant', cval=0)
yield outputs, labels
def create_mel_filterbank(sample_rate, frame_len, num_bands, min_freq,
max_freq):
"""
Creates a mel filterbank of `num_bands` triangular filters, with the first
filter starting at `min_freq` and the last one stopping at `max_freq`.
Returns the filterbank as a matrix suitable for a dot product against
magnitude spectra created from samples at a sample rate of `sample_rate`
with a window length of `frame_len` samples.
"""
# prepare output matrix
input_bins = (frame_len // 2) + 1
filterbank = np.zeros((input_bins, num_bands))
# mel-spaced peak frequencies
min_mel = 1127 * np.log1p(min_freq / 700.0)
max_mel = 1127 * np.log1p(max_freq / 700.0)
spacing = (max_mel - min_mel) / (num_bands + 1)
peaks_mel = min_mel + np.arange(num_bands + 2) * spacing
peaks_hz = 700 * (np.exp(peaks_mel / 1127) - 1)
fft_freqs = np.linspace(0, sample_rate / 2., input_bins)
peaks_bin = np.searchsorted(fft_freqs, peaks_hz)
# fill output matrix with triangular filters
for b, filt in enumerate(filterbank.T):
# The triangle starts at the previous filter's peak (peaks_freq[b]),
# has its maximum at peaks_freq[b+1] and ends at peaks_freq[b+2].
left_hz, top_hz, right_hz = peaks_hz[b:b+3] # b, b+1, b+2
left_bin, top_bin, right_bin = peaks_bin[b:b+3]
# Create triangular filter compatible to yaafe
filt[left_bin:top_bin] = ((fft_freqs[left_bin:top_bin] - left_hz) /
(top_bin - left_bin))
filt[top_bin:right_bin] = ((right_hz - fft_freqs[top_bin:right_bin]) /
(right_bin - top_bin))
filt[left_bin:right_bin] /= filt[left_bin:right_bin].sum()
return filterbank
def apply_filterbank(batches, filterbank):
"""
Apply a filterbank to batches of spectrogram excerpts via a dot product.
"""
for spects, labels in batches:
# we reshape (batchsize, frames, bins) to (batchsize * frames, bins) so
# we can transform all excerpts in a single dot product, then go back
# to (batchsize, frames, filters)
yield (np.dot(spects.reshape(-1, spects.shape[-1]), filterbank).reshape(
(spects.shape[0], spects.shape[1], -1)), labels)
def apply_logarithm(batches, clip=1e-5):
"""
Convert linear to logarithmic magnitudes, clipping magnitudes < `clip`.
"""
# Note: In the paper, we clipped below 1e-7. For visualization, 1e-5 is
# better as it avoids extreme ranges of logarithmic magnitudes.
for spects, labels in batches:
yield np.log(np.maximum(spects, clip)), labels
def apply_random_filters(batches, filterbank, max_freq, max_db, min_std=5,
max_std=7):
"""
Applies random filter responses to logarithmic-magnitude mel spectrograms.
The filter response is a Gaussian of a standard deviation between `min_std`
and `max_std` semitones, a mean between 150 Hz and `max_freq`, and a
strength between -/+ `max_db` dezibel. Assumes the mel spectrograms have
been transformed with `filterbank` and cover up to `max_freq` Hz.
"""
for spects, labels in batches:
batchsize, length, bands = spects.shape
bins = len(filterbank)
# sample means and std deviations on logarithmic pitch scale
min_pitch = 12 * np.log2(150)
max_pitch = 12 * np.log2(max_freq)
mean = min_pitch + (np.random.rand(batchsize) *
(max_pitch - min_pitch))
std = min_std + np.random.randn(batchsize) * (max_std - min_std)
# convert means and std deviations to linear frequency scale
std = 2**((mean + std) / 12) - 2**(mean / 12)
mean = 2**(mean / 12)
# convert means and std deviations to bins
mean = mean * bins / max_freq
std = std * bins / max_freq
# sample strengths uniformly in dB
strength = max_db * 2 * (np.random.rand(batchsize) - .5)
# create Gaussians
filt = (strength[:, np.newaxis] *
np.exp(np.square((np.arange(bins) - mean[:, np.newaxis]) /
std[:, np.newaxis]) * -.5))
# transform from dB to factors
filt = 10**(filt / 20.)
# transform to mel scale
filt = np.dot(filt, filterbank)
# logarithmize
filt = np.log(filt)
# apply (it's a simple addition now, broadcasting over the second axis)
yield spects + filt[:, np.newaxis, :], labels
def apply_dropout(batches, p):
"""
Sets input elements to zero with probability `p` and scales the result by
1/(1-`p`) so the average stays about the same.
"""
for spects, labels in batches:
yield spects * (np.random.rand(*spects.shape) > p) / (1 - p), labels
def apply_gaussian_noise(batches, sigma):
"""
Adds Gaussian noise of standard deviation `sigma`.
"""
for spects, labels in batches:
yield (spects + np.asarray(np.random.randn(*spects.shape) * sigma,
dtype=spects.dtype),
labels)
def generate_in_background(generator, num_cached=50):
"""
Runs any generator in a background thread, caching up to `num_cached` items.
"""
import Queue
queue = Queue.Queue(maxsize=num_cached)
sentinel = object() # guaranteed unique reference
# define producer (putting items into queue)
def producer():
for item in generator:
queue.put(item)
queue.put(sentinel)
# start producer (in a background thread)
import threading
thread = threading.Thread(target=producer)
thread.daemon = True
thread.start()
# run as consumer (read items from queue, in current thread)
item = queue.get()
while item is not sentinel:
yield item
queue.task_done()
item = queue.get()
if __name__ == "__main__":
main()
```