Musical Data Augmentation Example Code

See below, or download the raw file.
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Musical data augmentation example code accompanying the paper

    Jan Schlüter and Thomas Grill: Exploring Data Augmentation for
    Improved Singing Voice Detection with Neural Networks.
    In Proceedings of the 16th International Society for Music
    Information Retrieval Conference (ISMIR), Malaga, Spain, 2015.

    PDF: http://ofai.at/~jan.schlueter/pubs/2015_ismir.pdf
    BibTeX: http://ofai.at/~jan.schlueter/pubs/2015_ismir.bib
    Contact: http://ofai.at/~jan.schlueter

This code demonstrates how to incorporate any combination of six of the
augmentation methods in the paper into an existing training loop.

When run as a script, it will generate and display randomly chosen minibatches
with random pitch-shifting (+/- 30%), time-stretching (+/- 30%) and frequency
filtering (+/- 10 dB). If one or more audio files are supplied on the command
line, it will use those, otherwise it will download two files from the web.

When imported, you can use any of the functions for your own data.

Requirements:
- Python 2.7 or 3.4+
- Numpy, Scipy, Matplotlib
- ffmpeg or avconv



Feel free to use the code under the terms of the following license:

The MIT License (MIT)

Copyright (c) 2015 Jan Schlüter

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
"""

import sys
import os
import subprocess

import numpy as np
import scipy.ndimage
import matplotlib.pyplot as plt


def main():
    # 'parse' command line
    audio_files = sys.argv[1:]

    # download two files from the web if none were supplied
    if not audio_files:
        audio_files = download_audio_examples()

    # extract linear spectrograms from the audio files
    print("Precomputing linear spectra...")
    spects = [extract_spect(audio_file) for audio_file in audio_files]

    # invent frame-wise binary labels for them
    print("Inventing labels...")
    labels = [(np.random.rand(len(spect)) > .5) for spect in spects]

    # define demo training step function that just displays the excerpts
    # (in an actual experiment, this could instead be a neural network update
    # function compiled with Lasagne: https://github.com/Lasagne/Lasagne)
    print("Running demo loop...")

    def plot_data(inputs, targets):
        plt.figure()
        for i in range(min(16, len(inputs))):
            plt.subplot(4, 4, i+1)
            plt.imshow(inputs[i].T, interpolation='nearest', origin='lower',
                       cmap='hot')
        plt.show()

    # run demo training loop
    training_loop(spects, labels, train_fn=plot_data)


def training_loop(spects, labels, train_fn, steps=10, batchsize=16, frames=115,
                  sample_rate=22050):
    """
    Demonstrates running a training loop with musical data augmentation. The
    augmentation methods to use are hard-coded here (it's just a demo).

    Parameters
    ----------
    spects : list of ndarray
        Linear magnitude spectrograms, one per input file.
    labels : list of ndarray
        Binary frame-wise labels, one array per input file.
    train_fn : callable
        Function performing a training step. Will be called with a mini-batch
        of `batchsize` randomly augmented spectrogram excerpts of `frames`
        frames and the accompanying binary label
    steps : int
        Numer of training steps to be performed.
    batchsize : int
        Number of excerpts and labels to call `train_fn` with
    frames : int
        Length of spectrogram excerpts passed to `train_fn`, in frames
    sample_rate : float
        Sample rate of the spectrograms in `spects`, needed for mel filtering.
    """
    # We're first going to set up a generator for our mini-batches.
    # As we want to apply random time-stretching (up to 30%), we request longer
    # excerpts than we finally need to return.
    max_stretch = .3
    batches = grab_random_excerpts(spects, labels, batchsize=batchsize,
                                   frames=int(frames / (1 - max_stretch)))

    # We wrap the generator in another one that applies random time stretching
    # and pitch shifting, keeping a given number of frames and bins only.
    max_shift = .3
    bin_nyquist = spects[0].shape[1]
    bin_8k = bin_nyquist * 2 * 8000 // sample_rate  # frequency bin of 8 kHz
    batches = apply_random_stretch_shift(batches, max_stretch, max_shift,
                                         keep_frames=frames, keep_bins=bin_8k)

    # We transform the excerpts to logarithmic magnitude and mel frequency.
    frame_len = (spects[0].shape[1] - 1) * 2
    mel_bank = create_mel_filterbank(sample_rate, frame_len, 80, 27.5, 8000)
    mel_bank = mel_bank[:bin_8k]
    batches = apply_filterbank(batches, mel_bank)
    batches = apply_logarithm(batches)

    # We apply random frequency filters (we could have done that earlier, but
    # it's a bit more efficient in logarithmic-magnitude mel space)
    batches = apply_random_filters(batches, mel_bank, 8000, max_db=10)

    # We apply dropout (comment in to enable)
    # batches = apply_dropout(batches, p=.1)

    # We apply Gaussian noise (comment in to enable)
    # Note: In the paper, we did that after normalizing inputs to unit variance
    # batches = apply_gaussian_noise(batches, sigma=.1)

    # We start the mini-batch generator and augmenter in a background thread
    # (not required, can be disabled just like anything else)
    batches = generate_in_background(batches, num_cached=5)

    # Blending with negative examples (the seventh augmentation method in the
    # paper) is not shown here, as this requires a second generator yielding
    # only negative training examples. It would be easy to blend with fully
    # random examples, though.

    # We can now loop over the generated batches and call the training function
    for step, (inputs, targets) in enumerate(batches):
        if step == steps:
            break
        train_fn(inputs, targets)


def download_audio_examples():
    """
    Downloads two hard-coded files from the web, and returns the local file
    names they were saved under.
    """
    if sys.version_info[0] == 2:
        from urllib import urlretrieve
    else:
        from urllib.request import urlretrieve
    # just got archive.org URLs from the SALAMI dataset, should be long-lived
    urls = ['http://www.archive.org/download/rachel2004-09-18.flac16/'
            'rachel2004-09-18d1t6_vbr.mp3',
            'http://www.archive.org/download/chingus2005-03-25.flac16/'
            'chingus2005-03-25d1t05_vbr.mp3']
    result = []
    for url in urls:
        target = url.rsplit('/', 1)[-1]
        if not os.path.exists(target):
            print("Downloading " + url)
            try:
                urlretrieve(url, target)
            except:
                print("Download failed.")
        if os.path.exists(target):
            result.append(target)
    return result


def read_ffmpeg(infile, sample_rate, cmd='ffmpeg'):
    """
    Decodes a given audio file using ffmpeg, resampled to a given sample rate,
    downmixed to mono, and converted to float32 samples. Returns a numpy array.
    """
    call = [cmd, "-v", "quiet", "-i", infile, "-f", "f32le",
            "-ar", str(sample_rate), "-ac", "1", "pipe:1"]
    samples = subprocess.check_output(call)
    return np.frombuffer(samples, dtype=np.float32)


def extract_spect(filename, sample_rate=22050, frame_len=1024, fps=70):
    """
    Extracts a magnitude spectrogram for a given audio file at a given sample
    rate (in Hz), frame length (in samples) and frame rate (in Hz). Returns a
    numpy array.
    """
    try:
        samples = read_ffmpeg(filename, sample_rate)
    except Exception:
        samples = read_ffmpeg(filename, sample_rate, cmd='avconv')
    win = np.hanning(frame_len)
    hopsize = sample_rate // fps
    spect = np.vstack(np.abs(np.fft.rfft(samples[pos:pos + frame_len] * win))
                      for pos in range(0, len(samples) - frame_len + 1,
                                       int(hopsize)))
    return spect


def grab_random_excerpts(spects, labels, batchsize, frames):
    """
    Exctracts random excerpts of `frames` frames from the spectrograms in
    `spects` paired with the label from `labels` associated with the central
    frame. Yields `batchsize` excerpts at a time. Draws without replacement
    until exhausted, then shuffles anew and continues.
    """
    # buffer to write each batch into
    batch_spects = np.empty((batchsize, frames, spects[0].shape[1]),
                            dtype=spects[0].dtype)
    batch_labels = np.empty((batchsize,) + labels[0].shape[1:],
                            dtype=labels[0].dtype)
    # array of all possible (spect_idx, frame_idx) combinations
    indices = np.vstack(np.vstack((np.ones(len(spect) - frames + 1,
                                           dtype=np.int) * spect_idx,
                                   np.arange(len(spect) - frames + 1,
                                             dtype=np.int))).T
                        for spect_idx, spect in enumerate(spects))
    # infinite loop
    while True:
        # shuffle all possible indices
        np.random.shuffle(indices)
        # draw without replacement until exhausted
        b = 0
        for spect_idx, frame_idx in indices:
            batch_spects[b] = spects[spect_idx][frame_idx:frame_idx + frames]
            batch_labels[b] = labels[spect_idx][frame_idx + frames//2]
            b += 1
            if b == batchsize:
                yield batch_spects, batch_labels
                b = 0


def apply_random_stretch_shift(batches, max_stretch, max_shift,
                               keep_frames, keep_bins):
    """
    Apply random time stretching of up to +/- `max_stretch`, random pitch
    shifting of up to +/- `max_shift`, keeping the central `keep_frames` frames
    and the first `keep_bins` bins.
    """
    for spects, labels in batches:
        outputs = np.empty((len(spects), keep_frames, keep_bins),
                           dtype=spects.dtype)
        randoms = (np.random.rand(len(spects), 2) - .5) * 2
        for spect, output, random in zip(spects, outputs, randoms):
            stretch = 1 + random[0] * max_stretch
            shift = 1 + random[1] * max_shift
            # We can do shifting/stretching and cropping in a single affine
            # transform (including setting the upper bands to zero if we shift
            # down the signal so far that we exceed the input's nyquist rate)
            scipy.ndimage.affine_transform(
                    spect, (1 / stretch, 1 / shift),
                    output_shape=(keep_frames, keep_bins), output=output,
                    offset=(.5 * (len(spect) * stretch - keep_frames), 0),
                    mode='constant', cval=0)
        yield outputs, labels


def create_mel_filterbank(sample_rate, frame_len, num_bands, min_freq,
                          max_freq):
    """
    Creates a mel filterbank of `num_bands` triangular filters, with the first
    filter starting at `min_freq` and the last one stopping at `max_freq`.
    Returns the filterbank as a matrix suitable for a dot product against
    magnitude spectra created from samples at a sample rate of `sample_rate`
    with a window length of `frame_len` samples.
    """
    # prepare output matrix
    input_bins = (frame_len // 2) + 1
    filterbank = np.zeros((input_bins, num_bands))

    # mel-spaced peak frequencies
    min_mel = 1127 * np.log1p(min_freq / 700.0)
    max_mel = 1127 * np.log1p(max_freq / 700.0)
    spacing = (max_mel - min_mel) / (num_bands + 1)
    peaks_mel = min_mel + np.arange(num_bands + 2) * spacing
    peaks_hz = 700 * (np.exp(peaks_mel / 1127) - 1)
    fft_freqs = np.linspace(0, sample_rate / 2., input_bins)
    peaks_bin = np.searchsorted(fft_freqs, peaks_hz)

    # fill output matrix with triangular filters
    for b, filt in enumerate(filterbank.T):
        # The triangle starts at the previous filter's peak (peaks_freq[b]),
        # has its maximum at peaks_freq[b+1] and ends at peaks_freq[b+2].
        left_hz, top_hz, right_hz = peaks_hz[b:b+3]  # b, b+1, b+2
        left_bin, top_bin, right_bin = peaks_bin[b:b+3]
        # Create triangular filter compatible to yaafe
        filt[left_bin:top_bin] = ((fft_freqs[left_bin:top_bin] - left_hz) /
                                  (top_bin - left_bin))
        filt[top_bin:right_bin] = ((right_hz - fft_freqs[top_bin:right_bin]) /
                                   (right_bin - top_bin))
        filt[left_bin:right_bin] /= filt[left_bin:right_bin].sum()

    return filterbank


def apply_filterbank(batches, filterbank):
    """
    Apply a filterbank to batches of spectrogram excerpts via a dot product.
    """
    for spects, labels in batches:
        # we reshape (batchsize, frames, bins) to (batchsize * frames, bins) so
        # we can transform all excerpts in a single dot product, then go back
        # to (batchsize, frames, filters)
        yield (np.dot(spects.reshape(-1, spects.shape[-1]), filterbank).reshape(
                (spects.shape[0], spects.shape[1], -1)), labels)


def apply_logarithm(batches, clip=1e-5):
    """
    Convert linear to logarithmic magnitudes, clipping magnitudes < `clip`.
    """
    # Note: In the paper, we clipped below 1e-7. For visualization, 1e-5 is
    # better as it avoids extreme ranges of logarithmic magnitudes.
    for spects, labels in batches:
        yield np.log(np.maximum(spects, clip)), labels


def apply_random_filters(batches, filterbank, max_freq, max_db, min_std=5,
                         max_std=7):
    """
    Applies random filter responses to logarithmic-magnitude mel spectrograms.
    The filter response is a Gaussian of a standard deviation between `min_std`
    and `max_std` semitones, a mean between 150 Hz and `max_freq`, and a
    strength between -/+ `max_db` dezibel. Assumes the mel spectrograms have
    been transformed with `filterbank` and cover up to `max_freq` Hz.
    """
    for spects, labels in batches:
        batchsize, length, bands = spects.shape
        bins = len(filterbank)
        # sample means and std deviations on logarithmic pitch scale
        min_pitch = 12 * np.log2(150)
        max_pitch = 12 * np.log2(max_freq)
        mean = min_pitch + (np.random.rand(batchsize) *
                            (max_pitch - min_pitch))
        std = min_std + np.random.randn(batchsize) * (max_std - min_std)
        # convert means and std deviations to linear frequency scale
        std = 2**((mean + std) / 12) - 2**(mean / 12)
        mean = 2**(mean / 12)
        # convert means and std deviations to bins
        mean = mean * bins / max_freq
        std = std * bins / max_freq
        # sample strengths uniformly in dB
        strength = max_db * 2 * (np.random.rand(batchsize) - .5)
        # create Gaussians
        filt = (strength[:, np.newaxis] *
                np.exp(np.square((np.arange(bins) - mean[:, np.newaxis]) /
                                 std[:, np.newaxis]) * -.5))
        # transform from dB to factors
        filt = 10**(filt / 20.)
        # transform to mel scale
        filt = np.dot(filt, filterbank)
        # logarithmize
        filt = np.log(filt)
        # apply (it's a simple addition now, broadcasting over the second axis)
        yield spects + filt[:, np.newaxis, :], labels


def apply_dropout(batches, p):
    """
    Sets input elements to zero with probability `p` and scales the result by
    1/(1-`p`) so the average stays about the same.
    """
    for spects, labels in batches:
        yield spects * (np.random.rand(*spects.shape) > p) / (1 - p), labels


def apply_gaussian_noise(batches, sigma):
    """
    Adds Gaussian noise of standard deviation `sigma`.
    """
    for spects, labels in batches:
        yield (spects + np.asarray(np.random.randn(*spects.shape) * sigma,
                                   dtype=spects.dtype),
               labels)


def generate_in_background(generator, num_cached=50):
    """
    Runs any generator in a background thread, caching up to `num_cached` items.
    """
    import Queue
    queue = Queue.Queue(maxsize=num_cached)
    sentinel = object()  # guaranteed unique reference

    # define producer (putting items into queue)
    def producer():
        for item in generator:
            queue.put(item)
        queue.put(sentinel)

    # start producer (in a background thread)
    import threading
    thread = threading.Thread(target=producer)
    thread.daemon = True
    thread.start()

    # run as consumer (read items from queue, in current thread)
    item = queue.get()
    while item is not sentinel:
        yield item
        queue.task_done()
        item = queue.get()


if __name__ == "__main__":
    main()