Source code for audiomate.processing.pipeline.varia

import librosa
import numpy as np

from . import base


class Delta(base.Computation):
    """
    Compute delta features.

    See http://librosa.github.io/librosa/generated/librosa.feature.delta.html
    """

    def __init__(self, width=9, order=1, axis=0, mode='interp', parent=None, name=None):
        needed_context = int(width / 2.0)

        super(Delta, self).__init__(parent=parent, name=name,
                                    min_frames=needed_context + 1,
                                    left_context=needed_context,
                                    right_context=needed_context)

        self.width = width
        self.order = order
        self.axis = axis
        self.mode = mode

    def compute(self, chunk, sampling_rate, corpus=None, utterance=None):
        # Map the configured axis onto the transposed data and let librosa
        # compute the deltas along the time axis.
        axis = len(chunk.data.shape) - self.axis - 1
        output = librosa.feature.delta(chunk.data.T, width=self.width, order=self.order,
                                       axis=axis, mode=self.mode).T

        # Trim the context frames that were only needed to compute the deltas.
        return output[chunk.left_context:chunk.data.shape[0] - chunk.right_context]
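
# A minimal usage sketch of Delta applied directly to a chunk of features. It
# assumes that `Chunk` is importable from `audiomate.processing.pipeline` (as in
# the AddContext docstring below) and that a manually built chunk has zero
# left/right context:
#
#     >>> import numpy as np
#     >>> from audiomate.processing.pipeline import Delta, Chunk
#     >>> frames = np.random.rand(20, 13)  # 20 frames of 13-dimensional features
#     >>> chunk = Chunk(frames, offset=0, is_last=True)
#     >>> Delta(width=9, order=1).compute(chunk, 16000).shape
#     (20, 13)
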
class AddContext(base.Computation):
    """
    For every frame, add context frames from the left and/or right.
    For frames at the beginning and end of a sequence, where no context is available, zeros are used.

    Args:
        left_frames (int): Number of previous frames to prepend to a frame.
        right_frames (int): Number of subsequent frames to append to a frame.

    Example:
        >>> input = np.array([
        >>>     [1, 2, 3],
        >>>     [4, 5, 6],
        >>>     [7, 8, 9]
        >>> ])
        >>> chunk = Chunk(input, offset=0, is_last=True)
        >>> AddContext(left_frames=1, right_frames=1).compute(chunk, 16000)
        array([[0, 0, 0, 1, 2, 3, 4, 5, 6],
               [1, 2, 3, 4, 5, 6, 7, 8, 9],
               [4, 5, 6, 7, 8, 9, 0, 0, 0]])
    """

    def __init__(self, left_frames, right_frames, parent=None, name=None):
        super(AddContext, self).__init__(parent=parent, name=name,
                                         min_frames=1,
                                         left_context=left_frames,
                                         right_context=right_frames)

        self.left_frames = left_frames
        self.right_frames = right_frames

    def compute(self, chunk, sampling_rate, corpus=None, utterance=None):
        context = []

        # Shifted copies of the data for the left context,
        # zero-padded at the beginning where no previous frames exist.
        for shift in range(self.left_frames, 0, -1):
            shift_context = chunk.data[:chunk.data.shape[0] - shift]
            pad_widths = [[0, 0] for __ in range(len(shift_context.shape))]
            pad_widths[0][0] = shift
            shift_context = np.pad(shift_context, pad_widths, mode='constant', constant_values=0)
            context.append(shift_context)

        context.append(chunk.data)

        # Shifted copies of the data for the right context,
        # zero-padded at the end where no subsequent frames exist.
        for i in range(self.right_frames):
            shift = i + 1
            shift_context = chunk.data[shift:]
            pad_widths = [[0, 0] for __ in range(len(shift_context.shape))]
            pad_widths[0][1] = shift
            shift_context = np.pad(shift_context, pad_widths, mode='constant', constant_values=0)
            context.append(shift_context)

        # Concatenate all shifted copies feature-wise and trim the chunk's context frames.
        stacked = np.hstack(context)

        return stacked[chunk.left_context:chunk.data.shape[0] - chunk.right_context]
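
# A minimal pipeline sketch, assuming the usual audiomate pattern of chaining
# computation steps through the `parent` argument seen in both constructors
# above. How the chained steps are ultimately executed depends on
# `base.Computation` and is not shown here:
#
#     >>> deltas = Delta(width=9, order=1)
#     >>> with_context = AddContext(left_frames=2, right_frames=2, parent=deltas)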