import abc
import librosa
import numpy as np
from audiomate.corpus import assets
from audiomate.utils import units
class Processor(metaclass=abc.ABCMeta):
"""
This class is the base class for all kinds of feature extraction.
The processor computes features for a given corpus and stores them in a feature-container.
To implement a specific processor, the ``process_utterance`` method has to be implemented:
* The method is called for every utterance in the corpus.
* Inside the method, any feature extraction / pre-processing can be done.
* The result has to be saved in the feature-container that is passed along with the utterance,
using the id of the utterance, which is also passed as an argument.
Example:
>>> import audiomate
>>> from audiomate.corpus.preprocessing.pipeline import offline
>>>
>>> ds = audiomate.Corpus.load('some/corpus/path')
>>> mfcc_processor = offline.MFCC(n_mfcc=13, n_mels=128)
>>> norm_processor = offline.MeanVarianceNorm(mean=5.4, variance=2.3, parent=mfcc_processor)
>>>
>>> fc = norm_processor.process_corpus(ds, output_path='path/mfcc_features.h5', frame_size=400, hop_size=160)
>>> fc
<audiomate.corpus.assets.features.FeatureContainer at 0x10d451a20>
>>> fc.open()
>>> fc.get('existing-utterance-id')[()]
array([[-6.18418212, 3.93379946, 2.51237535, 3.62199459, -6.77845303,
3.28746939, 1.36316432, -0.7814685 , -2.36003147, 3.27370797,
-3.24373709, -2.42513017, -1.55695699],
...
"""
def process_corpus(self, corpus, output_path, frame_size=400, hop_size=160, sr=None):
"""
Process the given corpus and save the processed features in a feature-container at the given path.
Args:
corpus (Corpus): The corpus to process the utterances from.
output_path (str): A path to save the feature-container to.
frame_size (int): The number of samples per frame.
hop_size (int): The number of samples between two frames.
sr (int): Use the given sampling rate. If ``None``, the native sampling rate of the file is used.
Returns:
FeatureContainer: The feature-container containing the processed features.
"""
feat_container = assets.FeatureContainer(output_path)
feat_container.open()
sampling_rate = -1
for utterance in corpus.utterances.values():
utt_sampling_rate = utterance.sampling_rate
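# Without an explicit target rate, all utterances must share the same native sampling-rate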
if sr is None:
if sampling_rate > 0 and sampling_rate != utt_sampling_rate:
raise ValueError(
'File {} has a different sampling-rate than the previous ones!'.format(utterance.file.idx))
sampling_rate = utt_sampling_rate
self.process_utterance(utterance, feat_container,
corpus=corpus,
frame_size=frame_size,
hop_size=hop_size,
sr=sr)
feat_container.frame_size = frame_size
feat_container.hop_size = hop_size
feat_container.sampling_rate = sr or sampling_rate
feat_container.close()
return feat_container
def process_corpus_from_feature_container(self, corpus, input_features, output_path):
"""
Process the given corpus and save the processed features in a feature-container at the given path.
Instead of using the framed signal, the features from the given feature-container are used.
Args:
corpus (Corpus): The corpus to process the utterances from.
input_features (FeatureContainer): The feature-container to process the frames from.
output_path (str): A path to save the feature-container to.
Returns:
FeatureContainer: The feature-container containing the processed features.
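Example:
A minimal sketch, assuming ``'path/mfcc_features.h5'`` is an existing feature-container and
that ``offline.MeanVarianceNorm`` can be used as a standalone processor (paths are placeholders):
>>> import audiomate
>>> from audiomate.corpus import assets
>>> from audiomate.corpus.preprocessing.pipeline import offline
>>>
>>> ds = audiomate.Corpus.load('some/corpus/path')
>>> mfcc_features = assets.FeatureContainer('path/mfcc_features.h5')
>>> norm_processor = offline.MeanVarianceNorm(mean=5.4, variance=2.3)
>>> fc = norm_processor.process_corpus_from_feature_container(ds, mfcc_features, output_path='path/norm_features.h5')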
"""
feat_container = assets.FeatureContainer(output_path)
feat_container.open()
input_features.open()
for utterance in corpus.utterances.values():
self.process_utterance_from_feature_container(utterance, input_features, feat_container, corpus)
feat_container.frame_size = input_features.frame_size
feat_container.hop_size = input_features.hop_size
feat_container.sampling_rate = input_features.sampling_rate
feat_container.close()
return feat_container
@abc.abstractmethod
def process_utterance(self, utterance, feature_container, corpus=None, frame_size=400, hop_size=160, sr=None):
"""
Extract features from the given utterance and store them in the given feature-container.
Args:
utterance (Utterance): The utterance to process.
feature_container (FeatureContainer): The feature-container to store the output.
corpus (Corpus): The corpus where the utterance is from, if available.
frame_size (int): The number of samples per frame.
hop_size (int): The number of samples between two frames.
sr (int): Use the given sampling rate. If ``None``, the native sampling rate of the file is used.
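Example:
A minimal sketch of an implementation that stores the raw samples as a single frame
(``RawSamples`` is a hypothetical name, only this method is shown):
>>> class RawSamples(Processor):
...     def process_utterance(self, utterance, feature_container, corpus=None,
...                           frame_size=400, hop_size=160, sr=None):
...         samples = utterance.read_samples(sr=sr)
...         feature_container.set(utterance.idx, samples.reshape(1, -1))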
"""
pass
@abc.abstractmethod
def process_utterance_from_feature_container(self, utterance, in_feat_container, out_feat_container, corpus=None):
"""
Process the features of the given utterance, read from the given input feature-container,
and store the result in the given output feature-container.
Args:
utterance (Utterance): The utterance to process.
in_feat_container (FeatureContainer): The feature-container to read the input frames.
out_feat_container (FeatureContainer): The feature-container to store the output.
corpus (Corpus): The corpus where the utterance is from, if available.
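Example:
A minimal sketch of an implementation that copies the input features unchanged
(``Passthrough`` is a hypothetical name, only this method is shown):
>>> class Passthrough(Processor):
...     def process_utterance_from_feature_container(self, utterance, in_feat_container,
...                                                  out_feat_container, corpus=None):
...         frames = in_feat_container.get(utterance.idx, mem_map=False)
...         out_feat_container.set(utterance.idx, frames)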
"""
pass
class OfflineProcessor(Processor, metaclass=abc.ABCMeta):
"""
This class should be used for feature extraction in batch mode (one full utterance per step).
To implement a specific offline-processor, the ``process_sequence`` method has to be implemented:
* As input, the method receives a two-dimensional array of frames (n-frames x n-samples-per-frame).
* It must return an array whose first dimension has the same size as the input.
Note:
The samples are padded with zeros so that the number of frames equals
``math.ceil((num_samples - frame_size) / hop_size + 1)``.
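For example, with ``frame_size=400``, ``hop_size=160`` and an utterance of 1000 samples,
``math.ceil((1000 - 400) / 160 + 1) = 5`` frames are produced and the signal is padded
to ``(5 - 1) * 160 + 400 = 1040`` samples.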
"""
def process_utterance(self, utterance, feature_container, corpus=None, frame_size=400, hop_size=160, sr=None):
frame_settings = units.FrameSettings(frame_size, hop_size)
samples = utterance.read_samples(sr=sr)
if samples.size <= 0:
raise ValueError('Utterance {} has no samples'.format(utterance.idx))
# Pad with zeros to match frames
num_frames = frame_settings.num_frames(samples.size)
num_pad_samples = (num_frames - 1) * hop_size + frame_size
if num_pad_samples > samples.size:
samples = np.pad(samples, (0, num_pad_samples - samples.size), mode='constant', constant_values=0)
# Get sampling-rate if not given
sampling_rate = sr or utterance.sampling_rate
frames = librosa.util.frame(samples, frame_length=frame_size, hop_length=hop_size).T
processed = self.process_sequence(frames, sampling_rate, utterance=utterance, corpus=corpus)
feature_container.set(utterance.idx, processed)
def process_utterance_from_feature_container(self, utterance, in_feat_container, out_feat_container, corpus=None):
sampling_rate = in_feat_container.sampling_rate
frames = in_feat_container.get(utterance.idx, mem_map=False)
processed = self.process_sequence(frames, sampling_rate, utterance=utterance, corpus=corpus)
out_feat_container.set(utterance.idx, processed)
@abc.abstractmethod
def process_sequence(self, frames, sampling_rate, utterance=None, corpus=None):
"""
Process the given frames, which represent an utterance.
Args:
frames (numpy.ndarray): (n-frames x n-samples-per-frame) Frames.
sampling_rate (int): The sampling rate of the underlying signal.
corpus (Corpus): The corpus where the data is from, if available.
utterance (Utterance): The utterance the data is from, if available.
Returns:
numpy.ndarray: (n-frames x ...) The features extracted from the given frames.
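Example:
A minimal sketch of a subclass that computes the energy of every frame
(``FrameEnergy`` is a hypothetical name, not part of the library):
>>> import numpy as np
>>>
>>> class FrameEnergy(OfflineProcessor):
...     def process_sequence(self, frames, sampling_rate, utterance=None, corpus=None):
...         return np.sum(frames ** 2, axis=1, keepdims=True)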
"""
pass