# Source code for audiomate.encoding.frame_based

import numpy as np

from . import base
from audiomate.utils import misc


class FrameHotEncoder(base.Encoder):
    """
    The FrameHotEncoder is used to encode the labels per frame.
    It creates a matrix with dimension num-frames x len(labels).
    The vector (2nd dim) has an entry for every label in the passed labels-list.
    If the sequence contains a given label within a frame it is set to 1.

    Arguments:
        labels (list): List of labels (str) which should be included in the vector representation.
        label_list_idx (str): The name of the label-list to use for encoding.
                              Only labels of this label-list are considered.
        frame_settings (FrameSettings): Frame settings to use.
        sr (int): The sampling rate used, if None it is assumed the native sampling rate from the file is used.

    Example:
        >>> from audiomate import annotations
        >>> from audiomate.utils import units
        >>>
        >>> ll = annotations.LabelList(idx='test', labels=[
        >>>     annotations.Label('music', 0, 2),
        >>>     annotations.Label('speech', 2, 5),
        >>>     annotations.Label('noise', 4, 6),
        >>>     annotations.Label('music', 6, 8)
        >>> ])
        >>> utt.set_label_list(ll)
        >>>
        >>> labels = ['speech', 'music', 'noise']
        >>> fs = units.FrameSettings(16000, 16000)
        >>> encoder = FrameHotEncoder(labels, 'test', frame_settings=fs, sr=16000)
        >>> encoder.encode_utterance(utt)
        array([
            [0, 1, 0],
            [0, 1, 0],
            [1, 0, 0],
            [1, 0, 0],
            [1, 0, 1],
            [0, 0, 1],
            [0, 1, 0],
            [0, 1, 0]
        ])

    """

    def __init__(self, labels, label_list_idx, frame_settings, sr=None):
        self.labels = labels
        self.label_list_idx = label_list_idx
        self.frame_settings = frame_settings
        self.sr = sr

    def encode_utterance(self, utterance, corpus=None):
        """
        Encode the labels of the given utterance as a frame x label matrix.

        Arguments:
            utterance: The utterance to encode. Its label-list named
                       ``label_list_idx`` is read.
            corpus: Not used by this encoder.

        Returns:
            numpy.ndarray: Float matrix of shape (num-frames, len(labels)),
            holding 1 where a label covers a frame and 0 elsewhere.

        Raises:
            ValueError: If the utterance has no label-list with the configured idx.
        """
        # Fail fast, before any sample/frame counts are requested from the utterance.
        if self.label_list_idx not in utterance.label_lists:
            raise ValueError('Utterance {} has no label-list with idx {}'.format(utterance.idx, self.label_list_idx))

        sr = self.sr or utterance.sampling_rate
        num_samples = utterance.num_samples(sr=sr)
        num_frames = self.frame_settings.num_frames(num_samples)

        # Column lookup built once per call; the first occurrence wins so that
        # duplicated entries in ``labels`` resolve exactly like ``list.index``.
        column_of = {}
        for column, value in enumerate(self.labels):
            column_of.setdefault(value, column)

        mat = np.zeros((num_frames, len(self.labels)))

        for label in utterance.label_lists[self.label_list_idx]:
            column = column_of.get(label.value)

            if column is None:
                # Label value is not part of the requested vector representation.
                continue

            open_ended = label.end == float('inf')

            if not open_ended:
                label_end = label.end
            elif utterance.end == float('inf'):
                label_end = utterance.track.duration
            else:
                label_end = utterance.duration

            start, end = self.frame_settings.time_range_to_frame_range(label.start, label_end, sr)

            # If label ends at the end of the utterance, it covers all remaining frames.
            if open_ended:
                end = num_frames

            mat[start:end, column] = 1

        return mat


class FrameOrdinalEncoder(base.Encoder):
    """
    The FrameOrdinalEncoder is used to encode the labels per frame.
    It creates a vector with length num-frames.
    For every frame sets the index of the label that is present for that frame.
    If multiple labels are present, the one with the longest overlap within the frame is selected.
    If multiple labels have the same length the smaller index is selected, hence
    the passed `labels` list acts as a priority.

    Note:
        A frame that is not covered by any of the given labels is encoded as
        index 0, since the index of the first maximum is returned.

    Arguments:
        labels (list): List of labels (str) which should be included in the vector representation.
        label_list_idx (str): The name of the label-list to use for encoding.
                              Only labels of this label-list are considered.
        frame_settings (FrameSettings): Frame settings to use.
        sr (int): The sampling rate used, if None it is assumed the native sampling rate from the file is used.

    Example:
        >>> from audiomate import annotations
        >>> from audiomate.utils import units
        >>>
        >>> ll = annotations.LabelList(idx='test', labels=[
        >>>     annotations.Label('music', 0, 2),
        >>>     annotations.Label('speech', 2, 5),
        >>>     annotations.Label('noise', 4, 6),
        >>>     annotations.Label('music', 6, 8)
        >>> ])
        >>> utt.set_label_list(ll)
        >>>
        >>> labels = ['speech', 'music', 'noise']
        >>> fs = units.FrameSettings(16000, 16000)
        >>> encoder = FrameOrdinalEncoder(labels, 'test', frame_settings=fs)
        >>> encoder.encode_utterance(utt)
        array([1,1,0,0,0,2,1,1])
    """

    def __init__(self, labels, label_list_idx, frame_settings, sr=None):
        self.labels = labels
        self.label_list_idx = label_list_idx
        self.frame_settings = frame_settings
        self.sr = sr

    def encode_utterance(self, utterance, corpus=None):
        """
        Encode the labels of the given utterance as one label index per frame.

        Arguments:
            utterance: The utterance to encode. Its label-list named
                       ``label_list_idx`` is read.
            corpus: Not used by this encoder.

        Returns:
            numpy.ndarray: Vector of length num-frames with, per frame, the index
            (into ``labels``) of the label with the longest overlap.

        Raises:
            ValueError: If the utterance has no label-list with the configured idx.
        """
        # Fail fast, before any sample/frame counts are requested from the utterance.
        if self.label_list_idx not in utterance.label_lists:
            raise ValueError('Utterance {} has no label-list with idx {}'.format(utterance.idx, self.label_list_idx))

        sr = self.sr or utterance.sampling_rate
        num_samples = utterance.num_samples(sr=sr)
        num_frames = self.frame_settings.num_frames(num_samples)

        # Column lookup built once per call; the first occurrence wins so that
        # duplicated entries in ``labels`` resolve exactly like ``list.index``.
        column_of = {}
        for column, value in enumerate(self.labels):
            column_of.setdefault(value, column)

        # Per frame and label: the longest overlap (in seconds) seen so far.
        mat = np.zeros((num_frames, len(self.labels)))

        for label in utterance.label_lists[self.label_list_idx]:
            label_index = column_of.get(label.value)

            if label_index is None:
                # Label value is not part of the requested vector representation.
                continue

            open_ended = label.end == float('inf')

            if not open_ended:
                label_end = label.end
            elif utterance.end == float('inf'):
                label_end = utterance.track.duration
            else:
                label_end = utterance.duration

            start, end = self.frame_settings.time_range_to_frame_range(label.start, label_end, sr)

            # If label ends at the end of the utterance, it covers all remaining frames.
            if open_ended:
                end = num_frames

            for frame_index in range(start, min(end, num_frames)):
                frame_start, frame_end = self.frame_settings.frame_to_seconds(frame_index, sr=sr)
                overlap = misc.length_of_overlap(frame_start, frame_end, label.start, label.end)

                # Keep the longest overlap: several labels with the same value may
                # touch one frame, and a later short one must not replace an
                # earlier longer one.
                mat[frame_index, label_index] = max(mat[frame_index, label_index], overlap)

        # argmax returns the first maximum, so ties resolve to the smaller index.
        return np.argmax(mat, axis=1)