# Source code for audiomate.feeding.iterator

import bisect
import math
import random

import numpy as np

import audiomate
from audiomate import containers
from . import partitioning


class DataIterator:
    """
    An abstract class representing a data-iterator. A data-iterator provides sequential access to data.
    An implementation of a concrete data-iterator should override the methods ``__iter__`` and ``__next__``.

    A sample returned from a data-iterator contains the data for this sample from every container.
    The data from different containers is ordered in the way the containers were passed to the DataIterator.

    Args:
        corpus_or_utt_ids (Corpus, list): Either a corpus or a list of utterances.
                                          This defines which utterances are considered for iterating.
        feature_containers (list, Container): A single container or a list of containers.
        shuffle (bool): Indicates whether the data should be returned in
                        random order (``True``) or not (``False``).
        seed (int): Seed to be used for the random number generator.

    Raises:
        ValueError: If an empty list of containers is passed.
    """

    def __init__(self, corpus_or_utt_ids, feature_containers, shuffle=True, seed=None):
        # A corpus(-view) is reduced to the ids of its utterances,
        # anything else is taken as the collection of utterance-ids as it is.
        if isinstance(corpus_or_utt_ids, audiomate.corpus.CorpusView):
            self.utt_ids = list(corpus_or_utt_ids.utterances.keys())
        else:
            self.utt_ids = corpus_or_utt_ids

        # Internally we always work with a list of containers, so a single container is wrapped.
        if isinstance(feature_containers, containers.Container):
            self.containers = [feature_containers]
        else:
            self.containers = feature_containers

        if len(self.containers) == 0:
            raise ValueError('At least one container has to be provided!')

        self.shuffle = shuffle

        # Own random number generator, so the global ``random`` state is neither used nor modified.
        self.rand = random.Random(seed)

    def __iter__(self):
        raise NotImplementedError

    def __next__(self):
        raise NotImplementedError


class MultiFrameIterator(DataIterator):
    """
    A data-iterator wrapping chunks of subsequent frames of a corpus.
    A single sample represents a chunk of frames.

    Args:
        corpus_or_utt_ids (Corpus, list): Either a corpus or a list of utterances.
                                          This defines which utterances are considered for iterating.
        container (list, Container): A single container or a list of containers.
        partition_size (str): Size of the partitions in bytes. The units ``k`` (kibibytes), ``m`` (mebibytes) and ``g``
                              (gibibytes) are supported, i.e. a ``partition_size`` of ``1g`` equates :math:`2^{30}`
                              bytes.
        frames_per_chunk (int): Number of subsequent frames in a single sample.
        return_length (bool): If True, the length of the chunk is returned as well. (default ``False``)
                              The length is appended to the sample as the last element.
                              (e.g. [container1-data, container2-data, length])
        pad (bool): If True, samples that are shorter are padded with zeros to match ``frames_per_chunk``.
                    If padding is enabled, the lengths are always returned (``return_length = True``).
        shuffle (bool): Indicates whether the data should be returned in
                        random order (``True``) or not (``False``).
        seed (int): Seed to be used for the random number generator.

    Note:
        For a MultiFrameIterator it is expected that every container contains exactly one value/vector for every frame.
        So the first dimension (outermost) of every array in every container have to match.

    Example:
        >>> corpus = audiomate.Corpus.load('/path/to/corpus')
        >>> container_inputs = containers.FeatureContainer('/path/to/features.hdf5')
        >>> container_outputs = containers.Container('/path/to/targets.hdf5')
        >>>
        >>> ds = MultiFrameIterator(corpus, [container_inputs, container_outputs], '1G', 5, shuffle=True, seed=23)
        >>> next(ds) # Next Chunk [inputs, outputs]
        [
            array([[0.72991909, 0.20258683, 0.30574747, 0.53783217],
                   [0.38875413, 0.83611128, 0.49054591, 0.15710017],
                   [0.35153358, 0.40051009, 0.93647765, 0.29589257],
                   [0.97465772, 0.80160451, 0.81871436, 0.4892925 ],
                   [0.59310933, 0.8565602 , 0.95468696, 0.07933512]]),
            array([[0.0, 1.0], [0.0, 1.0],[0.0, 1.0],[0.0, 1.0], [0.0, 1.0]])
        ]
    """

    def __init__(self, corpus_or_utt_ids, container, partition_size, frames_per_chunk, return_length=False,
                 pad=False, shuffle=True, seed=None):
        super(MultiFrameIterator, self).__init__(corpus_or_utt_ids, container, shuffle=shuffle, seed=seed)

        self.partition_size = partition_size
        self.frames_per_chunk = frames_per_chunk
        self.pad = pad

        # Padded chunks are always delivered together with their real length.
        if self.pad:
            self.return_length = True
        else:
            self.return_length = return_length

        # State of the iteration: the currently loaded partition and the position within it.
        self.current_partition = None
        self.current_partition_index = -1
        self.current_chunk_index = 0

        # The loader gets its seed from our own generator,
        # so a single ``seed`` makes the whole iteration reproducible.
        self.loader = partitioning.PartitioningContainerLoader(self.utt_ids,
                                                               self.containers,
                                                               self.partition_size,
                                                               shuffle=self.shuffle,
                                                               seed=self.rand.random())

    def __iter__(self):
        """ Reset the iteration state, reload the loader and return the iterator itself. """
        self.current_partition = None
        self.current_partition_index = -1
        self.current_chunk_index = 0

        self.loader.reload()

        return self

    def __next__(self):
        """
        Return the next chunk.

        Returns:
            list: The data of the chunk for every container (and the length as last element,
            if ``return_length`` is enabled).

        Raises:
            StopIteration: If all chunks of all partitions were returned.
        """
        # Load partitions until one with unread chunks is found.
        # A loop is required, since a partition may not contain any chunk at all.
        while self.current_partition is None or self.current_chunk_index >= len(self.current_partition):
            self.current_partition_index += 1
            self.current_chunk_index = 0

            if self.current_partition_index >= len(self.loader.partitions):
                # Drop the exhausted partition. Otherwise a further call would
                # start to return the chunks of the last partition again,
                # instead of raising StopIteration as required for iterators.
                self.current_partition = None
                raise StopIteration

            partition_data = self.loader.load_partition_data(self.current_partition_index)
            self.current_partition = MultiFramePartitionData(partition_data,
                                                             self.frames_per_chunk,
                                                             return_length=self.return_length,
                                                             pad=self.pad,
                                                             shuffle=self.shuffle,
                                                             seed=self.rand.random())

        next_chunk = self.current_partition[self.current_chunk_index]
        self.current_chunk_index += 1

        return next_chunk


class FrameIterator(MultiFrameIterator):
    """
    A data-iterator wrapping frames of a corpus. A single sample represents a single frame.
    It is a ``MultiFrameIterator`` with one frame per chunk, without padding and without returning lengths.

    Args:
        corpus_or_utt_ids (Corpus, list): Either a corpus or a list of utterances.
                                          This defines which utterances are considered for iterating.
        container (list, Container): A single container or a list of containers.
        partition_size (str): Size of the partitions in bytes. The units ``k`` (kibibytes), ``m`` (mebibytes) and ``g``
                              (gibibytes) are supported, i.e. a ``partition_size`` of ``1g`` equates :math:`2^{30}`
                              bytes.
        shuffle (bool): Indicates whether the data should be returned in
                        random order (``True``) or not (``False``).
        seed (int): Seed to be used for the random number generator.

    Note:
        For a FrameIterator it is expected that every container contains exactly one value/vector for every frame.
        So the first dimension of every array in every container have to match.

    Example:
        >>> corpus = audiomate.Corpus.load('/path/to/corpus')
        >>> container_inputs = containers.FeatureContainer('/path/to/features.hdf5')
        >>> container_outputs = containers.Container('/path/to/targets.hdf5')
        >>>
        >>> ds = FrameIterator(corpus, [container_inputs, container_outputs], '1G', shuffle=True, seed=23)
        >>> next(ds) # Next Frame [inputs, outputs]
        [
            array([0.58843831, 0.18128443, 0.19718328, 0.25284105]),
            array([0.0, 1.0])
        ]
    """

    def __init__(self, corpus_or_utt_ids, container, partition_size, shuffle=True, seed=None):
        # A frame is handled as a chunk with a size of exactly one frame.
        super(FrameIterator, self).__init__(corpus_or_utt_ids, container, partition_size, 1,
                                            return_length=False, shuffle=shuffle, seed=seed)

    def __next__(self):
        """
        Return the next frame.

        Returns:
            list: The data of the frame for every container.

        Raises:
            StopIteration: If all frames were returned.
        """
        chunk = super(FrameIterator, self).__next__()

        # We have to remove the outermost dimension, which is 1 for chunk-size of 1 frame
        return [part[0] for part in chunk]


class MultiFramePartitionData:
    """
    Wrapper for PartitionData to access chunks of frames via indexes.

    The chunks of all utterances are numbered consecutively (utterance by utterance).
    Chunks never span multiple utterances, hence the last chunk of an utterance may be shorter
    than ``frames_per_chunk``.

    Args:
        partition_data (PartitionData): The loaded partition-data.
        frames_per_chunk (int): Number of subsequent frames in a chunk.
        return_length (bool): If True, the length of the chunk is returned as well. (default ``False``)
                              The length is appended to the sample as the last element.
                              (e.g. [container1-data, container2-data, length])
        pad (bool): If True, samples that are shorter are padded with zeros to match ``frames_per_chunk``.
                    If padding is enabled, the lengths are always returned (``return_length = True``).
        shuffle (bool): If True the chunks are shuffled randomly for access.
        seed (int): The seed to use for shuffling.

    Raises:
        ValueError: If ``frames_per_chunk`` is lower than 1 or
                    if an utterance has a different number of frames in different containers.
    """

    def __init__(self, partition_data, frames_per_chunk, return_length=False, pad=False, shuffle=True, seed=None):
        if frames_per_chunk < 1:
            raise ValueError('Number of frames per chunk has to higher than 0.')

        self.data = partition_data
        self.frames_per_chunk = frames_per_chunk
        self.pad = pad

        # Padded chunks are always delivered together with their real length.
        self.return_length = True if self.pad else return_length

        self.shuffle = shuffle

        self.rand = random.Random()
        self.rand.seed(a=seed)

        # Regions are used to provide indexed access across all utterances
        self.regions = self.get_utt_regions()
        self.region_offsets = [region[0] for region in self.regions]

        # Sampling maps the index used for access to the index of the chunk
        self.sampling = list(range(len(self)))

        if self.shuffle:
            self.rand.shuffle(self.sampling)

    def __len__(self):
        """ Return the number of chunks over all utterances (0 for a partition without utterances). """
        if not self.regions:
            return 0

        # The last region ends where the concatenation of all regions ends.
        last_offset, last_num_chunks, _ = self.regions[-1]
        return last_offset + last_num_chunks

    def __getitem__(self, item):
        """
        Return the chunk at the given position of the (possibly shuffled) sampling.

        Args:
            item (int): Position in the sampling.

        Returns:
            list: A float32-array for every container. If ``return_length`` is enabled,
            the number of frames in the chunk (before padding) is appended as last element.
        """
        index = self.sampling[item]

        # we search the region before the offset is higher than the index.
        region_index = bisect.bisect_right(self.region_offsets, index) - 1
        region_offset, _, refs = self.regions[region_index]

        # Convert the chunk-index within the utterance to a range of frames
        frame_offset = (index - region_offset) * self.frames_per_chunk
        frame_end = frame_offset + self.frames_per_chunk

        data = [ref[frame_offset:frame_end].astype(np.float32) for ref in refs]
        size = data[0].shape[0]

        if self.pad and size < self.frames_per_chunk:
            missing = self.frames_per_chunk - size

            # Only pad the outermost (first) dimension
            data = [
                np.pad(x, [(0, missing)] + [(0, 0)] * (x.ndim - 1), mode='constant', constant_values=0)
                for x in data
            ]

        if self.return_length:
            data.append(size)

        return data

    def get_utt_regions(self):
        """
        Return the regions of all utterances, assuming all utterances are concatenated.
        A region is defined by offset, length (number of chunks) and
        a list of references to the utterance datasets in the containers.

        Returns:
            list: List with a tuple ``(offset, num_chunks, refs)`` for every utterance.

        Raises:
            ValueError: If an utterance has not the same number of frames in all containers.
        """

        regions = []
        current_offset = 0

        for utt_idx, utt_data in zip(self.data.info.utt_ids, self.data.utt_data):
            refs = list(utt_data)
            num_frames = [part.shape[0] for part in refs]

            if len(set(num_frames)) != 1:
                raise ValueError('Utterance {} has not the same number of frames in all containers!'.format(utt_idx))

            # The last chunk of an utterance may be incomplete, hence rounding up
            num_chunks = math.ceil(num_frames[0] / float(self.frames_per_chunk))

            regions.append((current_offset, num_chunks, refs))

            # Sets the offset for the next utterances
            current_offset += num_chunks

        return regions