Source code for audiomate.feeding.iterator

import bisect
import math
import random

import numpy as np

import audiomate
from audiomate import containers
from . import partitioning


class DataIterator:
    """
    Abstract base class for data-iterators.

    A data-iterator provides sequential access to data. Concrete
    implementations have to override ``__iter__`` and ``__next__``; the
    versions defined here only raise ``NotImplementedError``.

    A sample returned from a data-iterator contains the data for this sample
    from every container, ordered in the way the containers were passed
    to the iterator.

    Args:
        corpus_or_utt_ids (CorpusView, list): Either a corpus (view) or a list
            of utterance-ids. For a corpus the ids of all its utterances are
            used; anything else is stored as given.
        feature_containers (list, Container): A single container or a list of
            containers. A single container is wrapped into a list.
        shuffle (bool): Indicates whether the data should be returned in
            random order (``True``) or not (``False``).
        seed (int): Seed to be used for the random number generator.

    Raises:
        ValueError: If no container is provided.
    """

    def __init__(self, corpus_or_utt_ids, feature_containers, shuffle=True, seed=None):
        from_corpus = isinstance(corpus_or_utt_ids, audiomate.corpus.CorpusView)
        self.utt_ids = list(corpus_or_utt_ids.utterances.keys()) if from_corpus else corpus_or_utt_ids

        is_single = isinstance(feature_containers, containers.Container)
        self.containers = [feature_containers] if is_single else feature_containers

        if len(self.containers) == 0:
            raise ValueError('At least one container has to be provided!')

        self.shuffle = shuffle

        # Private random generator, so the global ``random`` state is untouched.
        self.rand = random.Random()
        self.rand.seed(a=seed)

    def __iter__(self):
        raise NotImplementedError

    def __next__(self):
        raise NotImplementedError
class MultiFrameIterator(DataIterator):
    """
    A data-iterator wrapping chunks of subsequent frames of a corpus.
    A single sample represents a chunk of frames.

    Args:
        corpus_or_utt_ids (Corpus, list): Either a corpus or a list of
            utterances. This defines which utterances are considered for
            iterating.
        container (list, Container): A single container or a list of
            containers.
        partition_size (str): Size of the partitions in bytes. The units
            ``k`` (kibibytes), ``m`` (mebibytes) and ``g`` (gibibytes) are
            supported, i.e. a ``partition_size`` of ``1g`` equates
            :math:`2^{30}` bytes.
        frames_per_chunk (int): Number of subsequent frames in a single
            sample.
        return_length (bool): If True, the length of the chunk is returned
            as well. (default ``False``) The length is appended as the last
            element. (e.g. [container1-data, container2-data, length])
        pad (bool): If True, samples that are shorter are padded with zeros
            to match ``frames_per_chunk``. If padding is enabled, the lengths
            are always returned (``return_length = True``).
        shuffle (bool): Indicates whether the data should be returned in
            random order (``True``) or not (``False``).
        seed (int): Seed to be used for the random number generator.

    Note:
        For a MultiFrameIterator it is expected that every container contains
        exactly one value/vector for every frame. So the first dimension
        (outermost) of every array in every container has to match.

    Example:
        >>> corpus = audiomate.Corpus.load('/path/to/corpus')
        >>> container_inputs = containers.FeatureContainer('/path/to/features.hdf5')
        >>> container_outputs = containers.Container('/path/to/targets.hdf5')
        >>>
        >>> ds = MultiFrameIterator(corpus, [container_inputs, container_outputs], '1G', 5, shuffle=True, seed=23)
        >>> next(ds) # Next Chunk (inputs, outputs)
        [
            array([[0.72991909, 0.20258683, 0.30574747, 0.53783217],
                   [0.38875413, 0.83611128, 0.49054591, 0.15710017],
                   [0.35153358, 0.40051009, 0.93647765, 0.29589257],
                   [0.97465772, 0.80160451, 0.81871436, 0.4892925 ],
                   [0.59310933, 0.8565602 , 0.95468696, 0.07933512]]),
            array([[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [0.0, 1.0]])
        ]
    """

    def __init__(self, corpus_or_utt_ids, container, partition_size, frames_per_chunk,
                 return_length=False, pad=False, shuffle=True, seed=None):
        super().__init__(corpus_or_utt_ids, container, shuffle=shuffle, seed=seed)

        self.partition_size = partition_size
        self.frames_per_chunk = frames_per_chunk
        self.pad = pad

        # With padding the real chunk length is always reported.
        self.return_length = True if self.pad else return_length

        self.current_partition = None
        self.current_partition_index = -1
        self.current_chunk_index = 0

        # The loader gets its own seed drawn from this iterator's generator.
        self.loader = partitioning.PartitioningContainerLoader(self.utt_ids,
                                                               self.containers,
                                                               self.partition_size,
                                                               shuffle=self.shuffle,
                                                               seed=self.rand.random())

    def __iter__(self):
        """Reset the iteration state, reload the partitioning and return self."""
        self.current_partition = None
        self.current_partition_index = -1
        self.current_chunk_index = 0
        self.loader.reload()
        return self

    def __next__(self):
        """
        Return the next chunk, loading further partitions on demand.

        Raises:
            StopIteration: If all chunks of all partitions were returned.
        """
        # A loop (not a single check) is required here: a freshly loaded
        # partition may itself contain no chunk at all and has to be skipped,
        # otherwise indexing it below would fail with an IndexError.
        while self.current_partition is None or self.current_chunk_index >= len(self.current_partition):
            self.current_partition_index += 1
            self.current_chunk_index = 0

            if self.current_partition_index >= len(self.loader.partitions):
                raise StopIteration

            partition_data = self.loader.load_partition_data(self.current_partition_index)
            self.current_partition = MultiFramePartitionData(partition_data,
                                                             self.frames_per_chunk,
                                                             return_length=self.return_length,
                                                             pad=self.pad,
                                                             shuffle=self.shuffle,
                                                             seed=self.rand.random())

        next_chunk = self.current_partition[self.current_chunk_index]
        self.current_chunk_index += 1
        return next_chunk
class FrameIterator(MultiFrameIterator):
    """
    A data-iterator wrapping frames of a corpus.
    A single sample represents a single frame.

    Args:
        corpus_or_utt_ids (Corpus, list): Either a corpus or a list of
            utterances. This defines which utterances are considered for
            iterating.
        container (list, Container): A single container or a list of
            containers.
        partition_size (str): Size of the partitions in bytes. The units
            ``k`` (kibibytes), ``m`` (mebibytes) and ``g`` (gibibytes) are
            supported, i.e. a ``partition_size`` of ``1g`` equates
            :math:`2^{30}` bytes.
        shuffle (bool): Indicates whether the data should be returned in
            random order (``True``) or not (``False``).
        seed (int): Seed to be used for the random number generator.

    Note:
        For a FrameIterator it is expected that every container contains
        exactly one value/vector for every frame. So the first dimension of
        every array in every container has to match.

    Example:
        >>> corpus = audiomate.Corpus.load('/path/to/corpus')
        >>> container_inputs = containers.FeatureContainer('/path/to/features.hdf5')
        >>> container_outputs = containers.Container('/path/to/targets.hdf5')
        >>>
        >>> ds = FrameIterator(corpus, [container_inputs, container_outputs], '1G', shuffle=True, seed=23)
        >>> next(ds) # Next Frame (inputs, outputs)
        [
            array([0.58843831, 0.18128443, 0.19718328, 0.25284105]),
            array([0.0, 1.0])
        ]
    """

    def __init__(self, corpus_or_utt_ids, container, partition_size, shuffle=True, seed=None):
        # A frame is simply a chunk consisting of exactly one frame.
        super().__init__(
            corpus_or_utt_ids,
            container,
            partition_size,
            1,
            return_length=False,
            shuffle=shuffle,
            seed=seed,
        )

    def __next__(self):
        chunk = super().__next__()

        # Every entry has an outermost dimension of size 1 (chunk-size of
        # one frame), which is stripped so plain frames are returned.
        frames = []
        for entry in chunk:
            frames.append(entry[0])
        return frames
class MultiFramePartitionData:
    """
    Wrapper for PartitionData to access chunks of frames via indexes.

    Args:
        partition_data (PartitionData): The loaded partition-data.
        frames_per_chunk (int): Number of subsequent frames in a chunk.
        return_length (bool): If True, the length of the chunk is returned
            as well. (default ``False``) The length is appended as the last
            element. (e.g. [container1-data, container2-data, length])
        pad (bool): If True, samples that are shorter are padded with zeros
            to match ``frames_per_chunk``. If padding is enabled, the lengths
            are always returned (``return_length = True``).
        shuffle (bool): If True the chunks are shuffled randomly for access.
        seed (int): The seed to use for shuffling.

    Raises:
        ValueError: If ``frames_per_chunk`` is lower than 1 or if an
            utterance has a different number of frames in the containers.
    """

    def __init__(self, partition_data, frames_per_chunk, return_length=False, pad=False, shuffle=True, seed=None):
        if frames_per_chunk < 1:
            raise ValueError('Number of frames per chunk has to higher than 0.')

        self.data = partition_data
        self.frames_per_chunk = frames_per_chunk
        self.pad = pad

        # With padding the real chunk length is always reported.
        self.return_length = True if self.pad else return_length

        self.shuffle = shuffle
        self.rand = random.Random()
        self.rand.seed(a=seed)

        # Regions are used to provide indexed access across all utterances
        self.regions = self.get_utt_regions()
        self.region_offsets = [region[0] for region in self.regions]

        # Sampling maps the public index to the (possibly shuffled) chunk index
        self.sampling = list(range(len(self)))

        if self.shuffle:
            self.rand.shuffle(self.sampling)

    def __len__(self):
        """Return the total number of chunks over all utterances."""
        # A partition without any utterance has no regions and hence no chunks.
        if not self.regions:
            return 0

        last_offset, last_num_chunks, _ = self.regions[-1]
        return last_offset + last_num_chunks

    def __getitem__(self, item):
        """
        Return the chunk at the given position.

        Returns:
            list: One float32 array per container, followed by the number of
            (unpadded) frames in the chunk if ``return_length`` is set.
        """
        index = self.sampling[item]

        # Find the last region whose offset is not higher than the index.
        region_index = bisect.bisect_right(self.region_offsets, index) - 1
        region_offset, _, refs = self.regions[region_index]

        frame_offset = (index - region_offset) * self.frames_per_chunk
        frame_end = frame_offset + self.frames_per_chunk

        data = [x[frame_offset:frame_end].astype(np.float32) for x in refs]
        size = data[0].shape[0]

        if self.pad and size < self.frames_per_chunk:
            data = [self._pad_chunk(x, size) for x in data]

        if self.return_length:
            data.append(size)

        return data

    def _pad_chunk(self, chunk, size):
        """Zero-pad ``chunk`` (holding ``size`` frames) to ``frames_per_chunk`` frames."""
        # Only pad the outermost (first) dimension
        pad_widths = [(0, self.frames_per_chunk - size)] + [(0, 0)] * (len(chunk.shape) - 1)
        return np.pad(chunk, pad_widths, mode='constant', constant_values=0)

    def get_utt_regions(self):
        """
        Return the regions of all utterances, assuming all utterances are
        concatenated. A region is defined by offset (in chunks), length
        (number of chunks) and a list of references to the utterance
        datasets in the containers.

        Returns:
            list: List with a tuple for every utterance containing the
            region info.

        Raises:
            ValueError: If an utterance doesn't have the same number of
                frames in all containers.
        """
        regions = []
        current_offset = 0

        for utt_idx, utt_data in zip(self.data.info.utt_ids, self.data.utt_data):
            refs = list(utt_data)
            num_frames = [part.shape[0] for part in refs]

            if len(set(num_frames)) != 1:
                raise ValueError('Utterance {} has not the same number of frames in all containers!'.format(utt_idx))

            # The last chunk of an utterance may be shorter, hence round up.
            num_chunks = math.ceil(num_frames[0] / float(self.frames_per_chunk))

            regions.append((current_offset, num_chunks, refs))

            # Sets the offset for the next utterance
            current_offset += num_chunks

        return regions