Source code for audiomate.corpus.subset.selection

import collections
import random

from . import subview
from . import utils


class SubsetGenerator(object):
    """
    This class is used to generate subsets of a corpus.

    Args:
        corpus (Corpus): The corpus to create subsets from.
        random_seed (int): Seed to use for random number generation.
                           If None, the random number generator is seeded
                           by ``random.Random`` itself, hence results are
                           not reproducible.
    """

    def __init__(self, corpus, random_seed=None):
        self.corpus = corpus
        self.random_seed = random_seed

        self.rand = random.Random()
        self.rand.seed(a=random_seed)

    def random_subset(self, relative_size, balance_labels=False):
        """
        Create a subview of random utterances with a approximate size relative to the full corpus.
        By default x random utterances are selected with x equal to
        ``relative_size * corpus.num_utterances``.

        Args:
            relative_size (float): A value between 0 and 1.
                                   (0.5 will create a subset with approximately 50% of the full corpus size)
            balance_labels (bool): If True, the labels of the selected utterances are balanced
                                   as far as possible.
                                   So the count/duration of every label within the subset is equal.

        Returns:
            Subview: The subview representing the subset.
        """
        num_utterances_in_subset = round(relative_size * self.corpus.num_utterances)

        if balance_labels:
            all_label_values = self.corpus.all_label_values()
            utterance_with_label_counts = collections.defaultdict(dict)

            for utterance_idx, utterance in self.corpus.utterances.items():
                utterance_with_label_counts[utterance_idx] = utterance.label_count()

            subset_utterance_ids = utils.select_balanced_subset(utterance_with_label_counts,
                                                                num_utterances_in_subset,
                                                                list(all_label_values),
                                                                seed=self.rand.random())
        else:
            # Sample from a sorted list so that a fixed seed always yields
            # the same selection, independent of the container's ordering.
            all_utterance_ids = sorted(self.corpus.utterances.keys())
            subset_utterance_ids = self.rand.sample(all_utterance_ids,
                                                    num_utterances_in_subset)

        utterance_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs=set(subset_utterance_ids))
        return subview.Subview(self.corpus, filter_criteria=[utterance_filter])

    def random_subset_by_duration(self, relative_duration, balance_labels=False):
        """
        Create a subview of random utterances with a approximate duration relative to the full corpus.
        Random utterances are selected so that the sum of all utterance durations
        equals to the relative duration of the full corpus.

        Args:
            relative_duration (float): A value between 0 and 1.
                                       (e.g. 0.5 will create a subset with approximately 50% of
                                       the full corpus duration)
            balance_labels (bool): If True, the labels of the selected utterances are balanced
                                   as far as possible.
                                   So the count/duration of every label within the subset is equal.

        Returns:
            Subview: The subview representing the subset.
        """
        total_duration = self.corpus.total_duration
        subset_duration = relative_duration * total_duration
        utterance_durations = {utt_idx: utt.duration
                               for utt_idx, utt in self.corpus.utterances.items()}

        if balance_labels:
            all_label_values = self.corpus.all_label_values()
            label_durations = {utt_idx: utt.label_total_duration()
                               for utt_idx, utt in self.corpus.utterances.items()}

            subset_utterance_ids = utils.select_balanced_subset(label_durations,
                                                                subset_duration,
                                                                list(all_label_values),
                                                                select_count_values=utterance_durations,
                                                                seed=self.rand.random())
        else:
            # Without balancing every utterance gets the same single dummy
            # label, so only the utterance durations drive the selection.
            dummy_weights = {utt_idx: {'w': 1} for utt_idx in self.corpus.utterances.keys()}

            subset_utterance_ids = utils.select_balanced_subset(dummy_weights,
                                                                subset_duration,
                                                                ['w'],
                                                                select_count_values=utterance_durations,
                                                                seed=self.rand.random())

        utterance_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs=set(subset_utterance_ids))
        return subview.Subview(self.corpus, filter_criteria=[utterance_filter])

    def random_subsets(self, relative_sizes, by_duration=False, balance_labels=False):
        """
        Create a bunch of subsets with the given sizes relative to the size or duration of the full corpus.
        Basically the same as calling ``random_subset`` or ``random_subset_by_duration`` multiple times
        with different values. But this method makes sure that every subset contains only utterances,
        that are also contained in the next bigger subset.

        The sizes are processed from biggest to smallest, independent of the order
        they are passed in. Every subset is drawn from the previously created
        (next bigger) subset.

        Args:
            relative_sizes (list): A list of numbers between 0 and 1 indicating the sizes of the desired
                                   subsets, relative to the full corpus.
            by_duration (bool): If True the size measure is the duration of all utterances in a subset/corpus.
            balance_labels (bool): If True the labels contained in a subset are chosen to be balanced
                                   as far as possible.

        Returns:
            dict : A dictionary containing all subsets with the relative size as key.
        """
        resulting_sets = {}
        next_bigger_subset = self.corpus

        for relative_size in sorted(relative_sizes, reverse=True):
            if next_bigger_subset is self.corpus:
                # The biggest subset is drawn directly from the full corpus.
                fraction = relative_size
            else:
                # Smaller subsets are drawn from the previous subset. The
                # fraction is rescaled so the resulting size is still relative
                # to the full corpus.
                # NOTE(review): assumes Subview exposes the same interface as
                # the corpus (num_utterances / total_duration) - confirm.
                if by_duration:
                    full_measure = self.corpus.total_duration
                    parent_measure = next_bigger_subset.total_duration
                else:
                    full_measure = self.corpus.num_utterances
                    parent_measure = next_bigger_subset.num_utterances

                if parent_measure > 0:
                    # Never request more than the parent subset contains.
                    fraction = min(1.0, relative_size * full_measure / parent_measure)
                else:
                    fraction = 0.0

            generator = SubsetGenerator(next_bigger_subset, random_seed=self.random_seed)

            if by_duration:
                sv = generator.random_subset_by_duration(fraction, balance_labels=balance_labels)
            else:
                sv = generator.random_subset(fraction, balance_labels=balance_labels)

            resulting_sets[relative_size] = sv
            next_bigger_subset = sv

        return resulting_sets