Source code for audiomate.corpus.subset.selection

import collections
import random

from . import subview
from . import utils


class SubsetGenerator:
    """
    This class is used to generate subsets of a corpus.

    Args:
        corpus (Corpus): The corpus to create subsets from.
        random_seed (int): Seed to use for random number generation.
    """

    def __init__(self, corpus, random_seed=None):
        self.corpus = corpus
        # Kept so that ``random_subsets`` can seed the generators it creates.
        self.random_seed = random_seed

        # A dedicated generator: selections never touch the global ``random`` state.
        self.rand = random.Random()
        self.rand.seed(a=random_seed)

    def random_subset(self, relative_size, balance_labels=False, label_list_ids=None):
        """
        Create a subview of random utterances with an approximate size relative to the full corpus.
        By default x random utterances are selected with x equal to
        ``round(relative_size * corpus.num_utterances)``.

        Args:
            relative_size (float): A value between 0 and 1.
                                   (0.5 will create a subset with approximately 50% of the full corpus size)
            balance_labels (bool): If True, the labels of the selected utterances are balanced as far as possible.
                                   So the count/duration of every label within the subset is equal.
            label_list_ids (list): List of label-list ids. If none is given, all label-lists are considered
                                   for balancing. Otherwise only the ones that are in the list are considered.

        Returns:
            Subview: The subview representing the subset.
        """
        num_utterances_in_subset = round(relative_size * self.corpus.num_utterances)

        if balance_labels:
            all_label_values = self.corpus.all_label_values(label_list_ids=label_list_ids)
            utterance_with_label_counts = self._label_counts_per_utterance(label_list_ids)

            subset_utterance_ids = utils.select_balanced_subset(utterance_with_label_counts,
                                                                num_utterances_in_subset,
                                                                list(all_label_values),
                                                                seed=self.rand.random())
        else:
            # Sort the ids so the sample does not depend on the iteration order
            # of the utterance mapping.
            all_utterance_ids = sorted(self.corpus.utterances.keys())
            subset_utterance_ids = self.rand.sample(all_utterance_ids, num_utterances_in_subset)

        return self._build_subview(subset_utterance_ids)

    def random_subset_by_duration(self, relative_duration, balance_labels=False, label_list_ids=None):
        """
        Create a subview of random utterances with an approximate duration relative to the full corpus.
        Random utterances are selected so that the sum of all utterance durations
        approximately equals the relative duration of the full corpus.

        Args:
            relative_duration (float): A value between 0 and 1.
                                       (e.g. 0.5 will create a subset with approximately 50% of the full corpus
                                       duration)
            balance_labels (bool): If True, the labels of the selected utterances are balanced as far as possible.
                                   So the count/duration of every label within the subset is equal.
            label_list_ids (list): List of label-list ids. If none is given, all label-lists are considered
                                   for balancing. Otherwise only the ones that are in the list are considered.

        Returns:
            Subview: The subview representing the subset.
        """
        total_duration = self.corpus.total_duration
        subset_duration = relative_duration * total_duration
        utterance_durations = self._utterance_durations()

        if balance_labels:
            all_label_values = self.corpus.all_label_values(label_list_ids=label_list_ids)
            label_durations = self._label_durations_per_utterance(label_list_ids)

            subset_utterance_ids = utils.select_balanced_subset(label_durations,
                                                                subset_duration,
                                                                list(all_label_values),
                                                                select_count_values=utterance_durations,
                                                                seed=self.rand.random())
        else:
            # Without balancing, every utterance gets the same weight in a single
            # dummy category, so the same selection routine can be reused.
            dummy_weights = {utt_idx: {'w': 1} for utt_idx in self.corpus.utterances.keys()}

            subset_utterance_ids = utils.select_balanced_subset(dummy_weights,
                                                                subset_duration,
                                                                ['w'],
                                                                select_count_values=utterance_durations,
                                                                seed=self.rand.random())

        return self._build_subview(subset_utterance_ids)

    def random_subsets(self, relative_sizes, by_duration=False, balance_labels=False, label_list_ids=None):
        """
        Create a bunch of subsets with the given sizes relative to the size or duration of the full corpus.
        Basically the same as calling ``random_subset`` or ``random_subset_by_duration`` multiple times
        with different values. But this method makes sure that every subset contains only utterances,
        that are also contained in the next bigger subset.

        The sizes are processed from largest to smallest, regardless of the order they are passed in.
        The largest subset is drawn from the full corpus; every following subset is drawn from the
        subset created just before it, with its size rescaled so that it still refers to the full corpus.

        NOTE(review): drawing from a previous subset assumes that a ``Subview`` offers the same
        read interface as the corpus (``utterances``, ``num_utterances``, ``total_duration`` and the
        label queries used by the selection methods) -- confirm against ``subview.Subview``.

        Args:
            relative_sizes (list): A list of numbers between 0 and 1 indicating the sizes of the desired subsets,
                                   relative to the full corpus.
            by_duration (bool): If True the size measure is the duration of all utterances in a subset/corpus.
            balance_labels (bool): If True the labels contained in a subset are chosen to be balanced
                                   as far as possible.
            label_list_ids (list): List of label-list ids. If none is given, all label-lists are considered
                                   for balancing. Otherwise only the ones that are in the list are considered.

        Returns:
            dict : A dictionary containing all subsets with the relative size as key.
        """
        resulting_sets = {}
        next_bigger_subset = self.corpus

        for relative_size in sorted(relative_sizes, reverse=True):
            size_in_parent = self._size_relative_to(next_bigger_subset, relative_size, by_duration)
            generator = SubsetGenerator(next_bigger_subset, random_seed=self.random_seed)

            if by_duration:
                sv = generator.random_subset_by_duration(size_in_parent,
                                                         balance_labels=balance_labels,
                                                         label_list_ids=label_list_ids)
            else:
                sv = generator.random_subset(size_in_parent,
                                             balance_labels=balance_labels,
                                             label_list_ids=label_list_ids)

            if next_bigger_subset is not self.corpus:
                # ``sv`` is a view on the previous subset. Re-root it so that every
                # returned subview refers directly to the original corpus.
                sv = self._build_subview(sv.utterances.keys())

            resulting_sets[relative_size] = sv
            next_bigger_subset = sv

        return resulting_sets

    def maximal_balanced_subset(self, by_duration=False, label_list_ids=None):
        """
        Create a subset of the corpus as big as possible, so that the labels are balanced approximately.
        The label with the shortest duration (or with the fewest utterances if by_duration=False)
        is taken as reference. All other labels are selected so they match the shortest one as far as possible.

        Args:
            by_duration (bool): If True the size measure is the duration of all utterances in a subset/corpus.
            label_list_ids (list): List of label-list ids. If none is given, all label-lists are considered
                                   for balancing. Otherwise only the ones that are in the list are considered.

        Returns:
            Subview: The subview representing the subset.
        """
        all_label_values = self.corpus.all_label_values(label_list_ids=label_list_ids)

        if by_duration:
            utterance_durations = self._utterance_durations()
            total_duration_per_label = self.corpus.label_durations(label_list_ids=label_list_ids)

            # Target: every label value contributes as much as the rarest one.
            rarest_label_duration = sorted(total_duration_per_label.values())[0]
            target_duration = len(all_label_values) * rarest_label_duration

            label_durations_per_utterance = self._label_durations_per_utterance(label_list_ids)

            subset_utterance_ids = utils.select_balanced_subset(label_durations_per_utterance,
                                                                target_duration,
                                                                list(all_label_values),
                                                                select_count_values=utterance_durations,
                                                                seed=self.rand.random())
        else:
            total_count_per_label = self.corpus.label_count(label_list_ids=label_list_ids)

            # Target: every label value occurs as often as the rarest one.
            lowest_label_count = sorted(total_count_per_label.values())[0]
            target_label_count = lowest_label_count * len(all_label_values)

            utterance_with_label_counts = self._label_counts_per_utterance(label_list_ids)

            subset_utterance_ids = utils.select_balanced_subset(utterance_with_label_counts,
                                                                target_label_count,
                                                                list(all_label_values),
                                                                seed=self.rand.random())

        return self._build_subview(subset_utterance_ids)

    def _build_subview(self, utterance_ids):
        """ Return a subview of ``self.corpus`` restricted to the given utterance ids. """
        utt_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs=set(utterance_ids))
        return subview.Subview(self.corpus, filter_criteria=[utt_filter])

    def _utterance_durations(self):
        """ Return a dict mapping every utterance id of the corpus to its duration. """
        return {utt_idx: utt.duration for utt_idx, utt in self.corpus.utterances.items()}

    def _label_counts_per_utterance(self, label_list_ids):
        """ Return a mapping from utterance id to the result of the utterance's ``label_count``. """
        # Stays a ``defaultdict(dict)`` because that is the type the selection
        # routine has always been handed for label counts.
        counts = collections.defaultdict(dict)

        for utterance_idx, utterance in self.corpus.utterances.items():
            counts[utterance_idx] = utterance.label_count(label_list_ids=label_list_ids)

        return counts

    def _label_durations_per_utterance(self, label_list_ids):
        """ Return a dict from utterance id to the result of the utterance's ``label_total_duration``. """
        return {utt_idx: utt.label_total_duration(label_list_ids)
                for utt_idx, utt in self.corpus.utterances.items()}

    def _size_relative_to(self, parent, relative_size, by_duration):
        """
        Convert ``relative_size`` (relative to ``self.corpus``) into a size relative to ``parent``.

        The measure is the total duration if ``by_duration`` is True, the number of utterances otherwise.
        """
        if parent is self.corpus:
            # Nothing to rescale; pass the value through untouched.
            return relative_size

        if by_duration:
            full_measure = self.corpus.total_duration
            parent_measure = parent.total_duration
        else:
            full_measure = self.corpus.num_utterances
            parent_measure = parent.num_utterances

        if parent_measure <= 0:
            # An empty parent has nothing left to select from.
            return 0.0

        # A subset can never be bigger than the set it is drawn from.
        return min(1.0, relative_size * full_measure / parent_measure)