# Source code for audiomate.corpus.subset.selection

import collections
import random

from . import subview
from . import utils


class SubsetGenerator:
    """
    This class is used to generate subsets of a corpus.

    Args:
        corpus (Corpus): The corpus to create subsets from.
        random_seed (int): Seed to use for random number generation.
                           With ``None`` the generator is not reproducible.
    """

    def __init__(self, corpus, random_seed=None):
        self.corpus = corpus
        self.random_seed = random_seed
        self.rand = random.Random()
        self.rand.seed(a=random_seed)

    def random_subset(self, relative_size, balance_labels=False, label_list_ids=None):
        """
        Create a subview of random utterances with an approximate size relative to the full corpus.
        By default x random utterances are selected with x equal to ``relative_size * corpus.num_utterances``.

        Args:
            relative_size (float): A value between 0 and 1.
                                   (0.5 will create a subset with approximately 50% of the full corpus size)
            balance_labels (bool): If True, the labels of the selected utterances are balanced as far as possible.
                                   So the count/duration of every label within the subset is equal.
            label_list_ids (list): List of label-list ids. If none is given, all label-lists are considered
                                   for balancing. Otherwise only the ones that are in the list are considered.

        Returns:
            Subview: The subview representing the subset.
        """
        num_utterances_in_subset = round(relative_size * self.corpus.num_utterances)

        if balance_labels:
            all_label_values = self.corpus.all_label_values(label_list_ids=label_list_ids)
            utterance_with_label_counts = collections.defaultdict(dict)

            for utterance_idx, utterance in self.corpus.utterances.items():
                utterance_with_label_counts[utterance_idx] = utterance.label_count(label_list_ids=label_list_ids)

            subset_utterance_ids = utils.select_balanced_subset(utterance_with_label_counts,
                                                                num_utterances_in_subset,
                                                                list(all_label_values),
                                                                seed=self.rand.random())
        else:
            # Sample from a sorted list so that a fixed seed always yields the same ids,
            # independent of the iteration order of the utterance container.
            all_utterance_ids = sorted(self.corpus.utterances.keys())
            subset_utterance_ids = self.rand.sample(all_utterance_ids,
                                                    num_utterances_in_subset)

        return self._subview_from_ids(subset_utterance_ids)

    def random_subset_by_duration(self, relative_duration, balance_labels=False, label_list_ids=None):
        """
        Create a subview of random utterances with an approximate duration relative to the full corpus.
        Random utterances are selected so that the sum of all utterance durations
        approximately equals the relative duration of the full corpus.

        Args:
            relative_duration (float): A value between 0 and 1. (e.g. 0.5 will create a subset with approximately
                                       50% of the full corpus duration)
            balance_labels (bool): If True, the labels of the selected utterances are balanced as far as possible.
                                   So the count/duration of every label within the subset is equal.
            label_list_ids (list): List of label-list ids. If none is given, all label-lists are considered
                                   for balancing. Otherwise only the ones that are in the list are considered.

        Returns:
            Subview: The subview representing the subset.
        """
        total_duration = self.corpus.total_duration
        subset_duration = relative_duration * total_duration
        utterance_durations = {utt_idx: utt.duration for utt_idx, utt in self.corpus.utterances.items()}

        if balance_labels:
            all_label_values = self.corpus.all_label_values(label_list_ids=label_list_ids)
            label_durations = {utt_idx: utt.label_total_duration(label_list_ids)
                               for utt_idx, utt in self.corpus.utterances.items()}

            subset_utterance_ids = utils.select_balanced_subset(label_durations,
                                                                subset_duration,
                                                                list(all_label_values),
                                                                select_count_values=utterance_durations,
                                                                seed=self.rand.random())
        else:
            # No balancing requested: give every utterance the same single dummy "label"
            # so the balanced selection only has to satisfy the duration target.
            dummy_weights = {utt_idx: {'w': 1} for utt_idx in self.corpus.utterances.keys()}
            subset_utterance_ids = utils.select_balanced_subset(dummy_weights,
                                                                subset_duration,
                                                                ['w'],
                                                                select_count_values=utterance_durations,
                                                                seed=self.rand.random())

        return self._subview_from_ids(subset_utterance_ids)

    def random_subsets(self, relative_sizes, by_duration=False, balance_labels=False, label_list_ids=None):
        """
        Create a bunch of subsets with the given sizes relative to the size or duration of the full corpus.
        Basically the same as calling ``random_subset`` or ``random_subset_by_duration`` multiple times
        with different values. But this method makes sure that every subset contains only utterances,
        that are also contained in the next bigger subset.

        The sizes are processed from biggest to smallest (in any input order) and every subset is drawn
        from the previously selected, next bigger one. The requested fraction is rescaled accordingly,
        so every size stays relative to the full corpus.

        Args:
            relative_sizes (list): A list of numbers between 0 and 1 indicating the sizes of the desired subsets,
                                   relative to the full corpus.
            by_duration (bool): If True the size measure is the duration of all utterances in a subset/corpus.
            balance_labels (bool): If True the labels contained in a subset are chosen to be balanced
                                   as far as possible.
            label_list_ids (list): List of label-list ids. If none is given, all label-lists are considered
                                   for balancing. Otherwise only the ones that are in the list are considered.

        Returns:
            dict : A dictionary containing all subsets with the relative size as key.
        """
        resulting_sets = {}
        next_bigger_subset = self.corpus
        full_size = None  # Size of the full corpus, computed lazily on first need.

        for relative_size in sorted(set(relative_sizes), reverse=True):
            drawn_from_full_corpus = next_bigger_subset is self.corpus

            if drawn_from_full_corpus:
                size_in_parent = relative_size
            else:
                if full_size is None:
                    full_size = self._size_of(self.corpus, by_duration)

                target_size = relative_size * full_size
                if not by_duration:
                    # Same rounding as ``random_subset`` applies on the full corpus.
                    target_size = round(target_size)

                # Express the target relative to the parent subset, which is what the
                # single-subset methods expect. Never request more than the parent holds.
                parent_size = self._size_of(next_bigger_subset, by_duration)
                size_in_parent = min(1.0, target_size / parent_size) if parent_size > 0 else 0.0

            # NOTE(review): drawing from a subview assumes it offers the same read interface
            # as the corpus (``utterances``, ``num_utterances``, ``total_duration``, ...) - confirm.
            generator = SubsetGenerator(next_bigger_subset, random_seed=self.random_seed)

            if by_duration:
                sv = generator.random_subset_by_duration(size_in_parent, balance_labels=balance_labels,
                                                         label_list_ids=label_list_ids)
            else:
                sv = generator.random_subset(size_in_parent, balance_labels=balance_labels,
                                             label_list_ids=label_list_ids)

            if not drawn_from_full_corpus:
                # Re-anchor on the original corpus, so all returned subviews refer to the
                # corpus directly instead of forming a chain of nested subviews.
                sv = self._subview_from_ids(sv.utterances.keys())

            resulting_sets[relative_size] = sv
            next_bigger_subset = sv

        return resulting_sets

    def maximal_balanced_subset(self, by_duration=False, label_list_ids=None):
        """
        Create a subset of the corpus as big as possible, so that the labels are balanced approximately.
        The label with the shortest duration (or with the fewest utterances if by_duration=False) is taken
        as reference. All other labels are selected so they match the shortest one as far as possible.

        Args:
            by_duration (bool): If True the size measure is the duration of all utterances in a subset/corpus.
            label_list_ids (list): List of label-list ids. If none is given, all label-lists are considered
                                   for balancing. Otherwise only the ones that are in the list are considered.

        Returns:
            Subview: The subview representing the subset.
        """
        all_label_values = self.corpus.all_label_values(label_list_ids=label_list_ids)

        if by_duration:
            utterance_durations = {utt_idx: utt.duration for utt_idx, utt in self.corpus.utterances.items()}
            total_duration_per_label = self.corpus.label_durations(label_list_ids=label_list_ids)
            # The rarest label limits how much of every other label can be used.
            rarest_label_duration = min(total_duration_per_label.values())
            target_duration = len(all_label_values) * rarest_label_duration

            label_durations_per_utterance = {utt_idx: utt.label_total_duration(label_list_ids)
                                             for utt_idx, utt in self.corpus.utterances.items()}

            subset_utterance_ids = utils.select_balanced_subset(label_durations_per_utterance,
                                                                target_duration,
                                                                list(all_label_values),
                                                                select_count_values=utterance_durations,
                                                                seed=self.rand.random())
        else:
            total_count_per_label = self.corpus.label_count(label_list_ids=label_list_ids)
            # The rarest label limits how much of every other label can be used.
            lowest_label_count = min(total_count_per_label.values())
            target_label_count = lowest_label_count * len(all_label_values)
            utterance_with_label_counts = collections.defaultdict(dict)

            for utterance_idx, utterance in self.corpus.utterances.items():
                utterance_with_label_counts[utterance_idx] = utterance.label_count(label_list_ids=label_list_ids)

            subset_utterance_ids = utils.select_balanced_subset(utterance_with_label_counts,
                                                                target_label_count,
                                                                list(all_label_values),
                                                                seed=self.rand.random())

        return self._subview_from_ids(subset_utterance_ids)

    def _subview_from_ids(self, utterance_ids):
        """Return a subview of this generator's corpus restricted to the given utterance ids."""
        utt_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs=set(utterance_ids))
        return subview.Subview(self.corpus, filter_criteria=[utt_filter])

    @staticmethod
    def _size_of(view, by_duration):
        """Return the total duration (``by_duration=True``) or the number of utterances of ``view``."""
        return view.total_duration if by_duration else view.num_utterances