# Source code for audiomate.corpus.subset.splitting

import random
import collections

from audiomate.corpus.subset import subview
from audiomate.corpus.subset import utils


class Splitter(object):
    """
    A splitter provides methods for splitting a corpus into different subsets.

    The different approaches are available as methods prefixed with ``split_by_``.
    These methods mostly take a ``proportions`` parameter, which defines how big
    (in relation to each other) the subsets should be.
    The subsets are returned as :py:class:`audiomate.corpus.Subview`.

    Args:
        corpus (Corpus): The corpus that should be split.
        random_seed (int): Seed to use for random number generation.
                           Within this class the generator is only used to shuffle the
                           utterances in :py:meth:`split_by_number_of_utterances`
                           (with ``separate_issuers=False``).
    """

    def __init__(self, corpus, random_seed=None):
        self.corpus = corpus
        # Own generator instance, so seeding does not touch the global ``random`` state.
        self.rand = random.Random(random_seed)

    def split_by_length_of_utterances(self, proportions=None, separate_issuers=False):
        """
        Split the corpus into subsets where the total duration of subsets are proportional to the given proportions.
        The corpus gets split into len(proportions) parts, so the summed duration of the utterances is
        distributed according to the proportions. The number of utterances per subset therefore
        depends on the durations of the individual utterances.

        Args:
            proportions (dict): A dictionary containing the relative size of the target subsets.
                                The key is an identifier for the subset.
                                If omitted, an empty dictionary is used.
            separate_issuers (bool): If True it makes sure that all utterances of an issuer are in the same subset.

        Returns:
            (dict): A dictionary containing the subsets with the identifier from the input as key.

        Example::

            >>> spl = Splitter(corpus)
            >>> subsets = spl.split_by_length_of_utterances(proportions={
            >>>     "train" : 0.6,
            >>>     "dev" : 0.2,
            >>>     "test" : 0.2
            >>> })
            >>> print(subsets)
            {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
            'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
            'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
        """
        proportions = {} if proportions is None else proportions

        if separate_issuers:
            # Sum up the duration of all utterances per issuer
            issuer_durations = collections.defaultdict(float)
            issuer_utts = collections.defaultdict(list)

            for utterance in self.corpus.utterances.values():
                issuer_idx = utterance.issuer.idx
                issuer_durations[issuer_idx] += utterance.duration
                issuer_utts[issuer_idx].append(utterance.idx)

            # The summed duration is truncated to an integer weight.
            # NOTE(review): unlike the per-utterance branch below this is not scaled by 100,
            # so issuers with a short total duration get a coarse (possibly zero) weight.
            issuer_weights = {
                issuer_idx: {'duration': int(total)}
                for issuer_idx, total in issuer_durations.items()
            }

            return self._split_issuers_by_weights(issuer_weights, issuer_utts, proportions)

        # Scale by 100 before truncating, to keep two decimal places of the duration.
        utterance_to_duration = {
            utterance.idx: {'length': int(utterance.duration * 100)}
            for utterance in self.corpus.utterances.values()
        }

        splits = utils.get_identifiers_splitted_by_weights(utterance_to_duration, proportions=proportions)

        return self._subviews_from_utterance_splits(splits)

    def split_by_number_of_utterances(self, proportions=None, separate_issuers=False):
        """
        Split the corpus into subsets with the given number of utterances.
        The corpus gets split into len(proportions) parts, so the number of utterances are
        distributed according to the proportions.

        Args:
            proportions (dict): A dictionary containing the relative size of the target subsets.
                                The key is an identifier for the subset.
                                If omitted, an empty dictionary is used.
            separate_issuers (bool): If True it makes sure that all utterances of an issuer are in the same subset.

        Returns:
            (dict): A dictionary containing the subsets with the identifier from the input as key.

        Example::

            >>> spl = Splitter(corpus)
            >>> corpus.num_utterances
            100
            >>> subsets = spl.split_by_number_of_utterances(proportions={
            >>>     "train" : 0.6,
            >>>     "dev" : 0.2,
            >>>     "test" : 0.2
            >>> })
            >>> print(subsets)
            {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
            'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
            'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
            >>> subsets['train'].num_utterances
            60
            >>> subsets['test'].num_utterances
            20
        """
        proportions = {} if proportions is None else proportions

        if separate_issuers:
            # Collect the utterances per issuer
            issuer_utts = collections.defaultdict(list)

            for utterance in self.corpus.utterances.values():
                issuer_utts[utterance.issuer.idx].append(utterance.idx)

            # Use the number of utterances per issuer as weight
            issuer_weights = {
                issuer_idx: {'count': len(utt_idxs)}
                for issuer_idx, utt_idxs in issuer_utts.items()
            }

            return self._split_issuers_by_weights(issuer_weights, issuer_utts, proportions)

        # Sort first, so the shuffle only depends on the seed and not on the order
        # in which the corpus happens to store its utterances.
        utterance_idxs = sorted(self.corpus.utterances.keys())
        self.rand.shuffle(utterance_idxs)
        splits = utils.split_identifiers(identifiers=utterance_idxs,
                                         proportions=proportions)

        return self._subviews_from_utterance_splits(splits)

    def split_by_proportionally_distribute_labels(self, proportions=None, use_lengths=True):
        """
        Split the corpus into subsets, so the occurrence of the labels is distributed amongst the
        subsets according to the given proportions.

        Args:
            proportions (dict): A dictionary containing the relative size of the target subsets.
                                The key is an identifier for the subset.
                                If omitted, an empty dictionary is used.
            use_lengths (bool): If True the lengths of the labels are considered for splitting proportionally,
                                otherwise only the number of occurrences is taken into account.

        Returns:
            (dict): A dictionary containing the subsets with the identifier from the input as key.
        """
        proportions = {} if proportions is None else proportions

        identifiers = {}

        for utterance in self.corpus.utterances.values():
            if use_lengths:
                # Scale by 100 before truncating, to keep two decimal places of the duration.
                identifiers[utterance.idx] = {
                    label: int(duration * 100)
                    for label, duration in utterance.label_total_duration().items()
                }
            else:
                identifiers[utterance.idx] = utterance.label_count()

        splits = utils.get_identifiers_splitted_by_weights(identifiers, proportions)

        return self._subviews_from_utterance_splits(splits)

    def _split_issuers_by_weights(self, issuer_weights, issuer_utts, proportions):
        """
        Split the issuers according to their weights and create subviews
        containing all utterances of the issuers of the respective split.

        Args:
            issuer_weights (dict): Weights per issuer-id, e.g. ``{'spk-1': {'count': 3}, ...}``.
            issuer_utts (dict): Utterance-ids per issuer-id.
            proportions (dict): The relative size of the target subsets.

        Returns:
            (dict): A dictionary containing the subsets with the split identifier as key.
        """
        issuer_splits = utils.get_identifiers_splitted_by_weights(issuer_weights,
                                                                  proportions=proportions)
        splits = self._utterances_per_split(issuer_splits, issuer_utts)

        return self._subviews_from_utterance_splits(splits)

    @staticmethod
    def _utterances_per_split(issuer_splits, issuer_utts):
        """
        Replace the issuer-ids of every split with the utterance-ids of these issuers.

        Every split in ``issuer_splits`` is kept, even when no issuer was assigned to it.
        Such a split ends up with an empty list of utterances.

        e.g. ({'train': ['spk-1'], 'dev': []}, {'spk-1': ['utt-1', 'utt-2']})
             -> {'train': ['utt-1', 'utt-2'], 'dev': []}
        """
        splits = {split_idx: [] for split_idx in issuer_splits}

        for split_idx, issuer_ids in issuer_splits.items():
            for issuer_idx in issuer_ids:
                splits[split_idx].extend(issuer_utts[issuer_idx])

        return splits

    def _subviews_from_utterance_splits(self, splits):
        """
        Create subviews from a dict containing utterance-ids for each subview.

        e.g. {'train': ['utt-1', 'utt-2'], 'test': [...], ...}
        """
        subviews = {}

        for idx, subview_utterances in splits.items():
            utt_filter = subview.MatchingUtteranceIdxFilter(utterance_idxs=subview_utterances)
            subviews[idx] = subview.Subview(self.corpus, filter_criteria=utt_filter)

        return subviews