# Source code for audiomate.corpus.subset.splitting

import random
import collections

from audiomate.corpus.subset import subview
from audiomate.corpus.subset import utils


class Splitter:
    """
    A splitter provides methods for splitting a corpus into different subsets.
    It provides different approaches for splitting the corpus
    (:meth:`split` and the methods prefixed with ``split_by_``).
    These methods take a proportions parameter,
    which defines how big (in relation to each other) the
    subsets should be. The subsets are returned
    as :py:class:`audiomate.corpus.Subview`.

    Every call of a split method draws one number from the internal random
    generator and hands it on as seed to the splitting routine in
    ``utils``. Two splitters created with the same ``random_seed`` therefore
    hand over the same sequence of seeds.

    Args:
        corpus (Corpus): The corpus that should be split.
        random_seed (int): Seed to use for random number generation.
    """

    def __init__(self, corpus, random_seed=None):
        self.corpus = corpus
        self.rand = random.Random()
        self.rand.seed(a=random_seed)

    def split(self, proportions, separate_issuers=False):
        """
        Split the corpus based on the number of utterances.
        The utterances are distributed to `len(proportions)` subsets,
        according to the ratios `proportions[subset]`.

        Args:
            proportions (dict): A dictionary containing the relative size of
                                the target subsets. The key is an identifier
                                for the subset.
            separate_issuers (bool): If True it makes sure that all utterances
                                     of an issuer are in the same subset.
                                     The issuer of every utterance is read
                                     unconditionally in that case.

        Returns:
            (dict): A dictionary containing the subsets with the identifier
                    from the input as key.

        Example::

            >>> spl = Splitter(corpus)
            >>> corpus.num_utterances
            100
            >>> subsets = spl.split(proportions={
            ...     "train" : 0.6,
            ...     "dev" : 0.2,
            ...     "test" : 0.2
            ... })
            >>> print(subsets)
            {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
            'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
            'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
            >>> subsets['train'].num_utterances
            60
            >>> subsets['dev'].num_utterances
            20
            >>> subsets['test'].num_utterances
            20
        """
        if separate_issuers:
            # Weight every issuer by the number of its utterances.
            utt_count = collections.Counter(
                utt.issuer.idx for utt in self.corpus.utterances.values()
            )
            issuer_weights = {
                issuer_idx: {'count': count}
                for issuer_idx, count in utt_count.items()
            }

            return self._split_issuers_by_weights(issuer_weights, proportions)

        # Without issuer separation every utterance counts the same, so the
        # plain identifier split is used instead of the weighted one.
        utterance_idxs = list(self.corpus.utterances.keys())
        splits = utils.split_identifiers(
            utterance_idxs,
            proportions,
            seed=self.rand.random()
        )

        return self._subviews_from_utterance_splits(splits)

    def split_by_audio_duration(self, proportions, separate_issuers=False):
        """
        Split the corpus based on the total duration of audio.
        The utterances are distributed to `len(proportions)` subsets.
        Utterances are split up in a way that each subset contains
        audio with a duration proportional to the given proportions.

        Args:
            proportions (dict): A dictionary containing the relative size of
                                the target subsets. The key is an identifier
                                for the subset.
            separate_issuers (bool): If True it makes sure that all utterances
                                     of an issuer are in the same subset.
                                     The issuer of every utterance is read
                                     unconditionally in that case.

        Returns:
            (dict): A dictionary containing the subsets with the identifier
                    from the input as key.

        Example::

            >>> spl = Splitter(corpus)
            >>> corpus.num_utterances
            100
            >>> subsets = spl.split_by_audio_duration(proportions={
            ...     "train" : 0.6,
            ...     "dev" : 0.2,
            ...     "test" : 0.2
            ... })
            >>> print(subsets)
            {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
            'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
            'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
            >>> subsets['train'].num_utterances
            55
            >>> subsets['dev'].num_utterances
            35
            >>> subsets['test'].num_utterances
            10
        """
        if separate_issuers:
            # Sum up the duration of all utterances per issuer.
            issuer_duration = collections.defaultdict(float)

            for utt in self.corpus.utterances.values():
                issuer_duration[utt.issuer.idx] += utt.duration

            # The weight is the summed duration scaled by 1000 and truncated
            # to an integer (milliseconds, assuming duration is in seconds).
            issuer_weights = {
                issuer_idx: {'duration': int(duration * 1000)}
                for issuer_idx, duration in issuer_duration.items()
            }

            return self._split_issuers_by_weights(issuer_weights, proportions)

        utterance_weights = {
            utt.idx: {'duration': int(utt.duration * 1000)}
            for utt in self.corpus.utterances.values()
        }

        return self._split_utterances_by_weights(utterance_weights, proportions)

    def split_by_label_length(self, proportions,
                              label_list_idx=None,
                              separate_issuers=False):
        """
        Split the corpus based on the total length of the label-list.
        The utterances are distributed to `len(proportions)` subsets.
        Utterances are split up in a way that each subset contains
        labels summed up to a length proportional to the given proportions.
        Length is the ``total_length`` reported by the label-lists
        (number of characters).

        Args:
            proportions (dict): A dictionary containing the relative size
                                of the target subsets.
                                The key is an identifier for the subset.
            label_list_idx (str): The idx of the label-list to use for
                                  computing the length. If `None` the lengths
                                  of all label-lists are summed.
            separate_issuers (bool): If True it makes sure that all utterances
                                     of an issuer are in the same subset.
                                     The issuer of every utterance is read
                                     unconditionally in that case.

        Returns:
            (dict): A dictionary containing the subsets with the identifier
                    from the input as key.

        Example::

            >>> spl = Splitter(corpus)
            >>> corpus.num_utterances
            100
            >>> subsets = spl.split_by_label_length(proportions={
            ...     "train" : 0.6,
            ...     "dev" : 0.2,
            ...     "test" : 0.2
            ... })
            >>> print(subsets)
            {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
            'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
            'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
            >>> subsets['train'].num_utterances
            55
            >>> subsets['dev'].num_utterances
            35
            >>> subsets['test'].num_utterances
            10
        """
        if separate_issuers:
            # Sum up the label length of all utterances per issuer.
            issuer_length = collections.defaultdict(int)

            for utt in self.corpus.utterances.values():
                issuer_length[utt.issuer.idx] += self._label_list_length(
                    utt, label_list_idx
                )

            issuer_weights = {
                issuer_idx: {'length': length}
                for issuer_idx, length in issuer_length.items()
            }

            return self._split_issuers_by_weights(issuer_weights, proportions)

        utterance_weights = {
            utt.idx: {'length': self._label_list_length(utt, label_list_idx)}
            for utt in self.corpus.utterances.values()
        }

        return self._split_utterances_by_weights(utterance_weights, proportions)

    def split_by_label_occurence(self, proportions, separate_issuers=False):
        """
        Split the corpus based on the total number of occurrences of labels.
        The utterances are distributed to `len(proportions)` subsets.
        Utterances are split up in a way that each subset contains
        label-occurrences proportional to the given proportions.

        Args:
            proportions (dict): A dictionary containing the relative size
                                of the target subsets.
                                The key is an identifier for the subset.
            separate_issuers (bool): If True it makes sure that all utterances
                                     of an issuer are in the same subset.
                                     The issuer of every utterance is read
                                     unconditionally in that case.

        Returns:
            (dict): A dictionary containing the subsets with the identifier
                    from the input as key.

        Example::

            >>> spl = Splitter(corpus)
            >>> corpus.num_utterances
            100
            >>> subsets = spl.split_by_label_occurence(proportions={
            ...     "train" : 0.6,
            ...     "dev" : 0.2,
            ...     "test" : 0.2
            ... })
            >>> print(subsets)
            {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
            'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
            'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
            >>> subsets['train'].num_utterances
            55
            >>> subsets['dev'].num_utterances
            35
            >>> subsets['test'].num_utterances
            10
        """
        if separate_issuers:
            # Accumulate the per-label occurrence counts of all utterances
            # per issuer. Every label is its own weight category.
            issuer_weights = collections.defaultdict(collections.Counter)

            for utt in self.corpus.utterances.values():
                issuer_weights[utt.issuer.idx].update(utt.label_count())

            return self._split_issuers_by_weights(issuer_weights, proportions)

        utterance_weights = {
            utt.idx: dict(utt.label_count())
            for utt in self.corpus.utterances.values()
        }

        return self._split_utterances_by_weights(utterance_weights, proportions)

    def split_by_label_duration(self, proportions, separate_issuers=False):
        """
        Split the corpus based on the total duration of labels (end - start).
        The utterances are distributed to `len(proportions)` subsets.
        Utterances are split up in a way that each subset contains
        labels with a duration proportional to the given proportions.

        Args:
            proportions (dict): A dictionary containing the relative size of
                                the target subsets. The key is an identifier
                                for the subset.
            separate_issuers (bool): If True it makes sure that all utterances
                                     of an issuer are in the same subset.
                                     The issuer of every utterance is read
                                     unconditionally in that case.

        Returns:
            (dict): A dictionary containing the subsets with the identifier
                    from the input as key.

        Example::

            >>> spl = Splitter(corpus)
            >>> corpus.num_utterances
            100
            >>> subsets = spl.split_by_label_duration(proportions={
            ...     "train" : 0.6,
            ...     "dev" : 0.2,
            ...     "test" : 0.2
            ... })
            >>> print(subsets)
            {'dev': <audiomate.corpus.subview.Subview at 0x104ce7400>,
            'test': <audiomate.corpus.subview.Subview at 0x104ce74e0>,
            'train': <audiomate.corpus.subview.Subview at 0x104ce7438>}
            >>> subsets['train'].num_utterances
            55
            >>> subsets['dev'].num_utterances
            35
            >>> subsets['test'].num_utterances
            10
        """
        if separate_issuers:
            # Accumulate the per-label durations of all utterances per
            # issuer. Every label is its own weight category.
            issuer_weights = collections.defaultdict(collections.Counter)

            for utt in self.corpus.utterances.values():
                issuer_weights[utt.issuer.idx].update(
                    utt.label_total_duration()
                )

            return self._split_issuers_by_weights(issuer_weights, proportions)

        utterance_weights = {
            utt.idx: dict(utt.label_total_duration())
            for utt in self.corpus.utterances.values()
        }

        return self._split_utterances_by_weights(utterance_weights, proportions)

    @staticmethod
    def _label_list_length(utterance, label_list_idx=None):
        """
        Return the label length of a single utterance.

        If ``label_list_idx`` is `None`, the ``total_length`` of all
        label-lists of the utterance is summed up. Otherwise only the
        label-list with the given idx is considered (a missing idx raises
        whatever the ``label_lists`` lookup raises).
        """
        label_lists = utterance.label_lists

        if label_list_idx is None:
            return sum(ll.total_length for ll in label_lists.values())

        return label_lists[label_list_idx].total_length

    def _utterances_by_issuer(self):
        """
        Group the utterance-ids of the corpus by the idx of their issuer.

        Returns:
            (dict): Issuer-idx mapped to the list of utterance-ids of that
                    issuer, in iteration order of the corpus utterances.
        """
        issuer_utts = collections.defaultdict(list)

        for utt in self.corpus.utterances.values():
            issuer_utts[utt.issuer.idx].append(utt.idx)

        return issuer_utts

    def _split_utterances_by_weights(self, utterance_weights, proportions):
        """
        Split utterances using per-utterance weights and build the subviews.

        Args:
            utterance_weights (dict): Utterance-idx mapped to a dict of
                                      weights per category.
            proportions (dict): Relative size per subset identifier.

        Returns:
            (dict): Subset identifier mapped to the created subview.
        """
        splits = utils.get_identifiers_splitted_by_weights(
            utterance_weights,
            proportions,
            seed=self.rand.random()
        )

        return self._subviews_from_utterance_splits(splits)

    def _split_issuers_by_weights(self, issuer_weights, proportions):
        """
        Split issuers using per-issuer weights and build the subviews
        containing all utterances of the issuers assigned to a subset.

        Args:
            issuer_weights (dict): Issuer-idx mapped to a dict of
                                   weights per category.
            proportions (dict): Relative size per subset identifier.

        Returns:
            (dict): Subset identifier mapped to the created subview.
        """
        issuer_splits = utils.get_identifiers_splitted_by_weights(
            issuer_weights,
            proportions,
            seed=self.rand.random()
        )

        return self._subviews_from_issuer_splits(
            issuer_splits,
            self._utterances_by_issuer()
        )

    def _subviews_from_utterance_splits(self, splits):
        """
        Create subviews from a dict containing utterance-ids for each subview.

        e.g. {'train': ['utt-1', 'utt-2'], 'test': [...], ...}
        """
        subviews = {}

        for idx, subview_utterances in splits.items():
            utt_filter = subview.MatchingUtteranceIdxFilter(
                utterance_idxs=subview_utterances
            )
            subviews[idx] = subview.Subview(
                self.corpus,
                filter_criteria=utt_filter
            )

        return subviews

    @staticmethod
    def _utterances_per_split(splits, issuer_to_utt):
        """
        Replace the issuer-ids of every split by the utterance-ids of
        these issuers.

        Every key of ``splits`` is kept, so a split without any issuer
        results in an empty list of utterance-ids instead of being dropped.
        """
        utt_splits = {}

        for split_idx, issuer_ids in splits.items():
            utt_idxs = []

            for issuer_idx in issuer_ids:
                utt_idxs.extend(issuer_to_utt[issuer_idx])

            utt_splits[split_idx] = utt_idxs

        return utt_splits

    def _subviews_from_issuer_splits(self, splits, issuer_to_utt):
        """
        Create subviews from a dict containing issuer-ids for each subview
        and a map from issuer-ids to utterance-ids.

        e.g. {'train': ['issuer-1', 'issuer-2'], 'test': [...], ...}
        """
        utt_splits = self._utterances_per_split(splits, issuer_to_utt)

        return self._subviews_from_utterance_splits(utt_splits)