Source code for audiomate.corpus.conversion.base

import abc
import copy
import os

import audiomate
from audiomate import tracks
from audiomate import logutil

# Module-level logger; ``AudioFileConverter.convert`` wraps its utterance loop
# in ``logger.progress`` to report progress.
logger = logutil.getLogger()


class AudioFileConverter(metaclass=abc.ABCMeta):
    """
    Base class for converters that convert all audio to a specific format.
    A converter creates a new instance of a corpus,
    so that all audio files meet given requirements.

    Concrete converters have to implement ``_file_extension``,
    ``_does_utt_match_target_format`` and ``_convert_files``.

    Args:
        sampling_rate (int): Target sampling rate to convert audio to.
        separate_file_per_utterance (bool): If ``True``, every utterance in the
                                       resulting corpus is in a separate file.
                                       If ``False``, the file/utt structure will
                                       be preserved.
        force_conversion (bool): If ``True``, all utterances will be converted
                                 whether or not they already match the target
                                 format. If ``False``, only utterances not
                                 matching the target format will be converted.
                                 Others are references to the original files.
    """

    def __init__(self, sampling_rate=16000, separate_file_per_utterance=False,
                 force_conversion=False):
        self.sampling_rate = sampling_rate
        self.separate_file_per_utterance = separate_file_per_utterance
        self.force_conversion = force_conversion

    def convert(self, corpus, target_audio_path):
        """
        Convert the given corpus.

        Every utterance of ``corpus`` is added to a new corpus. Utterances
        that need conversion are attached to a new file below
        ``target_audio_path``; all other utterances (and their tracks) are
        imported unchanged. The actual audio conversion is delegated to
        ``_convert_files`` once all utterances have been processed.

        Args:
            corpus (Corpus): The input corpus.
            target_audio_path (str): The path where the audio files of the
                                     converted corpus should be saved.

        Returns:
            Corpus: The newly created corpus.
        """

        out_corpus = audiomate.Corpus()

        # Collected as (source path, start, end, target path) tuples.
        files_to_convert = []

        for utterance in logger.progress(
                corpus.utterances.values(),
                total=corpus.num_utterances,
                description='Find utterances to convert'):

            # Guard against utterances that have no issuer assigned
            # (``utterance.issuer`` is ``None``); they are carried over
            # without an issuer instead of failing with an AttributeError.
            issuer = utterance.issuer
            issuer_idx = None

            if issuer is not None:
                if issuer.idx not in out_corpus.issuers:
                    out_corpus.import_issuers(issuer)

                issuer_idx = issuer.idx

            if not self._does_utt_need_conversion(utterance):
                # Just copy everything to the output corpus
                self._copy_utterance_to_corpus(utterance, out_corpus)
                continue

            # Store audio in a new file
            if self.separate_file_per_utterance:
                # One file per utterance: only the utterance's own segment
                # is extracted, so the new utterance spans the whole file.
                path = self._target_file_path(target_audio_path, utterance.idx)
                files_to_convert.append((
                    utterance.track.path,
                    utterance.start,
                    utterance.end,
                    path
                ))

                # Use the idx of the track that was actually created.
                track_idx = out_corpus.new_file(path, utterance.idx).idx
                start = 0
                end = float('inf')

            else:
                # Keep the file/utterance structure: the whole track is
                # converted once and shared by all of its utterances.
                track_idx = utterance.track.idx

                if track_idx not in out_corpus.tracks:
                    path = self._target_file_path(target_audio_path, track_idx)
                    files_to_convert.append((
                        utterance.track.path,
                        0,
                        float('inf'),
                        path
                    ))
                    out_corpus.new_file(path, track_idx)

                start = utterance.start
                end = utterance.end

            utt = out_corpus.new_utterance(
                utterance.idx,
                track_idx,
                issuer_idx=issuer_idx,
                start=start,
                end=end
            )

            # Deep copy so the new corpus does not share label objects
            # with the input corpus.
            lls = copy.deepcopy(list(utterance.label_lists.values()))
            utt.set_label_list(lls)

        self._copy_subviews_to_corpus(corpus, out_corpus)
        self._convert_files(files_to_convert)

        return out_corpus

    @abc.abstractmethod
    def _file_extension(self):
        """ Return the file-extension that will be used. """
        raise NotImplementedError()

    @abc.abstractmethod
    def _does_utt_match_target_format(self, utterance):
        """
        Return ``True`` if the utterance already matches the target format,
        ``False`` otherwise.
        """
        raise NotImplementedError()

    @abc.abstractmethod
    def _convert_files(self, files):
        """
        Convert the given files to the target format.

        Args:
            files (list): List of tuples as collected by ``convert``, each
                          of the form
                          ``(source_path, start, end, target_path)``.
                          The segment ``start`` to ``end`` of the audio at
                          ``source_path`` has to be stored in the target
                          format at ``target_path``. ``end`` is
                          ``float('inf')`` if the audio should be read up to
                          its end.
        """
        raise NotImplementedError()

    def _target_file_path(self, target_audio_path, idx):
        """
        Return the path ``<target_audio_path>/<idx>.<file extension>``
        for a converted audio file.
        """
        filename = '{}.{}'.format(idx, self._file_extension())
        return os.path.join(target_audio_path, filename)

    def _does_utt_need_conversion(self, utterance):
        """ Return True if an utterance needs to be converted. """
        if self.force_conversion:
            return True

        # Anything that is not a FileTrack is always converted.
        if not isinstance(utterance.track, tracks.FileTrack):
            return True

        # With one file per utterance, an utterance covering only a part of
        # its track has to be extracted into its own file.
        if self.separate_file_per_utterance:
            if utterance.start > 0 or utterance.end != float('inf'):
                return True

        return not self._does_utt_match_target_format(utterance)

    def _copy_utterance_to_corpus(self, utterance, corpus):
        """
        Import the utterance into the given corpus. The utterance's track is
        imported first, unless the corpus already has a track with that idx.
        """

        if utterance.track.idx not in corpus.tracks:
            corpus.import_tracks(utterance.track)

        corpus.import_utterances(utterance)

    def _copy_subviews_to_corpus(self, from_corpus, to_corpus):
        """ Create copy of all subviews from ``from_corpus`` in ``to_corpus``. """

        # Deep copy, so the subviews of the source corpus stay untouched.
        subviews = copy.deepcopy(from_corpus.subviews)
        for subview_idx, subview in subviews.items():
            to_corpus.import_subview(subview_idx, subview)