Source code for audiomate.corpus.corpus

import copy
import os
import shutil

from audiomate.corpus import assets
from audiomate.utils import naming
from . import base
from . import subset

DEFAULT_FILE_SUBDIR = 'files'
DEFAULT_FEAT_SUBDIR = 'features'


[docs]class Corpus(base.CorpusView): """ The Corpus class represents a single corpus. It extends :py:class:`audiomate.corpus.CorpusView` with the functionality for loading and saving. Furthermore it provides the functionality for adding/modifying assets of the corpus like files and utterances. Args: path (str): Path where the corpus is stored. (Optional) """ def __init__(self, path=None): super(Corpus, self).__init__() self.path = path self._files = {} self._utterances = {} self._issuers = {} self._feature_containers = {} self._subviews = {} @property def name(self): if self.path is None: return 'undefined' else: return os.path.basename(os.path.abspath(self.path)) @property def files(self): return self._files @property def utterances(self): return self._utterances @property def issuers(self): return self._issuers @property def feature_containers(self): return self._feature_containers @property def subviews(self): return self._subviews # # IO #
[docs] def save(self, writer=None): """ If self.path is defined, it tries to save the corpus at the given path. """ if self.path is None: raise ValueError('No path given to save the data set.') self.save_at(self.path, writer)
[docs] def save_at(self, path, writer=None): """ Save this corpus at the given path. If the path differs from the current path set, the path gets updated. Parameters: path (str): Path to save the data set to. writer (str, CorpusWriter): The writer or the name of the reader to use. """ if writer is None: from . import io writer = io.DefaultWriter() elif type(writer) == str: # If a loader is given as string, try to create such a loader. from . import io writer = io.create_writer_of_type(writer) writer.save(self, path) self.path = path
[docs] @classmethod def load(cls, path, reader=None): """ Loads the corpus from the given path, using the given reader. If no reader is given the :py:class:`audiomate.corpus.io.DefaultReader` is used. Args: path (str): Path to load the corpus from. reader (str, CorpusReader): The reader or the name of the reader to use. Returns: Corpus: The loaded corpus. """ if reader is None: from . import io reader = io.DefaultReader() elif type(reader) == str: from . import io reader = io.create_reader_of_type(reader) return reader.load(path)
# # File #
[docs] def new_file(self, path, file_idx, copy_file=False): """ Adds a new file to the corpus with the given data. Parameters: path (str): Path of the file to add. file_idx (str): The id to associate the file with. copy_file (bool): If True the file is copied to the data set folder, otherwise the given path is used directly. Returns: File: The newly added File. """ new_file_idx = file_idx new_file_path = os.path.abspath(path) # Add index to idx if already existing if new_file_idx in self._files.keys(): new_file_idx = naming.index_name_if_in_list(new_file_idx, self._files.keys()) # Copy file to default file dir if copy_file: if not os.path.isdir(self.path): raise ValueError('To copy file the dataset needs to have a path.') __, ext = os.path.splitext(path) new_file_folder = os.path.join(self.path, DEFAULT_FILE_SUBDIR) new_file_path = os.path.join(new_file_folder, '{}{}'.format(new_file_idx, ext)) os.makedirs(new_file_folder, exist_ok=True) shutil.copy(path, new_file_path) # Create file obj new_file = assets.File(new_file_idx, new_file_path) self._files[new_file_idx] = new_file return new_file
[docs] def import_files(self, files): """ Add the given files/file to the corpus. If any of the given file-ids already exists, a suffix is appended so it is unique. Args: files (list): Either a list of or a single :py:class:`audiomate.corpus.assets.File`. Returns: dict: A dictionary containing file idx mappings (old-file-idx/file-instance). If a file is imported, whose id already exists this mapping can be used to check the new id. """ if isinstance(files, assets.File): files = [files] idx_mapping = {} for file in files: idx_mapping[file.idx] = file # Add index to idx if already existing if file.idx in self._files.keys(): file.idx = naming.index_name_if_in_list(file.idx, self._files.keys()) self._files[file.idx] = file return idx_mapping
# # Utterances #
[docs] def new_utterance(self, utterance_idx, file_idx, issuer_idx=None, start=0, end=-1): """ Add a new utterance to the corpus with the given data. Parameters: file_idx (str): The file id the utterance is in. utterance_idx (str): The id to associate with the utterance. If None or already exists, one is generated. issuer_idx (str): The issuer id to associate with the utterance. start (float): Start of the utterance within the file [seconds]. end (float): End of the utterance within the file [seconds]. -1 equals the end of the file. Returns: Utterance: The newly added utterance. """ new_utt_idx = utterance_idx # Check if there is a file with the given idx if file_idx not in self._files.keys(): raise ValueError('File with id {} does not exist!'.format(file_idx)) # Check if issuer exists issuer = None if issuer_idx is not None: if issuer_idx not in self._issuers.keys(): raise ValueError('Issuer with id {} does not exist!'.format(issuer_idx)) else: issuer = self._issuers[issuer_idx] # Add index to idx if already existing if new_utt_idx in self._utterances.keys(): new_utt_idx = naming.index_name_if_in_list(new_utt_idx, self._utterances.keys()) new_utt = assets.Utterance(new_utt_idx, self.files[file_idx], issuer=issuer, start=start, end=end) self._utterances[new_utt_idx] = new_utt return new_utt
[docs] def import_utterances(self, utterances): """ Add the given utterances/utterance to the corpus. If any of the given utterance-ids already exists, a suffix is appended so it is unique. Args: utterances (list): Either a list of or a single :py:class:`audiomate.corpus.assets.Utterance`. Returns: dict: A dictionary containing file idx mappings (old-utterance-idx/utterance-instance). If a utterance is imported, whose id already exists this mapping can be used to check the new id. """ if isinstance(utterances, assets.Utterance): utterances = [utterances] idx_mapping = {} for utterance in utterances: idx_mapping[utterance.idx] = utterance # Check if there is a file with the given idx if utterance.file not in self._files.values(): raise ValueError('File with id {} is not in the corpus.'.format(utterance.file.idx, utterance.idx)) # Check if there is a issuer with the given idx if utterance.issuer is not None and utterance.issuer not in self._issuers.values(): raise ValueError('No issuer in corpus with id {} to add utterance {}.'.format( utterance.issuer.idx, utterance.idx)) # Add index to idx if already existing if utterance.idx in self._utterances.keys(): utterance.idx = naming.index_name_if_in_list(utterance.idx, self._utterances.keys()) self._utterances[utterance.idx] = utterance return idx_mapping
# # Issuer #
[docs] def new_issuer(self, issuer_idx, info=None): """ Add a new issuer to the dataset with the given data. Parameters: issuer_idx (str): The id to associate the issuer with. If None or already exists, one is generated. info (dict, list): Additional info of the issuer. Returns: Issuer: The newly added issuer. """ new_issuer_idx = issuer_idx # Add index to idx if already existing if new_issuer_idx in self._issuers.keys(): new_issuer_idx = naming.index_name_if_in_list(new_issuer_idx, self._issuers.keys()) new_issuer = assets.Issuer(new_issuer_idx, info=info) self._issuers[new_issuer_idx] = new_issuer return new_issuer
[docs] def import_issuers(self, issuers): """ Add the given issuers/issuer to the corpus. If any of the given issuer-ids already exists, a suffix is appended so it is unique. Args: issuers (list): Either a list of or a single :py:class:`audiomate.corpus.assets.Issuer`. Returns: dict: A dictionary containing file idx mappings (old-issuer-idx/issuer-instance). If a issuer is imported, whose id already exists this mapping can be used to check the new id. """ if isinstance(issuers, assets.Issuer): issuers = [issuers] idx_mapping = {} for issuer in issuers: idx_mapping[issuer.idx] = issuer # Add index to idx if already existing if issuer.idx in self._issuers.keys(): issuer.idx = naming.index_name_if_in_list(issuer.idx, self._issuers.keys()) self._issuers[issuer.idx] = issuer return idx_mapping
# # FEATURES #
[docs] def new_feature_container(self, idx, path=None): """ Add a new feature container with the given data. Parameters: idx (str): An unique identifier within the dataset. path (str): The path to store the feature file. If None a default path is used. Returns: FeatureContainer: The newly added feature-container. """ new_feature_idx = idx new_feature_path = path # Add index to idx if already existing if new_feature_idx in self._feature_containers.keys(): new_feature_idx = naming.index_name_if_in_list(new_feature_idx, self._feature_containers.keys()) # Set default path if none given if new_feature_path is None: if not os.path.isdir(self.path): raise ValueError('To copy file the dataset needs to have a path.') new_feature_path = os.path.join(self.path, DEFAULT_FEAT_SUBDIR, new_feature_idx) else: new_feature_path = os.path.abspath(new_feature_path) container = assets.FeatureContainer(new_feature_path) self._feature_containers[new_feature_idx] = container return container
# # Subviews #
[docs] def import_subview(self, idx, subview): """ Add the given subview to the corpus. Args: idx (str): An idx that is unique in the corpus for identifying the subview. If already a subview exists with the given id it will be overridden. subview (Subview): The subview to add. """ subview.corpus = self self._subviews[idx] = subview
# # Merge #
[docs] def merge_corpus(self, corpus): """ Merge the given corpus into this corpus. All assets (files, utterances, issuers, ...) are copied into this corpus. If any ids (utt-idx, file-idx, issuer-idx, subview-idx, ...) are occurring in both corpora, the ids from the merging corpus are suffixed by a number (starting from 1 until no other is matching). Args: corpus (CorpusView): The corpus to merge. """ # Create a copy, so objects aren't changed in the original merging corpus merging_corpus = Corpus.from_corpus(corpus) self.import_files(corpus.files.values()) self.import_issuers(corpus.issuers.values()) utterance_idx_mapping = self.import_utterances(corpus.utterances.values()) for subview_idx, subview in merging_corpus.subviews.items(): for filter in subview.filter_criteria: if isinstance(filter, subset.MatchingUtteranceIdxFilter): new_filtered_utt_ids = set() for utt_idx in filter.utterance_idxs: new_filtered_utt_ids.add(utterance_idx_mapping[utt_idx].idx) filter.utterance_idxs = new_filtered_utt_ids new_idx = naming.index_name_if_in_list(subview_idx, self.subviews.keys()) self.import_subview(new_idx, subview) for feat_container_idx, feat_container in merging_corpus.feature_containers.items(): self.new_feature_container(feat_container_idx, feat_container.path)
# # Creation #
[docs] @classmethod def from_corpus(cls, corpus): """ Create a new modifiable corpus from any other CorpusView. This for example can be used to create a independent modifiable corpus from a subview. Args: corpus (CorpusView): The corpus to create a copy from. Returns: Corpus: A new corpus with the same data as the given one. """ ds = Corpus() # Files files = copy.deepcopy(list(corpus.files.values())) file_mapping = ds.import_files(files) # Issuers issuers = copy.deepcopy(list(corpus.issuers.values())) issuer_mapping = ds.import_issuers(issuers) # Utterances, with replacing changed file- and issuer-ids utterances = copy.deepcopy(list(corpus.utterances.values())) for utterance in utterances: utterance.file = file_mapping[utterance.file.idx] if utterance.issuer is not None: utterance.issuer = issuer_mapping[utterance.issuer.idx] ds.import_utterances(utterances) # Subviews subviews = copy.deepcopy(corpus.subviews) for subview_idx, subview in subviews.items(): ds.import_subview(subview_idx, subview) # Feat-Containers for feat_container_idx, feature_container in corpus.feature_containers.items(): ds.new_feature_container(feat_container_idx, feature_container.path) return ds
[docs] @classmethod def merge_corpora(cls, corpora): """ Merge a list of corpora into one. Args: corpora (Iterable): An iterable of :py:class:`audiomate.corpus.CorpusView`. Returns: Corpus: A corpus with the data from all given corpora merged into one. """ ds = Corpus() for merging_corpus in corpora: ds.merge_corpus(merging_corpus) return ds