import copy
import os
import shutil
from audiomate import tracks
from audiomate import containers
from audiomate import issuers
from audiomate.utils import naming
from audiomate.utils import audio
from . import base
from . import subset
# Name of the folder, joined onto the corpus path, that audio files are copied into
# when a file is added with ``copy_file=True``.
DEFAULT_FILE_SUBDIR = 'files'
# Name of the folder, joined onto the corpus path, that is used to build the default
# path of a feature container when no explicit path is given.
DEFAULT_FEAT_SUBDIR = 'features'
class Corpus(base.CorpusView):
    """
    The Corpus class represents a single corpus.

    It extends :py:class:`audiomate.corpus.CorpusView` with the functionality for loading and saving.
    Furthermore it provides the functionality for adding/modifying assets of the corpus like tracks
    and utterances.

    Args:
        path (str): Path where the corpus is stored. (Optional)
    """

    def __init__(self, path=None):
        super().__init__()

        self.path = path

        # All assets are kept in dictionaries keyed by their idx.
        self._tracks = {}
        self._utterances = {}
        self._issuers = {}
        self._feature_containers = {}
        self._subviews = {}

    @property
    def name(self):
        """Name of the corpus: the last component of its path, or ``'undefined'`` without a path."""
        if self.path is None:
            return 'undefined'

        return os.path.basename(os.path.abspath(self.path))

    @property
    def tracks(self):
        """dict: The tracks of the corpus, keyed by track-idx."""
        return self._tracks

    @property
    def utterances(self):
        """dict: The utterances of the corpus, keyed by utterance-idx."""
        return self._utterances

    @property
    def issuers(self):
        """dict: The issuers of the corpus, keyed by issuer-idx."""
        return self._issuers

    @property
    def feature_containers(self):
        """dict: The feature-containers of the corpus, keyed by their idx."""
        return self._feature_containers

    @property
    def subviews(self):
        """dict: The subviews of the corpus, keyed by subview-idx."""
        return self._subviews

    #
    #   IO
    #

    def save(self, writer=None):
        """
        Save the corpus at ``self.path``.

        Args:
            writer (str, CorpusWriter): The writer or the name of the writer to use.
                                        If None, the default writer is used.

        Raises:
            ValueError: If the corpus has no path set.
        """
        if self.path is None:
            raise ValueError('No path given to save the data set.')

        self.save_at(self.path, writer)

    def save_at(self, path, writer=None):
        """
        Save this corpus at the given path. If the path differs from the current path set, the path
        gets updated.

        Args:
            path (str): Path to save the data set to.
            writer (str, CorpusWriter): The writer or the name of the writer to use.
                                        If None, the default writer is used.
        """
        # Imported locally, as in the rest of this class, rather than at module level.
        from . import io

        if writer is None:
            writer = io.DefaultWriter()
        elif isinstance(writer, str):
            # If a writer is given as string, try to create such a writer.
            writer = io.create_writer_of_type(writer)

        writer.save(self, path)

        # Only remember the new location once writing went through without an exception.
        self.path = path

    @classmethod
    def load(cls, path, reader=None):
        """
        Loads the corpus from the given path, using the given reader. If no reader is given the
        :py:class:`audiomate.corpus.io.DefaultReader` is used.

        Args:
            path (str): Path to load the corpus from.
            reader (str, CorpusReader): The reader or the name of the reader to use.

        Returns:
            Corpus: The loaded corpus.
        """
        from . import io

        if reader is None:
            reader = io.DefaultReader()
        elif isinstance(reader, str):
            # If a reader is given as string, try to create such a reader.
            reader = io.create_reader_of_type(reader)

        return reader.load(path)

    #
    #   Track
    #

    def new_file(self, path, track_idx, copy_file=False):
        """
        Adds a new audio file to the corpus with the given data.

        Args:
            path (str): Path of the file to add.
            track_idx (str): The id to associate the file-track with.
                             If it already exists, a unique id is derived from it.
            copy_file (bool): If True the file is copied to the data set folder, otherwise the given
                              path is used directly.

        Returns:
            FileTrack: The newly added file.

        Raises:
            ValueError: If ``copy_file`` is True but the corpus has no path
                        pointing to an existing directory.
        """
        new_file_idx = track_idx
        new_file_path = os.path.abspath(path)

        # Add index to idx if already existing
        if new_file_idx in self._tracks:
            new_file_idx = naming.index_name_if_in_list(new_file_idx, self._tracks.keys())

        # Copy file to default file dir
        if copy_file:
            # Check for None explicitly: os.path.isdir(None) can raise a TypeError
            # instead of returning False, which would hide the intended error.
            if self.path is None or not os.path.isdir(self.path):
                raise ValueError('To copy file the dataset needs to have a path.')

            __, ext = os.path.splitext(path)

            new_file_folder = os.path.join(self.path, DEFAULT_FILE_SUBDIR)
            new_file_path = os.path.join(new_file_folder, '{}{}'.format(new_file_idx, ext))
            os.makedirs(new_file_folder, exist_ok=True)
            shutil.copy(path, new_file_path)

        # Create file obj
        new_file = tracks.FileTrack(new_file_idx, new_file_path)
        self._tracks[new_file_idx] = new_file

        return new_file

    def import_tracks(self, import_tracks):
        """
        Add the given tracks/track to the corpus.
        If any of the given track-ids already exists, a suffix is appended so it is unique.

        Note:
            The given track objects themselves are added (not copies) and
            their ``idx`` is changed in place if it has to be made unique.

        Args:
            import_tracks (list): Either a list of or a single :py:class:`audiomate.tracks.Track`.

        Returns:
            dict: A dictionary containing track-idx mappings (old-track-idx/track-instance).
                  If a track is imported, whose idx already exists this mapping can be used to check
                  the new id.
        """
        if isinstance(import_tracks, tracks.Track):
            import_tracks = [import_tracks]

        idx_mapping = {}

        for track in import_tracks:
            # Record the mapping under the id the track had before any renaming.
            idx_mapping[track.idx] = track

            # Add index to idx if already existing
            if track.idx in self._tracks:
                track.idx = naming.index_name_if_in_list(track.idx, self._tracks.keys())

            self._tracks[track.idx] = track

        return idx_mapping

    #
    #   Utterances
    #

    def new_utterance(self, utterance_idx, track_idx, issuer_idx=None, start=0, end=float('inf')):
        """
        Add a new utterance to the corpus with the given data.

        Args:
            utterance_idx (str): The id to associate with the utterance.
                                 If it already exists, a unique id is derived from it.
            track_idx (str): The track id the utterance is in.
            issuer_idx (str): The issuer id to associate with the utterance.
            start (float): Start of the utterance within the track [seconds].
            end (float): End of the utterance within the track [seconds].
                         ``inf`` equals the end of the track.

        Returns:
            Utterance: The newly added utterance.

        Raises:
            ValueError: If no track with ``track_idx`` exists, or if ``issuer_idx``
                        is given but no such issuer exists.
        """
        new_utt_idx = utterance_idx

        # Check if there is a track with the given idx
        if track_idx not in self._tracks:
            raise ValueError('Track with id {} does not exist!'.format(track_idx))

        # Check if issuer exists
        issuer = None

        if issuer_idx is not None:
            if issuer_idx not in self._issuers:
                raise ValueError('Issuer with id {} does not exist!'.format(issuer_idx))

            issuer = self._issuers[issuer_idx]

        # Add index to idx if already existing
        if new_utt_idx in self._utterances:
            new_utt_idx = naming.index_name_if_in_list(new_utt_idx, self._utterances.keys())

        new_utt = tracks.Utterance(new_utt_idx,
                                   self.tracks[track_idx],
                                   issuer=issuer,
                                   start=start,
                                   end=end)

        self._utterances[new_utt_idx] = new_utt

        return new_utt

    def import_utterances(self, utterances):
        """
        Add the given utterances/utterance to the corpus.
        If any of the given utterance-ids already exists, a suffix is appended so it is unique.

        Note:
            The given utterance objects themselves are added (not copies) and
            their ``idx`` is changed in place if it has to be made unique.
            Utterances are validated and added one after another, so utterances
            preceding an invalid one are already part of the corpus when the
            error is raised.

        Args:
            utterances (list): Either a list of or a single :py:class:`audiomate.tracks.Utterance`.

        Returns:
            dict: A dictionary containing idx mappings (old-utterance-idx/utterance-instance).
                  If a utterance is imported, whose id already exists this mapping can be used to
                  check the new id.

        Raises:
            ValueError: If the track or the issuer of an utterance is not part of this corpus.
        """
        if isinstance(utterances, tracks.Utterance):
            utterances = [utterances]

        idx_mapping = {}

        for utterance in utterances:
            # Record the mapping under the id the utterance had before any renaming.
            idx_mapping[utterance.idx] = utterance

            # Check if the track of the utterance is part of the corpus
            if not self.contains_track(utterance.track):
                raise ValueError('Track with id {} is not in the corpus.'.format(utterance.track.idx))

            # Check if the issuer of the utterance is part of the corpus
            if utterance.issuer is not None and not self.contains_issuer(utterance.issuer):
                raise ValueError('No issuer in corpus with id {} to add utterance {}.'.format(
                    utterance.issuer.idx, utterance.idx))

            # Add index to idx if already existing
            if utterance.idx in self._utterances:
                utterance.idx = naming.index_name_if_in_list(utterance.idx, self._utterances.keys())

            self._utterances[utterance.idx] = utterance

        return idx_mapping

    #
    #   Issuer
    #

    def new_issuer(self, issuer_idx, info=None):
        """
        Add a new issuer to the dataset with the given data.

        Args:
            issuer_idx (str): The id to associate the issuer with.
                              If it already exists, a unique id is derived from it.
            info (dict, list): Additional info of the issuer.

        Returns:
            Issuer: The newly added issuer.
        """
        new_issuer_idx = issuer_idx

        # Add index to idx if already existing
        if new_issuer_idx in self._issuers:
            new_issuer_idx = naming.index_name_if_in_list(new_issuer_idx, self._issuers.keys())

        new_issuer = issuers.Issuer(new_issuer_idx, info=info)
        self._issuers[new_issuer_idx] = new_issuer

        return new_issuer

    def import_issuers(self, new_issuers):
        """
        Add the given issuers/issuer to the corpus.
        If any of the given issuer-ids already exists, a suffix is appended so it is unique.

        Note:
            The given issuer objects themselves are added (not copies) and
            their ``idx`` is changed in place if it has to be made unique.

        Args:
            new_issuers (list): Either a list of or a single :py:class:`audiomate.issuers.Issuer`.

        Returns:
            dict: A dictionary containing idx mappings (old-issuer-idx/issuer-instance).
                  If a issuer is imported, whose id already exists this mapping can be used to check
                  the new id.
        """
        if isinstance(new_issuers, issuers.Issuer):
            new_issuers = [new_issuers]

        idx_mapping = {}

        for issuer in new_issuers:
            # Record the mapping under the id the issuer had before any renaming.
            idx_mapping[issuer.idx] = issuer

            # Add index to idx if already existing
            if issuer.idx in self._issuers:
                issuer.idx = naming.index_name_if_in_list(issuer.idx, self._issuers.keys())

            self._issuers[issuer.idx] = issuer

        return idx_mapping

    #
    #   FEATURES
    #

    def new_feature_container(self, idx, path=None):
        """
        Add a new feature container with the given data.

        Args:
            idx (str): An unique identifier within the dataset.
                       If it already exists, a unique id is derived from it.
            path (str): The path to store the feature file. If None a default path
                        below the corpus path is used.

        Returns:
            FeatureContainer: The newly added feature-container.

        Raises:
            ValueError: If no ``path`` is given and the corpus has no path
                        pointing to an existing directory.
        """
        new_feature_idx = idx
        new_feature_path = path

        # Add index to idx if already existing
        if new_feature_idx in self._feature_containers:
            new_feature_idx = naming.index_name_if_in_list(new_feature_idx,
                                                           self._feature_containers.keys())

        # Set default path if none given
        if new_feature_path is None:
            # Check for None explicitly: os.path.isdir(None) can raise a TypeError
            # instead of returning False, which would hide the intended error.
            if self.path is None or not os.path.isdir(self.path):
                raise ValueError('To use the default feature path the dataset needs to have a path.')

            new_feature_path = os.path.join(self.path, DEFAULT_FEAT_SUBDIR, new_feature_idx)
        else:
            new_feature_path = os.path.abspath(new_feature_path)

        feat_container = containers.FeatureContainer(new_feature_path)
        self._feature_containers[new_feature_idx] = feat_container

        return feat_container

    #
    #   Subviews
    #

    def import_subview(self, idx, subview):
        """
        Add the given subview to the corpus.

        Args:
            idx (str): An idx that is unique in the corpus for identifying the subview.
                       If already a subview exists with the given id it will be overridden.
            subview (Subview): The subview to add. Its ``corpus`` attribute is set to this corpus.
        """
        subview.corpus = self
        self._subviews[idx] = subview

    #
    #   Merge
    #

    def merge_corpus(self, corpus):
        """
        Merge the given corpus into this corpus. All assets (tracks, utterances, issuers, ...) are copied into
        this corpus. If any ids (utt-idx, track-idx, issuer-idx, subview-idx, ...) are occurring in both corpora,
        the ids from the merging corpus are suffixed by a number (starting from 1 until no other is matching).

        Args:
            corpus (CorpusView): The corpus to merge. It is left unchanged.
        """
        # Create a copy, so objects aren't changed in the original merging corpus.
        merging_corpus = Corpus.from_corpus(corpus)

        # Import the *copied* assets. Importing may rename ids in place, which
        # must never happen to the objects owned by the given corpus.
        # The views are materialised first, since the imported objects are mutated.
        self.import_tracks(list(merging_corpus.tracks.values()))
        self.import_issuers(list(merging_corpus.issuers.values()))
        utterance_idx_mapping = self.import_utterances(list(merging_corpus.utterances.values()))

        for subview_idx, subview in merging_corpus.subviews.items():
            for criterion in subview.filter_criteria:
                # Filters that reference utterances by id have to follow the renaming.
                if isinstance(criterion, subset.MatchingUtteranceIdxFilter):
                    criterion.utterance_idxs = {
                        utterance_idx_mapping[utt_idx].idx
                        for utt_idx in criterion.utterance_idxs
                    }

            new_idx = naming.index_name_if_in_list(subview_idx, self.subviews.keys())
            self.import_subview(new_idx, subview)

        for feat_container_idx, feat_container in merging_corpus.feature_containers.items():
            self.new_feature_container(feat_container_idx, feat_container.path)

    #
    #   VARIA
    #

    def relocate_audio_to_single_container(self, target_path):
        """
        Copies every track to a single container.
        Afterwards all tracks in the corpus are linked against
        this single container.

        Args:
            target_path (str): Path of the audio container to write the tracks to.
        """
        cont = containers.AudioContainer(target_path)
        cont.open()

        # Make sure the container is closed again, even if reading or
        # writing one of the tracks fails.
        try:
            new_tracks = {}

            # First create a new container track for all existing tracks
            for track in self.tracks.values():
                sr = track.sampling_rate
                samples = track.read_samples()

                cont.set(track.idx, samples, sr)
                new_tracks[track.idx] = tracks.ContainerTrack(track.idx, cont)

            # Update track list of corpus
            self._tracks = new_tracks

            # Update utterances to point to new tracks
            for utterance in self.utterances.values():
                utterance.track = self.tracks[utterance.track.idx]
        finally:
            cont.close()

    def relocate_audio_to_wav_files(self, target_path):
        """
        Copies every track to its own wav file in the given folder.
        Every track will be stored at ``target_path/track_id.wav``.

        Args:
            target_path (str): Folder to store the wav files in. It is created if it doesn't exist.
        """
        os.makedirs(target_path, exist_ok=True)

        new_tracks = {}

        # First create a new file track for all existing tracks
        for track in self.tracks.values():
            track_path = os.path.join(target_path, '{}.wav'.format(track.idx))

            sr = track.sampling_rate
            samples = track.read_samples()

            audio.write_wav(track_path, samples, sr=sr)
            new_tracks[track.idx] = tracks.FileTrack(track.idx, track_path)

        # Update track list of corpus
        self._tracks = new_tracks

        # Update utterances to point to new tracks
        for utterance in self.utterances.values():
            utterance.track = self.tracks[utterance.track.idx]

    #
    #   Creation
    #

    @classmethod
    def from_corpus(cls, corpus):
        """
        Create a new modifiable corpus from any other CorpusView.
        This for example can be used to create a independent modifiable corpus from a subview.

        Args:
            corpus (CorpusView): The corpus to create a copy from.

        Returns:
            Corpus: A new corpus with the same data as the given one.
        """
        ds = Corpus()

        # Tracks
        copied_tracks = copy.deepcopy(list(corpus.tracks.values()))
        track_mapping = ds.import_tracks(copied_tracks)

        # Issuers
        copied_issuers = copy.deepcopy(list(corpus.issuers.values()))
        issuer_mapping = ds.import_issuers(copied_issuers)

        # Utterances. They are deep-copied in a separate call, so they have to be
        # re-linked to the track/issuer instances that were imported above.
        copied_utterances = copy.deepcopy(list(corpus.utterances.values()))

        for utterance in copied_utterances:
            utterance.track = track_mapping[utterance.track.idx]

            if utterance.issuer is not None:
                utterance.issuer = issuer_mapping[utterance.issuer.idx]

        ds.import_utterances(copied_utterances)

        # Subviews
        copied_subviews = copy.deepcopy(corpus.subviews)

        for subview_idx, subview in copied_subviews.items():
            ds.import_subview(subview_idx, subview)

        # Feat-Containers
        for feat_container_idx, feature_container in corpus.feature_containers.items():
            ds.new_feature_container(feat_container_idx, feature_container.path)

        return ds

    @classmethod
    def merge_corpora(cls, corpora):
        """
        Merge a list of corpora into one.

        Args:
            corpora (Iterable): An iterable of :py:class:`audiomate.corpus.CorpusView`.

        Returns:
            Corpus: A corpus with the data from all given corpora merged into one.
        """
        ds = Corpus()

        for merging_corpus in corpora:
            ds.merge_corpus(merging_corpus)

        return ds