import collections
import copy
import numpy as np
from audiomate import annotations
from audiomate.utils import units
class Utterance:
    """
    An utterance defines a sample of audio.
    It is part of a track or can span over the whole track.

    If an issuer is given, the utterance registers itself in
    ``issuer.utterances`` on construction.

    Args:
        idx (str): A unique identifier for the utterance within a dataset.
        track (Track): The track this utterance is belonging to.
        issuer (Issuer): The issuer this utterance was created from.
        start (float): The start of the utterance
                       within the audio track in seconds. (default 0)
        end (float): The end of the utterance within the audio track in seconds.
                     ``inf`` indicates that the utterance ends
                     at the end of the track. (default ``inf``)
        label_lists (LabelList, list): A single or multiple label-lists.

    Attributes:
        label_lists (dict): A dictionary containing label-lists
                            with the label-list-idx as key.
    """

    __slots__ = ['idx', 'track', 'issuer', 'start', 'end', 'label_lists']

    def __init__(self, idx, track, issuer=None, start=0, end=float('inf'), label_lists=None):
        self.idx = idx
        self.track = track
        self.issuer = issuer
        self.start = start
        self.end = end
        self.label_lists = {}

        if label_lists is not None:
            self.set_label_list(label_lists)

        # Keep the back-reference on the issuer in sync.
        if self.issuer is not None:
            self.issuer.utterances.add(self)

    def __copy__(self):
        """
        Return a shallow copy of the utterance.

        Track, issuer and the label-list objects are shared with the
        original. Since the copy is created via the constructor, the shared
        label-lists get their ``utterance`` attribute rebound to the copy
        and the copy is added to ``issuer.utterances``.
        """
        return Utterance(
            self.idx,
            self.track,
            issuer=self.issuer,
            start=self.start,
            end=self.end,
            label_lists=list(self.label_lists.values())
        )

    def __deepcopy__(self, memo):
        """
        Return a deep copy of the utterance.

        Track, issuer and label-lists are deep-copied using ``memo``.
        """
        return Utterance(
            self.idx,
            copy.deepcopy(self.track, memo),
            issuer=copy.deepcopy(self.issuer, memo),
            start=self.start,
            end=self.end,
            label_lists=copy.deepcopy(list(self.label_lists.values()), memo)
        )

    @property
    def end_abs(self):
        """
        Return the absolute end of the utterance relative to the signal.

        If ``end`` is ``inf`` the duration of the track is returned.
        """
        if self.end == float('inf'):
            return self.track.duration

        return self.end

    @property
    def duration(self):
        """ Return the absolute duration in seconds. """
        return self.end_abs - self.start

    def num_samples(self, sr=None):
        """
        Return the number of samples.

        Args:
            sr (int): Calculate the number of samples with the given
                      sampling-rate. If None use the native sampling-rate.

        Returns:
            int: Number of samples
        """
        native_sr = self.sampling_rate
        num_samples = units.seconds_to_sample(self.duration, native_sr)

        if sr is not None:
            # Scale the native sample count to the requested rate,
            # rounding up to a whole sample.
            ratio = float(sr) / native_sr
            num_samples = int(np.ceil(num_samples * ratio))

        return num_samples

    #
    #   Signal
    #

    def read_samples(self, sr=None, offset=0, duration=None):
        """
        Read the samples of the utterance.

        Args:
            sr (int): If None uses the sampling rate given by the track,
                      otherwise resamples to the given sampling rate.
            offset (float): Offset in seconds to read samples from.
                            It is relative to the start of the utterance.
            duration (float): If not ``None`` read only this
                              number of seconds in maximum.

        Returns:
            np.ndarray: A numpy array containing the samples
            as a floating point (numpy.float32) time series.
        """
        # Translate the utterance-relative offset to a track-relative one.
        offset = self.start + offset

        if self.end != float('inf'):
            # Never read beyond the end of the utterance.
            remaining = self.end - offset

            if duration is not None:
                duration = min(duration, remaining)
            else:
                duration = remaining

        return self.track.read_samples(
            sr=sr,
            offset=offset,
            duration=duration
        )

    @property
    def sampling_rate(self):
        """ Return the sampling rate of the underlying track. """
        return self.track.sampling_rate

    #
    #   Labels
    #

    def set_label_list(self, label_lists):
        """
        Set the given label-list(s) for this utterance.
        If the label-list-idx is not set, ``default`` is used.
        If there is already a label-list with the given idx,
        it will be overridden.

        Every label-list gets its ``utterance`` attribute
        set to this utterance.

        Args:
            label_lists (LabelList, list): A single or multiple
                                           label-lists to add.
        """
        if isinstance(label_lists, annotations.LabelList):
            label_lists = [label_lists]

        for label_list in label_lists:
            if label_list.idx is None:
                label_list.idx = 'default'

            label_list.utterance = self
            self.label_lists[label_list.idx] = label_list

    def all_label_values(self, label_list_ids=None):
        """
        Return a set of all label-values occurring in this utterance.

        Args:
            label_list_ids (list): If not None, only label-values from
                                   label-lists with an id contained in this list
                                   are considered.

        Returns:
             :class:`set`: A set of distinct label-values.
        """
        values = set()

        for label_list in self.label_lists.values():
            if label_list_ids is None or label_list.idx in label_list_ids:
                values.update(label_list.label_values())

        return values

    def label_count(self, label_list_ids=None):
        """
        Return a dictionary containing the number of times,
        every label-value in this utterance is occurring.

        Args:
            label_list_ids (list): If not None, only labels from label-lists
                                   with an id contained in this list
                                   are considered.

        Returns:
            dict: A dictionary containing the number of occurrences
            with the label-value as key.
        """
        count = collections.defaultdict(int)

        for label_list in self.label_lists.values():
            if label_list_ids is None or label_list.idx in label_list_ids:
                for label_value, label_count in label_list.label_count().items():
                    count[label_value] += label_count

        return count

    def all_tokens(self, delimiter=' ', label_list_ids=None):
        """
        Return a set of all tokens occurring in
        one of the labels in the label-lists.

        Args:
            delimiter (str): The delimiter used to split labels into tokens
                             (see :meth:`audiomate.annotations.Label.tokenized`).
            label_list_ids (list): If not None, only labels from label-lists with
                                   an idx contained in this list are considered.

        Returns:
             :class:`set`: A set of distinct tokens.
        """
        tokens = set()

        for label_list in self.label_lists.values():
            if label_list_ids is None or label_list.idx in label_list_ids:
                tokens.update(label_list.all_tokens(delimiter=delimiter))

        return tokens

    def label_total_duration(self, label_list_ids=None):
        """
        Return a dictionary containing the number of seconds,
        every label-value is occurring in this utterance.

        Args:
            label_list_ids (list): If not None, only labels from label-lists
                                   with an id contained in this
                                   list are considered.

        Returns:
            dict: A dictionary containing the number of seconds
            with the label-value as key.
        """
        duration = collections.defaultdict(float)

        for label_list in self.label_lists.values():
            if label_list_ids is None or label_list.idx in label_list_ids:
                for label_value, label_duration in label_list.label_total_duration().items():
                    duration[label_value] += label_duration

        return duration

    def split(self, cutting_points, track_relative=False, overlap=0.0):
        """
        Split the utterance into x parts (sub-utterances) and
        return them as new utterances. x is defined by the cutting-points
        that lie strictly within the utterance (``x = n + 1`` for ``n``
        such cutting-points). Cutting-points outside of the utterance
        are ignored for the sub-utterance boundaries.

        By default cutting-points are relative to the start of the utterance.
        For example if an utterance starts at 50s, a cutting-point
        of 10.0 will split the utterance at 60s relative to the track.

        Args:
            cutting_points (list): List of floats defining the times
                                   in seconds where to split the utterance.
            track_relative (bool): If ``True``, cutting-points are relative
                                   to the start of the track. Otherwise they
                                   are relative to the start of the utterance.
            overlap (float): Amount of overlap in seconds. This amount is
                             subtracted from a start-cutting-point,
                             and added to a end-cutting-point.

        Returns:
            list: List of :class:`Utterance`'s.

        Raises:
            ValueError: If no cutting-point is given.

        Example:

            >>> utt = Utterance('utt-1', 'file-x', start=0.0, end=30.0)
            >>> sub_utts = utt.split([10.0, 20.0])
            >>> len(sub_utts)
            3
            >>> sub_utts[0].start
            0.0
            >>> sub_utts[0].end
            10.0
        """
        # From here on cutting-points are always relative to the track.
        if not track_relative:
            cutting_points = [c + self.start for c in cutting_points]

        if len(cutting_points) == 0:
            raise ValueError('At least 1 cutting point is needed!')

        # Label-lists work with times relative to the utterance start.
        # NOTE(review): the label-lists are split with ALL cutting-points,
        # while the sub-utterances below only use the ones inside the
        # utterance. If points are dropped, ``parts[index]`` may not line up
        # with the sub-utterance -- confirm against ``LabelList.split``.
        splitted_label_lists = {}

        for idx, label_list in self.label_lists.items():
            label_cutting_points = [x - self.start for x in cutting_points]
            splitted_label_lists[idx] = label_list.split(
                label_cutting_points,
                shift_times=True,
                overlap=overlap
            )

        # Only consider cutting-points within utterance.
        filtered_cutting_points = [
            c for c in cutting_points if self.start < c < self.end
        ]

        sub_utterances = []

        for index in range(len(filtered_cutting_points) + 1):
            if index == 0:
                sub_start = self.start
            else:
                # Use the filtered points for the start as well, so start and
                # end of neighbouring parts refer to the same cutting-point.
                sub_start = max(
                    self.start,
                    filtered_cutting_points[index - 1] - overlap
                )

            if index >= len(filtered_cutting_points):
                sub_end = self.end
            else:
                sub_end = min(
                    self.end,
                    filtered_cutting_points[index] + overlap
                )

            new_idx = '{}_{}'.format(self.idx, index)
            new_utt = Utterance(
                new_idx,
                track=self.track,
                issuer=self.issuer,
                start=sub_start,
                end=sub_end
            )

            for parts in splitted_label_lists.values():
                new_utt.set_label_list(parts[index])

            sub_utterances.append(new_utt)

        return sub_utterances