# Source code for audiomate.tracks.utterance

import collections
import copy

import numpy as np

from audiomate import annotations
from audiomate.utils import units


class Utterance:
    """
    An utterance defines a sample of audio.
    It is part of a track or can span over the whole track.

    Args:
        idx (str): A unique identifier for the utterance within a dataset.
        track (Track): The track this utterance is belonging to.
        issuer (Issuer): The issuer this utterance was created from.
        start (float): The start of the utterance within the audio track
                       in seconds. (default 0)
        end (float): The end of the utterance within the audio track in
                     seconds. ``inf`` indicates that the utterance ends
                     at the end of the track. (default ``inf``)
        label_lists (LabelList, list): A single or multiple label-lists.

    Attributes:
        label_lists (dict): A dictionary containing label-lists with
                            the label-list-idx as key.
    """

    __slots__ = ['idx', 'track', 'issuer', 'start', 'end', 'label_lists']

    def __init__(self, idx, track, issuer=None, start=0, end=float('inf'), label_lists=None):
        self.idx = idx
        self.track = track
        self.issuer = issuer
        self.start = start
        self.end = end
        self.label_lists = {}

        if label_lists is not None:
            self.set_label_list(label_lists)

        # Register the utterance with its issuer, so the relation is
        # navigable in both directions.
        if self.issuer is not None:
            self.issuer.utterances.add(self)

    def __copy__(self):
        # NOTE: The label-lists are shared with the original utterance and
        # passed through ``set_label_list``, which re-points their
        # ``utterance`` attribute to the copy.
        return Utterance(
            self.idx,
            self.track,
            issuer=self.issuer,
            start=self.start,
            end=self.end,
            label_lists=list(self.label_lists.values())
        )

    def __deepcopy__(self, memo):
        return Utterance(
            self.idx,
            copy.deepcopy(self.track, memo),
            issuer=copy.deepcopy(self.issuer, memo),
            start=self.start,
            end=self.end,
            label_lists=copy.deepcopy(list(self.label_lists.values()), memo)
        )

    @property
    def end_abs(self):
        """
        Return the absolute end of the utterance relative to the signal.
        """
        if self.end == float('inf'):
            return self.track.duration
        else:
            return self.end

    @property
    def duration(self):
        """
        Return the absolute duration in seconds.
        """
        return self.end_abs - self.start

    def num_samples(self, sr=None):
        """
        Return the number of samples.

        Args:
            sr (int): Calculate the number of samples with the given
                      sampling-rate. If None use the native sampling-rate.

        Returns:
            int: Number of samples
        """
        native_sr = self.sampling_rate
        num_samples = units.seconds_to_sample(self.duration, native_sr)

        if sr is not None:
            # Scale the native sample count to the requested rate,
            # rounding up.
            ratio = float(sr) / native_sr
            num_samples = int(np.ceil(num_samples * ratio))

        return num_samples

    #
    #   Signal
    #

    def read_samples(self, sr=None, offset=0, duration=None):
        """
        Read the samples of the utterance.

        Args:
            sr (int): If None uses the sampling rate given by the track,
                      otherwise resamples to the given sampling rate.
            offset (float): Offset in seconds to read samples from.
            duration (float): If not ``None`` read only this
                              number of seconds in maximum.

        Returns:
            np.ndarray: A numpy array containing the samples
            as a floating point (numpy.float32) time series.
        """
        # The track is read in track-relative time, so shift the offset
        # by the start of the utterance.
        offset = self.start + offset

        if self.end != float('inf'):
            # Never read beyond the end of the utterance.
            if duration is not None:
                duration = min(duration, self.end - offset)
            else:
                duration = self.end - offset

        return self.track.read_samples(
            sr=sr,
            offset=offset,
            duration=duration
        )

    @property
    def sampling_rate(self):
        """
        Return the sampling rate.
        """
        return self.track.sampling_rate

    #
    #   Labels
    #

    def set_label_list(self, label_lists):
        """
        Set the given label-list for this utterance.
        If the label-list-idx is not set, ``default`` is used.
        If there is already a label-list with the given idx,
        it will be overridden.

        Args:
            label_lists (LabelList, list): A single or multiple
                                           label-lists to add.
        """
        if isinstance(label_lists, annotations.LabelList):
            label_lists = [label_lists]

        for label_list in label_lists:
            if label_list.idx is None:
                label_list.idx = 'default'

            label_list.utterance = self
            self.label_lists[label_list.idx] = label_list

    def all_label_values(self, label_list_ids=None):
        """
        Return a set of all label-values occurring in this utterance.

        Args:
            label_list_ids (list): If not None, only label-values from
                                   label-lists with an id contained in this
                                   list are considered.

        Returns:
            :class:`set`: A set of distinct label-values.
        """
        values = set()

        for label_list in self.label_lists.values():
            if label_list_ids is None or label_list.idx in label_list_ids:
                values.update(label_list.label_values())

        return values

    def label_count(self, label_list_ids=None):
        """
        Return a dictionary containing the number of times,
        every label-value in this utterance is occurring.

        Args:
            label_list_ids (list): If not None, only labels from label-lists
                                   with an id contained in this list
                                   are considered.

        Returns:
            dict: A dictionary containing the number of occurrences
            with the label-value as key.
        """
        count = collections.defaultdict(int)

        for label_list in self.label_lists.values():
            if label_list_ids is None or label_list.idx in label_list_ids:
                for label_value, label_count in label_list.label_count().items():
                    count[label_value] += label_count

        return count

    def all_tokens(self, delimiter=' ', label_list_ids=None):
        """
        Return a set of all tokens occurring in
        one of the labels in the label-lists.

        Args:
            delimiter (str): The delimiter used to split labels into tokens
                             (see :meth:`audiomate.annotations.Label.tokenized`).
            label_list_ids (list): If not None, only labels from label-lists
                                   with an idx contained in this list
                                   are considered.

        Returns:
            :class:`set`: A set of distinct tokens.
        """
        tokens = set()

        for label_list in self.label_lists.values():
            if label_list_ids is None or label_list.idx in label_list_ids:
                tokens.update(label_list.all_tokens(delimiter=delimiter))

        return tokens

    def label_total_duration(self, label_list_ids=None):
        """
        Return a dictionary containing the number of seconds,
        every label-value is occurring in this utterance.

        Args:
            label_list_ids (list): If not None, only labels from label-lists
                                   with an id contained in this list
                                   are considered.

        Returns:
            dict: A dictionary containing the number of seconds
            with the label-value as key.
        """
        duration = collections.defaultdict(float)

        for label_list in self.label_lists.values():
            if label_list_ids is None or label_list.idx in label_list_ids:
                for label_value, label_duration in label_list.label_total_duration().items():
                    duration[label_value] += label_duration

        return duration

    def split(self, cutting_points, track_relative=False, overlap=0.0):
        """
        Split the utterance into x parts (sub-utterances) and
        return them as new utterances.
        x is defined by the number of cutting-points lying within the
        utterance (``x = number of such cutting-points + 1``).

        By default cutting-points are relative to the start of the utterance.
        For example if an utterance starts at 50s,
        a cutting-point of 10.0 will split the utterance at 60s relative
        to the track.

        Args:
            cutting_points (list): List of floats defining the times in
                                   seconds where to split the utterance.
            track_relative (bool): If ``True``, cutting-points are relative
                                   to the start of the track.
                                   Otherwise they are relative to
                                   the start of the utterance.
            overlap (float): Amount of overlap in seconds.
                             This amount is subtracted
                             from a start-cutting-point,
                             and added to a end-cutting-point.

        Returns:
            list: List of :class:`Utterance`'s.

        Raises:
            ValueError: If ``cutting_points`` is empty.

        Example:

            >>> utt = Utterance('utt-1', 'file-x', start=0.0, end=30.0)
            >>> sub_utts = utt.split([10.0, 20.0])
            >>> len(sub_utts)
            3
            >>> sub_utts[0].start
            0.0
            >>> sub_utts[0].end
            10.0
        """
        if len(cutting_points) == 0:
            raise ValueError('At least 1 cutting point is needed!')

        # From here on cutting-points are always track-relative.
        if not track_relative:
            cutting_points = [c + self.start for c in cutting_points]

        # Label times are relative to the utterance, so the label-lists are
        # split at utterance-relative cutting-points.
        # NOTE(review): the label-lists are split on every given
        # cutting-point, including ones outside the utterance that are
        # dropped below; parts[index] presumably only lines up with the
        # sub-utterances when no cutting-point is dropped - confirm against
        # LabelList.split.
        label_cutting_points = [c - self.start for c in cutting_points]
        splitted_label_lists = {}

        for idx, label_list in self.label_lists.items():
            splitted_label_lists[idx] = label_list.split(
                label_cutting_points,
                shift_times=True,
                overlap=overlap
            )

        # Only consider cutting-points within utterance.
        filtered_cutting_points = [
            c for c in cutting_points if self.start < c < self.end
        ]

        sub_utterances = []

        for index in range(len(filtered_cutting_points) + 1):
            if index == 0:
                sub_start = self.start
            else:
                # Use the filtered list here as well: indexing the
                # unfiltered list yields the wrong start as soon as a
                # cutting-point outside the utterance was dropped.
                sub_start = max(
                    self.start,
                    filtered_cutting_points[index - 1] - overlap
                )

            if index >= len(filtered_cutting_points):
                sub_end = self.end
            else:
                sub_end = min(
                    self.end,
                    filtered_cutting_points[index] + overlap
                )

            new_idx = '{}_{}'.format(self.idx, index)
            new_utt = Utterance(
                new_idx,
                track=self.track,
                issuer=self.issuer,
                start=sub_start,
                end=sub_end
            )

            for parts in splitted_label_lists.values():
                new_utt.set_label_list(parts[index])

            sub_utterances.append(new_utt)

        return sub_utterances