import collections
import copy
import intervaltree
from .label import Label
[docs]class LabelList:
"""
Represents a list of labels which describe an utterance.
An utterance can have multiple label-lists.
Args:
idx (str): An unique identifier for the label-list
within a corpus for one utterance.
labels (list): The list containing the
:py:class:`audiomate.annotations.Label`.
Attributes:
utterance (Utterance): The utterance this label-list is belonging to.
label_tree (IntervalTree): The interval-tree storing the labels.
Example:
>>> label_list = LabelList(idx='transcription', labels=[
>>> Label('this', 0, 2),
>>> Label('is', 2, 4),
>>> Label('timmy', 4, 8)
>>> ])
"""
__slots__ = ['idx', 'label_tree', 'utterance']
def __init__(self, idx='default', labels=None):
self.idx = idx
self.utterance = None
self.label_tree = intervaltree.IntervalTree()
if labels is not None:
self.update(labels)
def __eq__(self, other):
data_this = (self.idx, self.label_tree)
data_other = (other.idx, other.label_tree)
return data_this == data_other
def __iter__(self):
for interval in self.label_tree:
yield interval.data
def __len__(self):
return self.label_tree.__len__()
def __copy__(self):
# utterance is ignored intentionally,
# since it is kind of a weak ref
return LabelList(
idx=self.idx,
labels=[iv.data for iv in self.label_tree]
)
def __deepcopy__(self, memo):
# utterance is ignored intentionally,
# since it is kind of a weak ref
return LabelList(
idx=self.idx,
labels=copy.deepcopy([iv.data for iv in self.label_tree], memo)
)
@property
def labels(self):
""" Return list of labels. """
return list(self)
@property
def start(self):
""" Return start of the earliest starting label (lower bound). """
return self.label_tree.begin()
@property
def end(self):
""" Return end of the lastly ending label (upper bound). """
return self.label_tree.end()
@property
def total_length(self):
"""
Return the cumulative length of all labels
(Number of characters).
"""
return sum(label.length for label in self.labels)
#
# Alteration
#
[docs] def add(self, label):
"""
Add a label to the end of the list.
Args:
label (Label): The label to add.
"""
label.label_list = self
self.label_tree.addi(label.start, label.end, label)
[docs] def addl(self, value, start=0.0, end=float('inf')):
""" Shortcut for ``add(Label(value, start, end))``. """
self.add(Label(value, start=start, end=end))
[docs] def update(self, labels):
"""
Add a list of labels to the end of the list.
Args:
labels (list): Labels to add.
"""
ivs = []
for label in labels:
label.label_list = self
ivs.append(intervaltree.Interval(label.start, label.end, label))
self.label_tree.update(ivs)
[docs] def apply(self, fn):
"""
Apply the given function `fn` to every label in this label list.
`fn` is a function of one argument that receives the current label
which can then be edited in place.
Args:
fn (func): Function to apply to every label
Example:
>>> ll = LabelList(labels=[
... Label('a_label', 1.0, 2.0),
... Label('another_label', 2.0, 3.0)
... ])
>>> def shift_labels(label):
... label.start += 1.0
... label.end += 1.0
...
>>> ll.apply(shift_labels)
>>> ll.labels
[Label(a_label, 2.0, 3.0), Label(another_label, 3.0, 4.0)]
"""
for label in self.labels:
fn(label)
[docs] def merge_overlaps(self, threshold=0.0):
"""
Merge overlapping labels with the same value.
Two labels are considered overlapping,
if ``l2.start - l1.end < threshold``.
Args:
threshold (float): Maximal distance between two labels
to be considered as overlapping.
(default: 0.0)
Example:
>>> ll = LabelList(labels=[
... Label('a_label', 1.0, 2.0),
... Label('a_label', 1.5, 2.7),
... Label('b_label', 1.0, 2.0),
... ])
>>> ll.merge_overlapping_labels()
>>> ll.labels
[
Label('a_label', 1.0, 2.7),
Label('b_label', 1.0, 2.0),
]
"""
updated_labels = []
all_intervals = self.label_tree.copy()
# recursivly find a group of overlapping labels with the same value
def recursive_overlaps(interval):
range_start = interval.begin - threshold
range_end = interval.end + threshold
direct_overlaps = all_intervals.overlap(range_start, range_end)
all_overlaps = [interval]
all_intervals.discard(interval)
for overlap in direct_overlaps:
if overlap.data.value == interval.data.value:
all_overlaps.extend(recursive_overlaps(overlap))
return all_overlaps
# For every remaining interval
# - Find overlapping intervals recursively
# - Remove them
# - Create a concatenated new label
while not all_intervals.is_empty():
next_interval = list(all_intervals)[0]
overlapping = recursive_overlaps(next_interval)
ov_start = float('inf')
ov_end = 0.0
ov_value = next_interval.data.value
for overlap in overlapping:
ov_start = min(ov_start, overlap.begin)
ov_end = max(ov_end, overlap.end)
all_intervals.discard(overlap)
updated_labels.append(Label(
ov_value,
ov_start,
ov_end
))
# Replace the old labels with the updated ones
self.label_tree.clear()
self.update(updated_labels)
#
# Statistics
#
[docs] def label_total_duration(self):
"""
Return for each distinct label value the total duration of
all occurrences.
Returns:
dict: A dictionary containing for every label-value (key)
the total duration in seconds (value).
Example:
>>> ll = LabelList(labels=[
>>> Label('a', 3, 5),
>>> Label('b', 5, 8),
>>> Label('a', 8, 10),
>>> Label('b', 10, 14),
>>> Label('a', 15, 18.5)
>>> ])
>>> ll.label_total_duration()
{'a': 7.5 'b': 7.0}
"""
durations = collections.defaultdict(float)
for label in self:
durations[label.value] += label.duration
return durations
[docs] def label_values(self):
"""
Return a list of all occuring label values.
Returns:
list: Lexicographically sorted list (str) of label values.
Example:
>>> ll = LabelList(labels=[
>>> Label('a', 3.2, 4.5),
>>> Label('b', 5.1, 8.9),
>>> Label('c', 7.2, 10.5),
>>> Label('d', 10.5, 14),
>>> Label('d', 15, 18)
>>> ])
>>> ll.label_values()
['a', 'b', 'c', 'd']
"""
all_labels = {l.value for l in self}
return sorted(all_labels)
[docs] def label_count(self):
"""
Return for each label the number of occurrences within the list.
Returns:
dict: A dictionary containing for every label-value (key)
the number of occurrences (value).
Example:
>>> ll = LabelList(labels=[
>>> Label('a', 3.2, 4.5),
>>> Label('b', 5.1, 8.9),
>>> Label('a', 7.2, 10.5),
>>> Label('b', 10.5, 14),
>>> Label('a', 15, 18)
>>> ])
>>> ll.label_count()
{'a': 3 'b': 2}
"""
occurrences = collections.defaultdict(int)
for label in self:
occurrences[label.value] += 1
return occurrences
[docs] def all_tokens(self, delimiter=' '):
"""
Return a list of all tokens occurring in the label-list.
Args:
delimiter (str): The delimiter used to split labels into tokens.
See :meth:`audiomate.annotations.Label.tokenized`
Returns:
:class:`set`: A set of distinct tokens.
"""
tokens = set()
for label in self:
tokens = tokens.union(set(label.tokenized(delimiter=delimiter)))
return tokens
#
# Query Label Values
#
[docs] def join(self, delimiter=' ', overlap_threshold=0.1):
"""
Return a string with all labels concatenated together.
The order of the labels is defined by the start of the label.
If the overlapping between two labels is greater than
``overlap_threshold``, an Exception is thrown.
Args:
delimiter (str): A string to join two consecutive labels.
overlap_threshold (float): Maximum overlap between two
consecutive labels.
Returns:
str: A string with all labels concatenated together.
Example:
>>> ll = LabelList(idx='some', labels=[
>>> Label('a', start=0, end=4),
>>> Label('b', start=3.95, end=6.0),
>>> Label('c', start=7.0, end=10.2),
>>> Label('d', start=10.3, end=14.0)
>>> ])
>>> ll.join(' - ')
'a - b - c - d'
"""
sorted_by_start = sorted(self.labels)
concat_values = []
last_label_end = None
for label in sorted_by_start:
if last_label_end is None or (last_label_end - label.start < overlap_threshold and last_label_end > 0):
concat_values.append(label.value)
last_label_end = label.end
else:
raise ValueError('Labels overlap, not able to define the correct order')
return delimiter.join(concat_values)
[docs] def tokenized(self, delimiter=' ', overlap_threshold=0.1):
"""
Return a ordered list of tokens based on all labels.
Joins all token from all labels (``label.tokenized()```).
If the overlapping between two labels is greater than
``overlap_threshold``, an Exception is thrown.
Args:
delimiter (str): The delimiter used to split labels into tokens.
(default: space)
overlap_threshold (float): Maximum overlap between two
consecutive labels.
Returns:
str: A list containing tokens of all labels ordered according
to the label order.
Example:
>>> ll = LabelList(idx='some', labels=[
>>> Label('a d q', start=0, end=4),
>>> Label('b', start=3.95, end=6.0),
>>> Label('c a', start=7.0, end=10.2),
>>> Label('f g', start=10.3, end=14.0)
>>> ])
>>> ll.tokenized(delimiter=' ', overlap_threshold=0.1)
['a', 'd', 'q', 'b', 'c', 'a', 'f', 'g']
"""
sorted_by_start = sorted(self.labels)
tokens = []
last_label_end = None
for label in sorted_by_start:
if last_label_end is None or (last_label_end - label.start < overlap_threshold and last_label_end > 0):
tokens.extend(label.tokenized(delimiter=delimiter))
last_label_end = label.end
else:
raise ValueError('Labels overlap, not able to define the correct order')
return tokens
#
# Restructuring
#
[docs] def separated(self):
"""
Create a separate Label-List for every distinct label-value.
Returns:
dict: A dictionary with distinct label-values as keys. Every value
is a LabelList containing only labels with the same value.
Example:
>>> ll = LabelList(idx='some', labels=[
>>> Label('a', start=0, end=4),
>>> Label('b', start=3.95, end=6.0),
>>> Label('a', start=7.0, end=10.2),
>>> Label('b', start=10.3, end=14.0)
>>> ])
>>> s = ll.separate()
>>> s['a'].labels
[Label('a', start=0, end=4), Label('a', start=7.0, end=10.2)]
>>> s['b'].labels
[Label('b', start=3.95, end=6.0), Label('b', start=10.3, end=14.0)]
"""
separated_lls = collections.defaultdict(LabelList)
for label in self.labels:
separated_lls[label.value].add(label)
for ll in separated_lls.values():
ll.idx = self.idx
return separated_lls
[docs] def labels_in_range(self, start, end, fully_included=False):
"""
Return a list of labels, that are within the given range.
Also labels that only overlap are included.
Args:
start(float): Start-time in seconds.
end(float): End-time in seconds.
fully_included(bool): If ``True``, only labels fully included
in the range are returned. Otherwise
also overlapping ones are returned.
(default ``False``)
Returns:
list: List of labels in the range.
Example:
>>> ll = LabelList(labels=[
>>> Label('a', 3.2, 4.5),
>>> Label('b', 5.1, 8.9),
>>> Label('c', 7.2, 10.5),
>>> Label('d', 10.5, 14)
>>>])
>>> ll.labels_in_range(6.2, 10.1)
[Label('b', 5.1, 8.9), Label('c', 7.2, 10.5)]
"""
if fully_included:
intervals = self.label_tree.envelop(start, end)
else:
intervals = self.label_tree.overlap(start, end)
return [iv.data for iv in intervals]
[docs] def ranges(self, yield_ranges_without_labels=False, include_labels=None):
"""
Generate all ranges of the label-list. A range is defined
as a part of the label-list for which the same labels are defined.
Args:
yield_ranges_without_labels(bool): If True also yields ranges for
which no labels are defined.
include_labels(list): If not empty, only the label values in
the list will be considered.
Returns:
generator: A generator which yields one range
(tuple start/end/list-of-labels) at a time.
Example:
>>> ll = LabelList(labels=[
>>> Label('a', 3.2, 4.5),
>>> Label('b', 5.1, 8.9),
>>> Label('c', 7.2, 10.5),
>>> Label('d', 10.5, 14)
>>>])
>>> ranges = ll.ranges()
>>> next(ranges)
(3.2, 4.5, [ < audiomate.annotations.Label at 0x1090527c8 > ])
>>> next(ranges)
(4.5, 5.1, [])
>>> next(ranges)
(5.1, 7.2, [ < audiomate.annotations.label.Label at 0x1090484c8 > ])
"""
tree_copy = self.label_tree.copy()
# Remove labels not included
if include_labels is not None:
for iv in list(tree_copy):
if iv.data.value not in include_labels:
tree_copy.remove(iv)
def reduce(x, y):
x.append(y)
return x
# Split labels when overlapping and merge equal ranges to a list of labels
tree_copy.split_overlaps()
tree_copy.merge_equals(data_reducer=reduce, data_initializer=[])
intervals = sorted(tree_copy)
last_end = intervals[0].begin
# yield range by range
for iv in intervals:
# yield an empty range if necessary
if yield_ranges_without_labels and iv.begin > last_end:
yield (last_end, iv.begin, [])
yield (iv.begin, iv.end, iv.data)
last_end = iv.end
[docs] def split(self, cutting_points, shift_times=False, overlap=0.0):
"""
Split the label-list into x parts and return them as new label-lists.
x is defined by the number of cutting-points
(``x == len(cutting_points) + 1``).
The result is a list of label-lists corresponding to each part.
Label-list 0 contains labels between ``0`` and ``cutting_points[0]``.
Label-list 1 contains labels between ``cutting_points[0]`` and
``cutting_points[1]``. And so on.
Args:
cutting_points(list): List of floats defining the points in seconds,
where the label-list is splitted.
shift_times(bool): If True, start and end-time are shifted in
splitted label-lists. So the start is relative
to the cutting point and not to the beginning
of the original label-list.
overlap(float): Amount of overlap in seconds. This amount is
subtracted from a start-cutting-point, and added
to a end-cutting-point.
Returns:
list: A list of of: class: `audiomate.annotations.LabelList`.
Example:
>>> ll = LabelList(labels=[
>>> Label('a', 0, 5),
>>> Label('b', 5, 10),
>>> Label('c', 11, 15),
>>>])
>>>
>>> res = ll.split([4.1, 8.9, 12.0])
>>> len(res)
4
>>> res[0].labels
[Label('a', 0.0, 4.1)]
>>> res[1].labels
[
Label('a', 4.1, 5.0),
Label('b', 5.0, 8.9)
]
>>> res[2].labels
[
Label('b', 8.9, 10.0),
Label('c', 11.0, 12.0)
]
>>> res[3].labels
[Label('c', 12.0, 15.0)]
If ``shift_times = True``, the times are adjusted to be relative
to the cutting-points for every label-list but the first.
>>> ll = LabelList(labels=[
>>> Label('a', 0, 5),
>>> Label('b', 5, 10),
>>>])
>>>
>>> res = ll.split([4.6])
>>> len(res)
4
>>> res[0].labels
[Label('a', 0.0, 4.6)]
>>> res[1].labels
[
Label('a', 0.0, 0.4),
Label('b', 0.4, 5.4)
]
"""
if len(cutting_points) == 0:
raise ValueError('At least one cutting-point is needed!')
# we have to loop in sorted order
cutting_points = sorted(cutting_points)
splits = []
iv_start = 0.0
for i in range(len(cutting_points) + 1):
if i < len(cutting_points):
iv_end = cutting_points[i]
else:
iv_end = float('inf')
# get all intervals intersecting range
intervals = self.label_tree.overlap(
iv_start - overlap,
iv_end + overlap
)
cp_splits = LabelList(idx=self.idx)
# Extract labels from intervals with updated times
for iv in intervals:
label = copy.deepcopy(iv.data)
label.start = max(0, iv_start - overlap, label.start)
label.end = min(iv_end + overlap, label.end)
if shift_times:
orig_start = max(0, iv_start - overlap)
label.start -= orig_start
label.end -= orig_start
cp_splits.add(label)
splits.append(cp_splits)
iv_start = iv_end
return splits
#
# Convenience Constructors
#
[docs] @classmethod
def create_single(cls, value, idx='default'):
"""
Create a label-list with a single label
containing the given value.
"""
return LabelList(idx=idx, labels=[
Label(value=value)
])
[docs] @classmethod
def with_label_values(cls, values, idx='default'):
"""
Create a new label-list containing labels with the given values.
All labels will have default start/end values of 0 and ``inf``.
Args:
values(list): List of values(str) that should be created and
appended to the label-list.
idx(str): The idx of the label-list.
Returns:
(LabelList): New label-list.
Example:
>>> ll = LabelList.with_label_values(['a', 'x', 'z'], idx='letters')
>>> ll.idx
'letters'
>>> ll.labels
[
Label('a', 0, inf),
Label('x', 0, inf),
Label('z', 0, inf),
]
"""
ll = LabelList(idx=idx)
for label_value in values:
ll.add(Label(label_value))
return ll