import abc
from audiomate.corpus import base
class FilterCriterion(metaclass=abc.ABCMeta):
    """
    Abstract interface of a filter criterion.

    A filter criterion decides whether a given utterance contained
    in a given corpus matches the filter. Concrete criteria have to
    implement all of the methods below.
    """

    @abc.abstractmethod
    def match(self, utterance, corpus):
        """
        Decide whether the filter accepts the given utterance.

        Args:
            utterance (Utterance): The utterance to test.
            corpus (CorpusView): The corpus that contains the utterance.

        Returns:
            bool: True if the filter matches the utterance, False otherwise.
        """
        raise NotImplementedError

    @abc.abstractmethod
    def serialize(self):
        """
        Produce the string form of this criterion for writing to a file.

        The output needs to be a single line without line breaks.

        Returns:
            str: A string representing this filter criterion.
        """
        raise NotImplementedError

    @classmethod
    @abc.abstractmethod
    def parse(cls, representation):
        """
        Build a filter criterion from its string form
        (the counterpart of ``serialize``).

        Args:
            representation (str): The string representation.

        Returns:
            FilterCriterion: The filter criterion from that representation.
        """
        raise NotImplementedError

    @classmethod
    @abc.abstractmethod
    def name(cls):
        """ Return a name identifying this type of filter criterion. """
        return 'unknown'
class MatchingUtteranceIdxFilter(FilterCriterion):
    """
    A filter criterion that matches utterances based on utterance-ids.

    Args:
        utterance_idxs (:class:`set`): Utterance-ids (any iterable, stored
                                       as a set). Only utterances whose id
                                       is contained pass the filter.
        inverse (bool): If True only utterances whose id is *not* contained
                        pass the filter.
    """

    def __init__(self, utterance_idxs, inverse=False):
        self.utterance_idxs = set(utterance_idxs)
        self.inverse = inverse

    def match(self, utterance, corpus):
        """
        Check the id of the utterance against the stored ids.

        Args:
            utterance (Utterance): The utterance to match (only ``idx`` is read).
            corpus (CorpusView): Unused by this criterion.

        Returns:
            bool: Whether the utterance passes, taking ``inverse`` into account.
        """
        contained = utterance.idx in self.utterance_idxs

        # Passes if contained (normal mode) or if not contained (inverse mode).
        return contained != bool(self.inverse)

    def serialize(self):
        """
        Serialize to ``'<include|exclude>,<id>,<id>,...'`` with sorted ids.

        Ids are joined with commas, so they have to be strings and
        an id containing a comma can not be restored by ``parse``.

        Returns:
            str: Single-line representation of this criterion.
        """
        inverse_indication = 'exclude' if self.inverse else 'include'
        id_string = ','.join(sorted(self.utterance_idxs))
        return '{},{}'.format(inverse_indication, id_string)

    @classmethod
    def parse(cls, representation):
        """
        Restore a criterion from the output of ``serialize``.

        Args:
            representation (str): The string representation.

        Returns:
            MatchingUtteranceIdxFilter: The restored criterion.
        """
        items = representation.strip().split(',')
        inverse = items[0] == 'exclude'

        # An empty id set serializes to 'include,' which splits into an
        # empty token; drop such tokens so the round trip yields an empty
        # set again instead of {''}.
        utterance_idxs = {idx for idx in items[1:] if idx}

        return cls(utterance_idxs=utterance_idxs, inverse=inverse)

    @classmethod
    def name(cls):
        """ Return the name under which this criterion is registered. """
        return 'matching_utterance_ids'
class MatchingLabelFilter(FilterCriterion):
    """
    A filter criterion that only accepts utterances
    which only have the given labels.

    Args:
        labels (:class:`set`): A set of labels which are accepted.
        label_list_ids (:class:`set`): Only check label-lists with these ids.
                                       If ``None`` (or empty), checks all
                                       label-lists.
    """

    def __init__(self, labels, label_list_ids=None):
        self.labels = labels or set()
        self.label_list_ids = label_list_ids or set()

    def match(self, utterance, corpus):
        """
        Check that every label in the considered label-lists is accepted.

        Label-lists whose id is not selected by ``label_list_ids`` are
        skipped. An utterance without any label in the considered
        label-lists matches.

        Args:
            utterance (Utterance): The utterance to match.
            corpus (CorpusView): Unused by this criterion.

        Returns:
            bool: False as soon as a label value is not in ``labels``,
            True otherwise.
        """
        for label_list_idx, label_list in utterance.label_lists.items():
            # An empty id set means "no restriction".
            if self.label_list_ids and label_list_idx not in self.label_list_ids:
                continue

            for label in label_list:
                if label.value not in self.labels:
                    return False

        return True

    def serialize(self):
        """
        Serialize to ``'<ll-id>,<ll-id>,...|||<label>,<label>,...'``.

        Both parts are sorted. The label-list part is empty if all
        label-lists are checked.

        Returns:
            str: Single-line representation of this criterion.
        """
        ll_ids = ','.join(sorted(self.label_list_ids))
        labels = ','.join(sorted(self.labels))
        return '{}|||{}'.format(ll_ids, labels)

    @classmethod
    def parse(cls, representation):
        """
        Restore a criterion from the output of ``serialize``.

        A representation without ``|||`` is interpreted as a plain
        list of labels without label-list restriction.

        Args:
            representation (str): The string representation.

        Returns:
            MatchingLabelFilter: The restored criterion.
        """
        parts = representation.strip().split('|||')

        if len(parts) > 1:
            # An unrestricted filter serializes to '|||<labels>'. Splitting
            # the empty left part yields [''], which would restrict the
            # filter to a label-list with id '' and thereby let every
            # utterance pass. Drop empty tokens to restore "no restriction".
            ll_ids = {idx for idx in parts[0].strip().split(',') if idx}
            labels = parts[1].strip().split(',')
        else:
            ll_ids = set()
            labels = parts[0].strip().split(',')

        # NOTE(review): empty label tokens are kept on purpose, since an
        # empty string may be a legitimate label value - confirm.
        return cls(set(labels), ll_ids)

    @classmethod
    def name(cls):
        """ Return the name under which this criterion is registered. """
        return 'matching_labels'
# Registry of the filter criteria defined above, keyed by their name.
# ``__subclasses__()`` only reports direct subclasses of FilterCriterion
# that exist at the time this loop runs.
__filter_criteria = {}

for filter_class in FilterCriterion.__subclasses__():
    __filter_criteria.update({filter_class.name(): filter_class})
class UnknownFilterCriteriaException(Exception):
    """ Raised when a filter-criterion name is not among the available criteria. """
def available_filter_criteria():
    """
    Get a mapping of all available filter criteria.

    The returned object is the module-level registry itself, not a copy.

    Returns:
        dict: A dictionary with filter-criterion classes
        with the name of these criteria as key.

    Example::

        >>> available_filter_criteria()
        {
            "matching_utterance_ids" : subview.MatchingUtteranceIdxFilter,
            "matching_labels" : subview.MatchingLabelFilter
        }
    """
    return __filter_criteria
class Subview(base.CorpusView):
    """
    A subview is a readonly layer representing some subset of a corpus.
    The assets the subview contains are defined by filter criteria.
    Only if an utterance passes all filter criteria it is contained
    in the subview.

    Args:
        corpus (CorpusView): The corpus this subview is based on.
        filter_criteria (list, FilterCriterion): List of
                                                 :py:class:`FilterCriterion`
                                                 (a single criterion is
                                                 wrapped into a list).

    Example::

        >>> filter = subview.MatchingUtteranceIdxFilter(utterance_idxs=(['utt-1', 'utt-3']))
        >>> corpus = audiomate.corpus.load('path/to/corpus')
        >>> corpus.num_utterances
        14
        >>> subset = subview.Subview(corpus, filter_criteria=[filter])
        >>> subset.num_utterances
        2
    """

    def __init__(self, corpus, filter_criteria):
        self.corpus = corpus

        if isinstance(filter_criteria, list):
            self.filter_criteria = filter_criteria
        else:
            self.filter_criteria = [filter_criteria]

        # Result of the last filtering run and the ids of all utterances
        # the parent corpus had at that time (used to validate the cache).
        self._cached_utterances = None
        self._all_utterance_ids = None

    @property
    def name(self):
        """ str: Name derived from the name of the parent corpus. """
        return 'subview of {}'.format(self.corpus.name)

    @property
    def tracks(self):
        """ dict: Tracks (by track-id) referenced by the utterances of this subview. """
        return {utterance.track.idx: utterance.track for utterance in self.utterances.values()}

    @property
    def utterances(self):
        """
        dict: Utterances (by utterance-id) of the parent corpus
        that pass all filter criteria.

        The result is cached and reused as long as the set of utterance-ids
        of the parent corpus is unchanged. Other changes (e.g. to labels or
        to ``filter_criteria``) are not detected by this check.
        """
        parent_utterances = self.corpus.utterances
        current_ids = set(parent_utterances.keys())

        # The cache is only valid if the parent still has exactly the same
        # utterance-ids as when the cache was built. A mere overlap is not
        # enough, since utterances may have been added or removed.
        if self._cached_utterances is not None and current_ids == self._all_utterance_ids:
            return self._cached_utterances

        # ``all`` stops at the first criterion that rejects the utterance.
        filtered_utterances = {
            utt_idx: utterance
            for utt_idx, utterance in parent_utterances.items()
            if all(criterion.match(utterance, self.corpus) for criterion in self.filter_criteria)
        }

        # Cache filtered utterances
        self._cached_utterances = filtered_utterances
        self._all_utterance_ids = current_ids

        return filtered_utterances

    @property
    def issuers(self):
        """ dict: Issuers (by issuer-id) of the utterances of this subview. """
        return {
            utterance.issuer.idx: utterance.issuer
            for utterance in self.utterances.values()
            if utterance.issuer is not None
        }

    @property
    def feature_containers(self):
        """ The feature-containers of the parent corpus, unfiltered. """
        return self.corpus.feature_containers

    def serialize(self):
        """
        Return a string representing the subview
        with all of its filter criteria.

        Every criterion contributes two lines: its name followed
        by its serialized definition.

        Returns:
            str: String with subview definition.
        """
        lines = []

        for criterion in self.filter_criteria:
            lines.append(criterion.name())
            lines.append(criterion.serialize())

        return '\n'.join(lines)

    @classmethod
    def parse(cls, representation, corpus=None):
        """
        Creates a subview from a string representation
        (created with ``self.serialize``).

        Args:
            representation (str): The representation.
            corpus (CorpusView): The corpus the subview is based on.

        Returns:
            Subview: The created subview.

        Raises:
            UnknownFilterCriteriaException: If a criterion name is not
                                            among the available criteria.
            IndexError: If a criterion name is not followed
                        by a definition line.
        """
        # A subview without criteria serializes to an empty string.
        if not representation.strip():
            return cls(corpus, [])

        criteria_definitions = representation.split('\n')

        # Tolerate a trailing newline (e.g. when read from a file). With an
        # odd number of lines a blank last line can not be a definition.
        if len(criteria_definitions) % 2 == 1 and not criteria_definitions[-1].strip():
            criteria_definitions.pop()

        known_criteria = available_filter_criteria()
        criteria = []

        # Lines come in pairs: name, definition.
        for i in range(0, len(criteria_definitions), 2):
            filter_name = criteria_definitions[i]
            filter_repr = criteria_definitions[i + 1]

            if filter_name not in known_criteria:
                raise UnknownFilterCriteriaException('Unknown filter-criterion {}'.format(filter_name))

            criteria.append(known_criteria[filter_name].parse(filter_repr))

        return cls(corpus, criteria)