import functools
import multiprocessing
from audiomate import corpus
from audiomate import logutil
from . import base
logger = logutil.getLogger()
class UtteranceTranscriptionRatioValidator(base.Validator):
    """
    Checks if the ratio between utterance-duration and transcription-length
    is below a given ratio. This is used to find utterances where the speech
    transcription is too long for a given utterance, meaning too many
    characters per second.

    Args:
        max_characters_per_second (int): If char/sec of an utterance is
                                         higher than this it is returned.
        label_list_idx (str): The label-list to use for validation.
        num_threads (int): Number of threads to use.
    """

    def __init__(self, max_characters_per_second=10, label_list_idx=corpus.LL_WORD_TRANSCRIPT,
                 num_threads=1):
        self.max_characters_per_second = max_characters_per_second
        self.label_list_idx = label_list_idx
        self.num_threads = num_threads

    def name(self):
        return 'Utterance-Transcription-Ratio ({})'.format(self.label_list_idx)

    def validate(self, corpus_to_validate):
        """
        Perform the validation on the given corpus.

        Args:
            corpus_to_validate (Corpus): The corpus to test/validate.

        Returns:
            InvalidItemsResult: Validation result.
        """
        # Compute char/sec for all utterances in parallel. A thread pool is
        # sufficient since the work is dominated by I/O (reading durations).
        with multiprocessing.pool.ThreadPool(self.num_threads) as p:
            func = functools.partial(
                self.ratio_of_utterance,
                ll_idx=self.label_list_idx
            )
            result = list(logger.progress(
                p.imap(
                    func,
                    list(corpus_to_validate.utterances.values())
                ),
                total=corpus_to_validate.num_utterances,
                description='Validate character ratio'
            ))

        invalid_utterances = {}

        for utt_idx, char_per_sec in result:
            # A float means the ratio was computed successfully; anything
            # else is the error message of a failed computation and is
            # reported as invalid as-is.
            if isinstance(char_per_sec, float):
                if char_per_sec > self.max_characters_per_second:
                    invalid_utterances[utt_idx] = char_per_sec
            else:
                invalid_utterances[utt_idx] = char_per_sec

        passed = len(invalid_utterances) == 0
        info = {
            'Threshold max. characters per second': str(self.max_characters_per_second),
            'Label-List ID': self.label_list_idx
        }

        return base.InvalidItemsResult(passed, invalid_utterances, name=self.name(), info=info)

    def ratio_of_utterance(self, utterance, ll_idx):
        """
        Compute characters-per-second for a single utterance.

        Returns:
            tuple: ``(utterance-idx, char/sec as float)`` on success, or
            ``(utterance-idx, error-message as str)`` on failure.
        """
        try:
            duration = utterance.duration
            ll = utterance.label_lists[ll_idx]

            # We count the characters of all labels, ignoring whitespace.
            transcription = ' '.join(l.value for l in ll)
            num_chars = len(transcription.replace(' ', ''))
            char_per_sec = num_chars / duration

            return utterance.idx, char_per_sec

        # skipcq: PYL-W0703
        # We want to check all utterances and just skip ones failing for another reason.
        # We should try to figure out all specific exceptions that could fail this.
        except Exception as e:
            return utterance.idx, str(e)
class LabelCountValidator(base.Validator):
    """
    Checks if every utterance contains a label-list with the given id
    and has at least `min_number_of_labels`.

    Args:
        min_number_of_labels (int): Minimum number of expected labels.
        label_list_idx (str): The label-list to use for validation.
    """

    def __init__(self, min_number_of_labels=1, label_list_idx=corpus.LL_WORD_TRANSCRIPT):
        self.min_number_of_labels = min_number_of_labels
        self.label_list_idx = label_list_idx

    def name(self):
        return 'Label-Count ({})'.format(self.label_list_idx)

    def validate(self, corpus_to_validate):
        """
        Perform the validation on the given corpus.

        Args:
            corpus_to_validate (Corpus): The corpus to test/validate.

        Returns:
            InvalidItemsResult: Validation result.
        """
        invalid_utterances = {}

        for utterance in corpus_to_validate.utterances.values():
            # Membership test directly on the mapping (no redundant .keys()).
            if self.label_list_idx in utterance.label_lists:
                ll = utterance.label_lists[self.label_list_idx]

                if len(ll) < self.min_number_of_labels:
                    invalid_utterances[utterance.idx] = 'Only {} labels'.format(len(ll))
            else:
                invalid_utterances[utterance.idx] = 'No label-list {}'.format(self.label_list_idx)

        passed = len(invalid_utterances) == 0
        info = {
            'Min. number of labels': str(self.min_number_of_labels),
            'Label-List ID': self.label_list_idx
        }

        return base.InvalidItemsResult(passed, invalid_utterances, name=self.name(), info=info)
class LabelCoverageValidationResult(base.ValidationResult):
    """
    Result of the :class:`LabelCoverageValidator`.

    Args:
        passed (bool): A boolean indicating, if the validation has
                       passed (``True``) or failed (``False``).
        uncovered_segments (dict): A dictionary containing a list of uncovered
                                   segments for every utterance.
        name (str): The name of the validator, that produced the result.
        info (dict): Dictionary containing key/value string-pairs with detailed
                     information of the validation. For example id of the
                     label-list that was validated.
    """

    def __init__(self, passed, uncovered_segments, name, info=None):
        super(LabelCoverageValidationResult, self).__init__(passed, name=name, info=info)
        self.uncovered_segments = uncovered_segments

    def get_report(self):
        """
        Return a string containing a report of the result.
        This can used to print or save to a text file.

        Returns:
            str: String containing infos about the result
        """
        report_lines = [super(LabelCoverageValidationResult, self).get_report()]

        if len(self.uncovered_segments) > 0:
            report_lines.append('\nUncovered segments:')

            for utt_idx, utt_segments in self.uncovered_segments.items():
                if len(utt_segments) > 0:
                    report_lines.append('\n{}'.format(utt_idx))

                    # List the segments of an utterance ordered by start time.
                    for segment in sorted(utt_segments, key=lambda seg: seg[0]):
                        report_lines.append(
                            ' * {:10.2f} - {:10.2f}'.format(segment[0], segment[1])
                        )

        return '\n'.join(report_lines)
class LabelCoverageValidator(base.Validator):
    """
    Check if every portion of the utterance is covered with at least one label.
    The validator returns segments (start, end) of an utterance,
    where no label is defined within the given label-list.

    Args:
        label_list_idx (str): The idx of the label-list to check.
        threshold (float): A threshold for the length of a segment
                           to be considered as uncovered.
    """

    def __init__(self, label_list_idx, threshold=0.01):
        self.label_list_idx = label_list_idx
        self.threshold = threshold

    def name(self):
        return 'Label-Coverage ({})'.format(self.label_list_idx)

    def validate(self, corpus_to_validate):
        """
        Perform the validation on the given corpus.

        Args:
            corpus_to_validate (Corpus): The corpus to test/validate.

        Returns:
            LabelCoverageValidationResult: Validation result.
        """
        uncovered_segments = {}

        for utterance in corpus_to_validate.utterances.values():
            utt_segments = self.validate_utterance(utterance)

            if len(utt_segments) > 0:
                uncovered_segments[utterance.idx] = utt_segments

        passed = len(uncovered_segments) == 0
        info = {
            'Label-List ID': self.label_list_idx,
            'Threshold': str(self.threshold)
        }

        return LabelCoverageValidationResult(passed, uncovered_segments, self.name(), info)

    def validate_utterance(self, utterance):
        """
        Validate the given utterance and
        return a list of uncovered segments (start, end).
        """
        uncovered_segments = []

        if self.label_list_idx in utterance.label_lists:
            start = 0
            end = utterance.duration
            ll = utterance.label_lists[self.label_list_idx]
            ranges = list(ll.ranges(yield_ranges_without_labels=True))

            # Guard: a label-list without any ranges (e.g. no labels at all)
            # covers nothing, so the whole utterance is uncovered. Without
            # this guard the ranges[0]/ranges[-1] accesses below raise
            # an IndexError.
            if len(ranges) == 0:
                if end - start > self.threshold:
                    uncovered_segments.append((start, end))
                return uncovered_segments

            # Check coverage at start
            if ranges[0][0] - start > self.threshold:
                uncovered_segments.append((start, ranges[0][0]))

            # Check for empty ranges (ranges without any label)
            for ll_range in ranges:
                if len(ll_range[2]) == 0 and ll_range[1] - ll_range[0] > self.threshold:
                    uncovered_segments.append((ll_range[0], ll_range[1]))

            # Check coverage at end. NOTE(review): a non-positive range end
            # presumably marks "until the end of the utterance", in which
            # case the end is always covered — confirm against ll.ranges().
            if ranges[-1][1] > 0 and end - ranges[-1][1] > self.threshold:
                uncovered_segments.append((ranges[-1][1], end))
        else:
            # No label-list at all: the full utterance is uncovered.
            uncovered_segments.append((utterance.start, utterance.end))

        return uncovered_segments
class LabelOverflowValidationResult(base.ValidationResult):
    """
    Result of the :class:`LabelOverflowValidator`.

    Args:
        passed (bool): A boolean indicating, if the validation has
                       passed (``True``) or failed (``False``).
        overflow_segments (dict): A dictionary containing a list of
                                  overflowing segments for every utterance.
        name (str): The name of the validator, that produced the result.
        info (dict): Dictionary containing key/value string-pairs with
                     detailed information of the validation. For example
                     id of the label-list that was validated.
    """

    def __init__(self, passed, overflow_segments, name, info=None):
        super(LabelOverflowValidationResult, self).__init__(passed, name=name, info=info)
        self.overflow_segments = overflow_segments

    def get_report(self):
        """
        Return a string containing a report of the result.
        This can used to print or save to a text file.

        Returns:
            str: String containing infos about the result
        """
        report_lines = [super(LabelOverflowValidationResult, self).get_report()]

        if len(self.overflow_segments) > 0:
            report_lines.append('\nSegments outside of the utterance:')

            for utt_idx, utt_segments in self.overflow_segments.items():
                if len(utt_segments) > 0:
                    report_lines.append('\n{}'.format(utt_idx))

                    # List the segments of an utterance ordered by start time.
                    for segment in sorted(utt_segments, key=lambda seg: seg[0]):
                        report_lines.append(
                            ' * {:10.2f} - {:10.2f} : {}'.format(segment[0], segment[1], segment[2])
                        )

        return '\n'.join(report_lines)
class LabelOverflowValidator(base.Validator):
    """
    Check if all labels are within the boundaries of an utterance.
    Finds all segments of labels that lie outside of an utterance.

    Args:
        label_list_idx (str): The idx of the label-list to check.
        threshold (float): A threshold for a time distance to be
                           considered for an overflow.
    """

    def __init__(self, label_list_idx, threshold=0.01):
        self.label_list_idx = label_list_idx
        self.threshold = threshold

    def name(self):
        return 'Label-Overflow ({})'.format(self.label_list_idx)

    def validate(self, corpus_to_validate):
        """
        Perform the validation on the given corpus.

        Args:
            corpus_to_validate (Corpus): The corpus to test/validate.

        Returns:
            LabelOverflowValidationResult: Validation result.
        """
        overflow_segments = {}

        for utterance in corpus_to_validate.utterances.values():
            segments = self.validate_utterance(utterance)

            if segments:
                overflow_segments[utterance.idx] = segments

        passed = len(overflow_segments) == 0
        info = {
            'Label-List ID': self.label_list_idx,
            'Threshold': str(self.threshold)
        }

        return LabelOverflowValidationResult(passed, overflow_segments, self.name(), info)

    def validate_utterance(self, utterance):
        """
        Validate the given utterance and return a list of
        segments (start, end, label-value), that are outside of the utterance.
        """
        segments_outside = []

        if self.label_list_idx in utterance.label_lists:
            label_list = utterance.label_lists[self.label_list_idx]
            utt_start = 0
            utt_end = utterance.duration
            inf = float('inf')

            for label in label_list:
                # Does a part of the label lie before the utterance start?
                if utt_start - label.start > self.threshold:
                    effective_end = utt_end if label.end == inf else label.end
                    segments_outside.append(
                        (label.start, min(utt_start, effective_end), label.value)
                    )

                # Does a part of the label lie after the utterance end?
                # A label end of inf means "until the end", so it can
                # never overflow at the end.
                if label.end != inf and label.end - utt_end > self.threshold:
                    segments_outside.append(
                        (max(utt_end, label.start), label.end, label.value)
                    )

        return segments_outside