Source code for audiomate.corpus.io.musan

import os

import audiomate
from audiomate import annotations
from audiomate import issuers
from audiomate.utils import textfile
from . import base
from . import downloader

DOWNLOAD_URL = 'http://www.openslr.org/resources/17/musan.tar.gz'

AUDIO_TYPES_ = ['music', 'noise', 'speech']

ANN_NUM_COLUMS_ = {'music': 4, 'noise': 1, 'speech': 3}

ANN_FILE_NAME_ = 'ANNOTATIONS'


[docs]class MusanDownloader(downloader.ArchiveDownloader):
    """
    Downloader for the MUSAN Corpus.

    Args:
        url (str): The url to download the dataset from. If not given the default URL is used.
                   It is expected to be a tar.gz file.
    """

    def __init__(self, url=None):
        if url is None:
            url = DOWNLOAD_URL

        super(MusanDownloader, self).__init__(
            url,
            move_files_up=True
        )

[docs]    @classmethod
    def type(cls):
        return 'musan'


[docs]class MusanReader(base.CorpusReader):
    """
    Reader for the MUSAN corpus. MUSAN is a corpus of music, speech, and noise recordings.

    .. seealso::

       `MUSAN: A Music, Speech, and Noise Corpus <https://arxiv.org/pdf/1510.08484v1.pdf>`_
          Paper explaining the structure and characteristics of the corpus

       `OpenSLR: MUSAN <http://www.openslr.org/17/>`_
          Download page
    """

[docs]    @classmethod
    def type(cls):
        return 'musan'

    def _check_for_missing_files(self, path):
        # Some label files are missing anyway in the original data set
        # (e.g. speech/us-gov/ANNOTATIONS). What's left would be checking for missing directories.
        return []

    def _load(self, path):
        create_or_get_issuer = {
            'music': self._create_or_get_music_issuer,
            'noise': self._create_or_get_noise_issuer,
            'speech': self._create_or_get_speech_issuer,
        }

        corpus = audiomate.Corpus(path=path)

        for type_name, type_directory in self._directories(path).items():
            for _, source_directory in self._directories(type_directory).items():
                labels_path = os.path.join(source_directory, ANN_FILE_NAME_)
                labels = {}

                if os.path.exists(labels_path):
                    labels = textfile.read_separated_lines_with_first_key(
                        labels_path, separator=' ', max_columns=ANN_NUM_COLUMS_[type_name])

                it = os.scandir(source_directory)

                for entry in it:
                    if not entry.name.endswith('.wav'):
                        continue

                    file_path = os.path.join(source_directory, entry.name)
                    file_idx = entry.name[0:-4]  # chop of .wav
                    utterance_idx = file_idx  # every file is a separate utterance
                    issuer_idx = create_or_get_issuer[type_name](corpus, file_idx, labels)

                    corpus.new_file(file_path, track_idx=file_idx, copy_file=False)
                    utterance = corpus.new_utterance(utterance_idx, file_idx, issuer_idx)
                    utterance.set_label_list(annotations.LabelList.create_single(
                        type_name, idx=audiomate.corpus.LL_DOMAIN))

        return corpus

    @staticmethod
    def _directories(path):
        directories = {}
        it = os.scandir(path)
        for entry in it:
            if not entry.is_dir():
                continue

            directories[entry.name] = os.path.join(path, entry.name)

        return directories

    # noinspection PyUnusedLocal
    @staticmethod
    def _create_or_get_noise_issuer(corpus, file_idx, labels):
        return None

    @staticmethod
    def _create_or_get_music_issuer(corpus, file_idx, labels):
        if file_idx not in labels:
            return None

        issuer_idx = labels[file_idx][2]

        if issuer_idx not in corpus.issuers:
            issuer = issuers.Artist(issuer_idx, name=issuer_idx)
            corpus.import_issuers(issuer)

        return issuer_idx

    @staticmethod
    def _create_or_get_speech_issuer(corpus, file_idx, labels):
        if file_idx not in labels:
            return None

        issuer = issuers.Speaker(file_idx)

        if file_idx in labels:
            if labels[file_idx][0] == 'm':
                issuer.gender = issuers.Gender.MALE
            elif labels[file_idx][0] == 'f':
                issuer.gender = issuers.Gender.FEMALE

        corpus.import_issuers(issuer)

        return file_idx