Source code for audiomate.corpus.io.common_voice

import os
import glob

import audiomate
from audiomate import annotations
from audiomate import issuers
from audiomate.corpus import subset
from audiomate.utils import textfile
from . import base


# Utterance ids that the reader skips: ``create_assets_if_needed`` returns
# ``None`` for any entry whose file name (without extension) is listed here.
# All ids share the ``common_voice_de_`` prefix, so only the numeric clip
# ids are spelled out below.
INVALID_UTTS = [
    'common_voice_de_{}'.format(clip_number)
    for clip_number in (
        17876483, 17876495, 17876535, 17876537,
        17876539, 17876540, 17876541, 17876542,
        17876543, 17876544, 17876545, 17876546,
        17876547, 17876548, 17876549, 17876550,
        17876551, 17876552, 17876553, 17876554,
        17876555, 17876556, 17876557, 17876304,
    )
]


class CommonVoiceReader(base.CorpusReader):
    """
    Reader for the Common Voice Corpus.

    Every ``*.tsv`` file found directly in the corpus folder (except
    ``invalidated.tsv``) is loaded as a subview named after the file.
    Audio files are referenced relative to the ``clips`` sub-folder.

    .. seealso::

       `Common-Voice <https://voice.mozilla.org/>`_
          Project Page
    """

    @classmethod
    def type(cls):
        """ Return the identifier of this reader (``'common-voice'``). """
        return 'common-voice'

    def _check_for_missing_files(self, path):
        # Always an empty list: this reader does not report any file
        # as missing up-front.
        return []

    def _load(self, path):
        # Build one corpus and import every subset found in ``path``
        # as a subview of it.
        corpus = audiomate.Corpus(path=path)

        for subset_idx in CommonVoiceReader.get_subset_ids(path):
            CommonVoiceReader.load_subset(corpus, path, subset_idx)

        return corpus

    @staticmethod
    def get_subset_ids(path):
        """
        Return a list with ids of all available subsets
        (based on existing tsv-files).

        Args:
            path (str): Folder containing the ``*.tsv`` files.

        Returns:
            list: Sorted basenames (without extension) of all tsv-files
            directly in ``path``, except ``invalidated``.
        """
        subset_ids = []

        for tsv_path in glob.glob(os.path.join(path, '*.tsv')):
            basename = os.path.splitext(os.path.basename(tsv_path))[0]

            # We don't want to include the invalidated files,
            # since they may contain corrupt files.
            if basename != 'invalidated':
                subset_ids.append(basename)

        # glob returns matches in arbitrary order; sort so that the
        # subsets are always loaded in the same order.
        return sorted(subset_ids)

    @staticmethod
    def load_subset(corpus, path, subset_idx):
        """
        Load subset into corpus.

        Reads ``<subset_idx>.tsv`` from ``path``, creates the assets of
        every entry that is not yet part of the corpus and imports a
        subview named ``subset_idx`` containing the subset's utterances.

        Args:
            corpus: Corpus the assets and the subview are added to.
            path (str): Folder containing the tsv-file and the clips.
            subset_idx (str): Name of the subset (tsv-file basename).
        """
        tsv_file = os.path.join(path, '{}.tsv'.format(subset_idx))

        # The header line is skipped via its first column name.
        entries = textfile.read_separated_lines_generator(
            tsv_file,
            separator='\t',
            max_columns=8,
            ignore_lines_starting_with=['client_id'],
            keep_empty=True
        )

        subset_utt_ids = set()

        for entry in entries:
            utt_idx = CommonVoiceReader.create_assets_if_needed(
                corpus,
                path,
                entry
            )

            # ``None`` marks an utterance that is deliberately skipped.
            if utt_idx is not None:
                subset_utt_ids.add(utt_idx)

        utt_filter = subset.MatchingUtteranceIdxFilter(
            utterance_idxs=subset_utt_ids
        )
        subview = subset.Subview(corpus, filter_criteria=[utt_filter])
        corpus.import_subview(subset_idx, subview)

    @staticmethod
    def create_assets_if_needed(corpus, path, entry):
        """
        Create File/Utterance/Issuer, if they do not already exist and
        return utt-idx.

        Args:
            corpus: Corpus the assets are added to.
            path (str): Corpus folder; audio is looked up in
                ``<path>/clips``.
            entry (list): One record of a tsv-file. Column 0 is used as
                speaker-idx, column 1 as file name, column 2 as
                transcription and, if present, column 5 as age and
                column 6 as gender.

        Returns:
            str: The utterance-idx (file name without extension), or
            ``None`` if the utterance is listed in ``INVALID_UTTS``.
        """
        file_name = entry[1]
        file_idx, _ = os.path.splitext(file_name)

        if file_idx in INVALID_UTTS:
            return None

        # The same clip can be listed in several tsv-files;
        # its assets are only created once.
        if file_idx in corpus.utterances:
            return file_idx

        speaker_idx = entry[0]
        transcription = entry[2]

        file_path = os.path.join(path, 'clips', file_name)
        corpus.new_file(file_path, file_idx)

        if speaker_idx in corpus.issuers:
            issuer = corpus.issuers[speaker_idx]
        else:
            # Age and gender are only needed for a new speaker.
            # Short records simply lack these columns.
            if len(entry) >= 6:
                age = CommonVoiceReader.map_age(entry[5])
            else:
                age = issuers.AgeGroup.UNKNOWN

            if len(entry) >= 7:
                gender = CommonVoiceReader.map_gender(entry[6])
            else:
                gender = issuers.Gender.UNKNOWN

            issuer = issuers.Speaker(
                speaker_idx,
                gender=gender,
                age_group=age
            )
            corpus.import_issuers(issuer)

        utterance = corpus.new_utterance(file_idx, file_idx, issuer.idx)
        utterance.set_label_list(
            annotations.LabelList.create_single(
                transcription,
                idx=audiomate.corpus.LL_WORD_TRANSCRIPT
            )
        )

        return file_idx

    @staticmethod
    def map_age(age):
        """
        Map age to correct age-group.

        Args:
            age (str): Age value as found in the tsv-file.

        Returns:
            ``AgeGroup.UNKNOWN`` for ``None``/empty string,
            ``AgeGroup.YOUTH`` for ``'teens'``, ``AgeGroup.SENIOR`` for
            ``'sixties'`` and above, ``AgeGroup.ADULT`` for every
            other value.
        """
        if age in (None, ''):
            return issuers.AgeGroup.UNKNOWN

        if age == 'teens':
            return issuers.AgeGroup.YOUTH

        if age in ('sixties', 'seventies', 'eighties', 'nineties'):
            return issuers.AgeGroup.SENIOR

        # Any other non-empty value is treated as adult.
        return issuers.AgeGroup.ADULT

    @staticmethod
    def map_gender(gender):
        """
        Map gender to correct value.

        Args:
            gender (str): Gender value as found in the tsv-file.

        Returns:
            ``Gender.MALE`` for ``'male'``, ``Gender.FEMALE`` for
            ``'female'``, ``Gender.UNKNOWN`` for every other value.
        """
        if gender == 'male':
            return issuers.Gender.MALE

        if gender == 'female':
            return issuers.Gender.FEMALE

        return issuers.Gender.UNKNOWN