# Source code for audiomate.corpus.io.default

import collections
import glob
import os
import re
import json

import audiomate
from audiomate.corpus import assets
from audiomate.corpus.subset import subview
from audiomate.utils import textfile
from audiomate.utils import jsonfile
from . import base

# File names of the individual parts of a corpus in the default format.
# Label and subview files are per-key: '<prefix>_<key>.txt'.
FILES_FILE_NAME = 'files.txt'
ISSUER_FILE_NAME = 'issuers.json'
UTTERANCE_FILE_NAME = 'utterances.txt'
UTT_ISSUER_FILE_NAME = 'utt_issuers.txt'
LABEL_FILE_PREFIX = 'labels'
FEAT_CONTAINER_FILE_NAME = 'features.txt'
SUBVIEW_FILE_PREFIX = 'subview'

# Matches a label value with trailing JSON metadata: '<value> [{...}]'.
# Group 1 is the plain label value, group 2 the JSON object.
LABEL_META_REGEX = r'(.*) \[(\{.*\})\]'
META_PATTERN = re.compile(LABEL_META_REGEX)


class DefaultReader(base.CorpusReader):
    """
    Reads corpora in the Default format.
    """

    @classmethod
    def type(cls):
        return 'default'

    def _check_for_missing_files(self, path):
        # Only the file list and the utterance list are mandatory,
        # every other part of the format is optional.
        required = [FILES_FILE_NAME, UTTERANCE_FILE_NAME]
        return [name for name in required
                if not os.path.isfile(os.path.join(path, name))]

    def _load(self, path):
        corpus = audiomate.Corpus(path=path)

        DefaultReader.read_files(os.path.join(path, FILES_FILE_NAME), corpus)
        DefaultReader.read_issuers(os.path.join(path, ISSUER_FILE_NAME), corpus)

        # Issuers have to be known before utterances are created,
        # since every utterance may reference one.
        utt_id_to_issuer = DefaultReader.read_utt_to_issuer_mapping(
            os.path.join(path, UTT_ISSUER_FILE_NAME), corpus)

        DefaultReader.read_utterances(
            os.path.join(path, UTTERANCE_FILE_NAME), corpus, utt_id_to_issuer)

        DefaultReader.read_labels(path, corpus)
        DefaultReader.read_feature_containers(
            os.path.join(path, FEAT_CONTAINER_FILE_NAME), corpus)
        DefaultReader.read_subviews(path, corpus)

        return corpus

    @staticmethod
    def read_files(file_path, corpus):
        """ Add all files listed in the file list to the corpus. Paths are relative to the list. """
        base_path = os.path.dirname(file_path)
        entries = textfile.read_key_value_lines(file_path, separator=' ')

        for idx, rel_path in entries.items():
            corpus.new_file(os.path.join(base_path, rel_path), file_idx=idx, copy_file=False)

    @staticmethod
    def read_issuers(file_path, corpus):
        """ Import issuers from the JSON issuer file, if present. """
        if not os.path.isfile(file_path):
            return

        for idx, entry in jsonfile.read_json_file(file_path).items():
            kind = entry.get('type', None)
            info = entry.get('info', {})

            if kind == 'speaker':
                issuer = assets.Speaker(
                    idx,
                    gender=assets.Gender(entry.get('gender', 'unknown').lower()),
                    age_group=assets.AgeGroup(entry.get('age_group', 'unknown').lower()),
                    native_language=entry.get('native_language', None),
                    info=info)
            elif kind == 'artist':
                issuer = assets.Artist(idx, name=entry.get('name', None), info=info)
            else:
                # Unknown or missing type falls back to a generic issuer.
                issuer = assets.Issuer(idx, info=info)

            corpus.import_issuers(issuer)

    @staticmethod
    def read_utt_to_issuer_mapping(utt_issuer_path, corpus):
        """ Return a dict utterance-idx -> issuer; unknown issuer-ids create new issuers. """
        mapping = {}

        if os.path.isfile(utt_issuer_path):
            pairs = textfile.read_key_value_lines(utt_issuer_path, separator=' ')

            for utt_idx, issuer_idx in pairs.items():
                if issuer_idx in corpus.issuers.keys():
                    mapping[utt_idx] = corpus.issuers[issuer_idx]
                else:
                    mapping[utt_idx] = corpus.new_issuer(issuer_idx=issuer_idx)

        return mapping

    @staticmethod
    def read_utterances(utterance_path, corpus, utt_idx_to_issuer):
        """ Create utterances from the utterance list; start/end columns are optional. """
        records = textfile.read_separated_lines_with_first_key(
            utterance_path, separator=' ', max_columns=4)

        for utt_idx, fields in records.items():
            # Missing start/end default to "whole file" (0 .. -1).
            start = float(fields[1]) if len(fields) > 1 else 0
            end = float(fields[2]) if len(fields) > 2 else -1

            issuer_idx = None
            if utt_idx in utt_idx_to_issuer.keys():
                issuer_idx = utt_idx_to_issuer[utt_idx].idx

            corpus.new_utterance(utt_idx, fields[0],
                                 issuer_idx=issuer_idx, start=start, end=end)

    @staticmethod
    def read_labels(path, corpus):
        """ Load every labels_<key>.txt file into label-lists with idx <key>. """
        pattern = os.path.join(path, '{}_*.txt'.format(LABEL_FILE_PREFIX))

        for label_file in glob.glob(pattern):
            file_name = os.path.basename(label_file)
            key = file_name[len('{}_'.format(LABEL_FILE_PREFIX)):len(file_name) - len('.txt')]

            by_utterance = collections.defaultdict(list)
            rows = textfile.read_separated_lines_generator(
                label_file, separator=' ', max_columns=4)

            for row in rows:
                value = row[3]
                start = float(row[1])
                end = float(row[2])

                # A trailing '[{...}]' holds optional JSON metadata.
                meta = None
                match = META_PATTERN.match(value)
                if match is not None:
                    meta = json.loads(match.group(2))
                    value = match.group(1)

                by_utterance[row[0]].append(assets.Label(value, start, end, meta=meta))

            for utt_idx, label_objs in by_utterance.items():
                ll = assets.LabelList(idx=key, labels=label_objs)
                corpus.utterances[utt_idx].set_label_list(ll)

    @staticmethod
    def read_feature_containers(feat_path, corpus):
        """ Register feature containers listed in the features file, if present. """
        if os.path.isfile(feat_path):
            base_path = os.path.dirname(feat_path)
            containers = textfile.read_key_value_lines(feat_path, separator=' ')

            for name, rel_path in containers.items():
                corpus.new_feature_container(name, path=os.path.join(base_path, rel_path))

    @staticmethod
    def read_subviews(path, corpus):
        """ Parse and import every subview_<name>.txt file. """
        pattern = os.path.join(path, '{}_*.txt'.format(SUBVIEW_FILE_PREFIX))

        for sv_file in glob.glob(pattern):
            file_name = os.path.basename(sv_file)
            key = file_name[len('{}_'.format(SUBVIEW_FILE_PREFIX)):len(file_name) - len('.txt')]

            with open(sv_file, 'r') as f:
                sv = subview.Subview.parse(f.read().strip())

            corpus.import_subview(key, sv)
class DefaultWriter(base.CorpusWriter):
    """
    Writes corpora in the Default format.
    """

    @classmethod
    def type(cls):
        return 'default'

    def _save(self, corpus, path):
        file_path = os.path.join(path, FILES_FILE_NAME)
        issuer_path = os.path.join(path, ISSUER_FILE_NAME)
        utterance_path = os.path.join(path, UTTERANCE_FILE_NAME)
        utt_issuer_path = os.path.join(path, UTT_ISSUER_FILE_NAME)
        container_path = os.path.join(path, FEAT_CONTAINER_FILE_NAME)

        DefaultWriter.write_files(file_path, corpus, path)
        DefaultWriter.write_issuers(issuer_path, corpus)
        DefaultWriter.write_utterances(utterance_path, corpus)
        DefaultWriter.write_utt_to_issuer_mapping(utt_issuer_path, corpus)
        DefaultWriter.write_labels(path, corpus)
        DefaultWriter.write_feature_containers(container_path, corpus)
        DefaultWriter.write_subviews(path, corpus)

    @staticmethod
    def write_files(file_path, corpus, path):
        """ Write the file list; paths are stored relative to the corpus path. """
        file_records = [[file.idx, os.path.relpath(file.path, path)]
                        for file in corpus.files.values()]
        textfile.write_separated_lines(file_path, file_records, separator=' ', sort_by_column=0)

    @staticmethod
    def write_issuers(file_path, corpus):
        """ Write issuer metadata as JSON.

        Fix: the 'type' key is now also written for artists, so that
        DefaultReader (which only restores an Artist when type == 'artist')
        round-trips artists with their name instead of degrading them to
        plain issuers.
        """
        data = {}

        for issuer in corpus.issuers.values():
            issuer_data = {}

            if issuer.info is not None and len(issuer.info) > 0:
                issuer_data['info'] = issuer.info

            # isinstance instead of type() == for idiomatic, subclass-safe checks.
            if isinstance(issuer, assets.Speaker):
                issuer_data['type'] = 'speaker'

                # Unknown values are the defaults on read, so they are omitted.
                if issuer.gender != assets.Gender.UNKNOWN:
                    issuer_data['gender'] = issuer.gender.value

                if issuer.age_group != assets.AgeGroup.UNKNOWN:
                    issuer_data['age_group'] = issuer.age_group.value

                if issuer.native_language not in ['', None]:
                    issuer_data['native_language'] = issuer.native_language

            elif isinstance(issuer, assets.Artist):
                # Previously the type was omitted here, losing the artist on read-back.
                issuer_data['type'] = 'artist'

                if issuer.name not in ['', None]:
                    issuer_data['name'] = issuer.name

            data[issuer.idx] = issuer_data

        jsonfile.write_json_to_file(file_path, data)

    @staticmethod
    def write_utterances(utterance_path, corpus):
        """ Write one line per utterance: idx, file-idx, start, end. """
        utterance_records = {utterance.idx: [utterance.file.idx, utterance.start, utterance.end]
                             for utterance in corpus.utterances.values()}
        textfile.write_separated_lines(utterance_path, utterance_records,
                                       separator=' ', sort_by_column=0)

    @staticmethod
    def write_utt_to_issuer_mapping(utt_issuer_path, corpus):
        """ Write the utterance-idx -> issuer-idx mapping; utterances without issuer are skipped. """
        utt_issuer_records = {}

        for utterance in corpus.utterances.values():
            if utterance.issuer is not None:
                utt_issuer_records[utterance.idx] = utterance.issuer.idx

        textfile.write_separated_lines(utt_issuer_path, utt_issuer_records,
                                       separator=' ', sort_by_column=0)

    @staticmethod
    def write_labels(path, corpus):
        """ Write one labels_<key>.txt file per label-list key.

        Label metadata (if any) is appended to the value as '<value> [<json>]',
        matching LABEL_META_REGEX on the read side.
        """
        records = collections.defaultdict(list)

        for utterance in corpus.utterances.values():
            for label_list_idx, label_list in utterance.label_lists.items():
                utt_records = []

                for l in label_list:
                    if len(l.meta) > 0:
                        # sort_keys keeps the serialized metadata deterministic.
                        value = '{} [{}]'.format(l.value, json.dumps(l.meta, sort_keys=True))
                        utt_records.append((utterance.idx, l.start, l.end, value))
                    else:
                        utt_records.append((utterance.idx, l.start, l.end, l.value))

                records[label_list_idx].extend(utt_records)

        for label_list_idx, label_list_records in records.items():
            file_path = os.path.join(path, '{}_{}.txt'.format(LABEL_FILE_PREFIX, label_list_idx))
            textfile.write_separated_lines(file_path, label_list_records, separator=' ')

    @staticmethod
    def write_feature_containers(container_path, corpus):
        """ Write one line per feature container: name and path. """
        feat_records = [(idx, container.path)
                        for idx, container in corpus.feature_containers.items()]
        textfile.write_separated_lines(container_path, feat_records, separator=' ')

    @staticmethod
    def write_subviews(path, corpus):
        """ Serialize each subview into its own subview_<name>.txt file. """
        for name, sv in corpus.subviews.items():
            sv_path = os.path.join(path, '{}_{}.txt'.format(SUBVIEW_FILE_PREFIX, name))

            with open(sv_path, 'w') as f:
                f.write(sv.serialize())