# Source code for audiomate.formats.trn

"""
Functions for reading/writing sclite transcription files.

Description of the format:
http://www1.icsi.berkeley.edu/Speech/docs/sctk-1.2/infmts.htm#trn_fmt_name_0
"""

import re

# Matches one trn line: the transcription (group 1) followed by the utterance
# id in parentheses (group 2). The parenthesis is preceded by a space, or by
# the start of the line when the transcription is empty, so that a line
# reduced to '(utt-1)' still parses (group 1 is then the empty string).
# The greedy first group makes the *last* ' (' on the line start the id, which
# allows parentheses inside the transcription itself.
TRANSCRIPT_PATTERN = re.compile(r'(.*)(?:^| )\((.*?)\)')


def write(path, entries):
    """
    Write utterance transcriptions to an sclite transcription (trn) file.

    Every entry becomes one line of the form ``<transcription> (<utt-idx>)``.
    The lines are ordered by utterance id and joined with a newline; no
    newline is written after the last line.

    Args:
        path (str): Path to write the file to.
        entries (dict): Transcriptions to write (utt-idx: transcription).

    Example::

        >>> data = {
        >>>     'utt-1': 'sie',
        >>>     'utt-2': 'hallo',
        >>> }
        >>>
        >>> write('/some/path/to/file.txt', data)
    """
    ordered = list(entries.items())
    ordered.sort(key=lambda item: item[0])

    content = '\n'.join(
        '{} ({})'.format(transcription, utt_idx)
        for utt_idx, transcription in ordered
    )

    with open(path, 'w') as f:
        f.write(content)
def read(path):
    """
    Read the transcriptions from an sclite transcription (trn) file.

    Every line has to consist of a transcription followed by the utterance id
    in parentheses, e.g. ``hallo (utt-2)``. Whitespace around the
    transcription and around the id is removed.

    Args:
        path (str): Path to the transcription file.

    Returns:
        dict: Dictionary of transcriptions (utt-idx: transcription).
        If an utterance id occurs on several lines, the last one is kept.

    Raises:
        ValueError: If a line (a blank line included) doesn't match the
            expected format.

    Example::

        >>> read('/path/to/label/file.txt')
        {
            'utt-1': 'sie',
            'utt-2': 'hallo'
        }
    """
    entries = {}

    with open(path, 'r') as f:
        # Iterate the file object directly instead of loading every line
        # into memory first.
        for raw_line in f:
            # Keep leading whitespace while matching: a line with an empty
            # transcription looks like ' (utt-idx)', and the space in front
            # of the parenthesis is part of the line format. The
            # transcription itself is stripped below.
            m = TRANSCRIPT_PATTERN.match(raw_line.rstrip())

            if m is None:
                raise ValueError('Failed to parse line of trn file: {}'.format(raw_line.strip()))

            utt_idx = m.group(2).strip()
            entries[utt_idx] = m.group(1).strip()

    return entries