import os
import shlex
import struct

import numpy as np
import scipy
import scipy.io.wavfile

import audiomate
from audiomate import tracks
from audiomate import annotations
from audiomate import issuers
from audiomate.utils import textfile

from . import base
from . import default
# Names of the files inside a Kaldi data directory that this module
# reads and/or writes. Each name is joined with the corpus path.

# Maps a track/recording idx to an audio path (or a piped sox command).
WAV_FILE_NAME = 'wav.scp'
# Maps an utterance idx to its track idx, start and end.
SEGMENTS_FILE_NAME = 'segments'
# Maps an utterance idx to the idx of its issuer (speaker).
UTT2SPK_FILE_NAME = 'utt2spk'
# Maps a speaker idx to a gender code ('m' or 'f').
SPK2GENDER_FILE_NAME = 'spk2gender'
# Maps an utterance idx to its transcription.
TRANSCRIPTION_FILE_NAME = 'text'
# Base name (without extension) of the exported feature files;
# '.ark' and '.scp' are appended by the writer.
FEATS_FILE_NAME = 'feats'
class KaldiReader(base.CorpusReader):
    """
    Reader for data sets stored in the Kaldi data-directory layout.

    Args:
        main_label_list_idx (str): Stored on the instance.
        main_feature_idx (str): Stored on the instance.

    Note:
        Neither constructor argument is consulted by the methods defined
        in this class; transcriptions are always stored in a label-list
        with the idx ``audiomate.corpus.LL_WORD_TRANSCRIPT``.

    .. seealso::

       `Kaldi: Data preparation <http://kaldi-asr.org/doc/data_prep.html>`_
          Describes how a data set has to be structured to be understood
          by Kaldi and the format of the individual files.
    """

    def __init__(self, main_label_list_idx=audiomate.corpus.LL_WORD_TRANSCRIPT,
                 main_feature_idx='default'):
        self.main_label_list_idx = main_label_list_idx
        self.main_feature_idx = main_feature_idx

    @classmethod
    def type(cls):
        """ Return the identifier of this reader (``'kaldi'``). """
        return 'kaldi'

    def _check_for_missing_files(self, path):
        """ Return the names of the required files not present in ``path``. """
        required = (WAV_FILE_NAME, TRANSCRIPTION_FILE_NAME)

        return [
            name for name in required
            if not os.path.isfile(os.path.join(path, name))
        ]

    def _load(self, path):
        """ Build and return a corpus from the Kaldi files found in ``path``. """
        corpus = audiomate.Corpus(path=path)

        # The order matters: tracks and issuers have to exist before
        # utterances reference them, and utterances before transcriptions.
        default.DefaultReader.read_files(
            os.path.join(path, WAV_FILE_NAME), corpus
        )
        KaldiReader.read_genders(
            os.path.join(path, SPK2GENDER_FILE_NAME), corpus
        )
        utt2spk = default.DefaultReader.read_utt_to_issuer_mapping(
            os.path.join(path, UTT2SPK_FILE_NAME), corpus
        )
        KaldiReader.read_utterances(
            os.path.join(path, SEGMENTS_FILE_NAME), corpus, utt2spk
        )
        KaldiReader.read_transcriptions(
            os.path.join(path, TRANSCRIPTION_FILE_NAME), corpus
        )

        return corpus

    @staticmethod
    def read_genders(genders_path, corpus):
        """
        Import a speaker for every entry of the spk2gender file.

        Does nothing if the file does not exist. An entry with the value
        ``'m'`` becomes a male speaker, every other value a female one.
        """
        if not os.path.isfile(genders_path):
            return

        entries = textfile.read_key_value_lines(genders_path, separator=' ')

        for speaker_idx, gender_str in entries.items():
            gender = (
                issuers.Gender.MALE if gender_str == 'm'
                else issuers.Gender.FEMALE
            )
            corpus.import_issuers(issuers.Speaker(speaker_idx, gender=gender))

    @staticmethod
    def read_utterances(segments_path, corpus, utt2spk):
        """
        Create the utterances of the corpus.

        If a segments file exists, one utterance per segment is created.
        Otherwise every file of the corpus becomes a single utterance
        with the same idx as the file.
        """

        def issuer_idx_for(key):
            # Idx of the issuer mapped to ``key`` or None if there is none.
            return utt2spk[key].idx if key in utt2spk.keys() else None

        if not os.path.isfile(segments_path):
            for file_idx in corpus.files.keys():
                corpus.new_utterance(
                    file_idx, file_idx,
                    issuer_idx=issuer_idx_for(file_idx)
                )
            return

        segments = textfile.read_separated_lines_with_first_key(
            segments_path,
            separator=' ',
            max_columns=4
        )

        for utt_id, fields in segments.items():
            # fields: [track-idx, start, end]; start and end are optional.
            start = float(fields[1]) if len(fields) > 1 else 0
            end = float('inf')

            if len(fields) > 2:
                end = float(fields[2])

                # -1 marks a segment without an explicit end.
                if end == -1:
                    end = float('inf')

            corpus.new_utterance(
                utt_id, fields[0],
                issuer_idx=issuer_idx_for(utt_id),
                start=start,
                end=end
            )

    @staticmethod
    def read_transcriptions(text_path, corpus):
        """ Attach the transcription from the text file to every utterance. """
        entries = textfile.read_key_value_lines(text_path, separator=' ')

        for utt_id, transcription in entries.items():
            label_list = annotations.LabelList.create_single(
                transcription,
                idx=audiomate.corpus.LL_WORD_TRANSCRIPT
            )
            corpus.utterances[utt_id].set_label_list(label_list)
class KaldiWriter(base.CorpusWriter):
    """
    Supports writing data sets in Kaldi format.

    Args:
        main_label_list_idx (str): The idx of the label-list to use
                                   for writing to transcriptions file.
        main_feature_idx (str): The idx of the feature-container to export.
        use_utt_idx_if_no_speaker_available (bool): If ``True``, the
                                                    utterance-idx is used as
                                                    speaker-idx in the utt2spk
                                                    file, if no speaker exists
                                                    for an utterance.
        create_spk2gender (bool): If ``True`` creates the file spk2gender.
        default_gender (str): If ``create_spk2gender==True`` and the gender of
                              an issuer is not known,
                              this default value will be used (default 'm').
        prefix_utterances_with_speaker (bool): If ``True``, add a prefix in
                                               form of the issuer-idx to
                                               every utterance.
        use_absolute_times (bool): If ``True``, doesn't use -1 for segment ends,
                                   but reads the audio to get absolute duration.

    .. seealso::

       `Kaldi: Data preparation <http://kaldi-asr.org/doc/data_prep.html>`_
          Describes how a data set has to be structured to be
          understood by Kaldi and the format of the individual files.
    """

    def __init__(self, main_label_list_idx=audiomate.corpus.LL_WORD_TRANSCRIPT,
                 main_feature_idx='default', use_utt_idx_if_no_speaker_available=True,
                 create_spk2gender=False, default_gender='m',
                 prefix_utterances_with_speaker=True, use_absolute_times=False):
        self.main_label_list_idx = main_label_list_idx
        self.main_feature_idx = main_feature_idx
        self.use_utt_idx_if_no_speaker_available = use_utt_idx_if_no_speaker_available
        self.create_spk2gender = create_spk2gender
        self.default_gender = default_gender
        self.prefix_utterances_with_speaker = prefix_utterances_with_speaker
        self.use_absolute_times = use_absolute_times

    @classmethod
    def type(cls):
        """ Return the identifier of this writer (``'kaldi'``). """
        return 'kaldi'

    def _save(self, corpus, path):
        """ Write all Kaldi files for ``corpus`` into the directory ``path``. """
        wav_file_path = os.path.join(path, WAV_FILE_NAME)
        spk2gender_path = os.path.join(path, SPK2GENDER_FILE_NAME)
        utt2spk_path = os.path.join(path, UTT2SPK_FILE_NAME)
        segments_path = os.path.join(path, SEGMENTS_FILE_NAME)
        text_path = os.path.join(path, TRANSCRIPTION_FILE_NAME)

        KaldiWriter.write_tracks(wav_file_path, corpus, path)
        self._write_segments(segments_path, corpus)
        self._write_utt_to_issuer_mapping(utt2spk_path, corpus)
        self._write_transcriptions(text_path, corpus)
        self._write_features(path, corpus)

        if self.create_spk2gender:
            self._write_genders(spk2gender_path, corpus)

    @staticmethod
    def write_tracks(file_path, corpus, path):
        """
        Write the wav.scp file.

        File-tracks are referenced in place. Container-tracks are exported
        as 16-bit wav files to ``<path>/audio`` and referenced from there.
        Tracks of any other type are not written.

        Args:
            file_path (str): Path of the wav.scp file to write.
            corpus: The corpus whose tracks are written.
            path (str): The target directory of the data set.
        """
        file_records = []
        export_path = os.path.join(path, 'audio')
        max_value = np.iinfo(np.int16).max

        for track in corpus.tracks.values():
            if isinstance(track, tracks.FileTrack):
                file_records.append([
                    track.idx,
                    KaldiWriter.extended_filename(track)
                ])

            elif isinstance(track, tracks.ContainerTrack):
                # Only create the folder if there is something to export.
                os.makedirs(export_path, exist_ok=True)

                target_path = os.path.join(
                    export_path,
                    '{}.wav'.format(track.idx)
                )

                # Scale the samples to the int16 range before export.
                # NOTE(review): assumes read_samples() yields floats in
                # [-1, 1]; values outside would wrap around - confirm.
                samples = (track.read_samples() * max_value).astype(np.int16)
                scipy.io.wavfile.write(target_path, track.sampling_rate, samples)

                file_records.append([
                    track.idx,
                    target_path
                ])

        textfile.write_separated_lines(
            file_path,
            file_records,
            separator=' ',
            sort_by_column=0
        )

    @staticmethod
    def extended_filename(file_track):
        """
        Create extended filename.
        Kaldi only supports wav.
        Therefore other files have to be converted using sox.

        Args:
            file_track: Track with a ``path`` attribute.

        Returns:
            str: The absolute path for wav files, otherwise a command that
            pipes the file converted to wav by sox to stdout.
        """
        ext = os.path.splitext(file_track.path)[1]
        abs_path = os.path.abspath(file_track.path)

        if ext == '.wav':
            return abs_path

        # The entry is run as a shell command, so the path has to be quoted.
        # shlex.quote() leaves paths without special characters untouched.
        return 'sox {} -t wav - |'.format(shlex.quote(abs_path))

    def _write_segments(self, utterance_path, corpus):
        """ Write the segments file (utterance -> track, start, end). """
        utterance_records = {}

        for utt in corpus.utterances.values():
            end = utt.end

            if end == float('inf'):
                # An open end is written as -1, unless absolute times
                # are requested.
                end = utt.end_abs if self.use_absolute_times else -1

            utterance_records[self._get_utt_idx(utt)] = [
                utt.track.idx, utt.start, end
            ]

        textfile.write_separated_lines(
            utterance_path,
            utterance_records,
            separator=' ',
            sort_by_column=0
        )

    def _write_genders(self, gender_path, corpus):
        """
        Write the spk2gender file.

        Issuers that are no speakers, or speakers with a gender other than
        male/female, get ``default_gender``. Nothing is written if the
        corpus has no issuers.
        """
        genders = {}

        for issuer in corpus.issuers.values():
            gender = self.default_gender

            if isinstance(issuer, issuers.Speaker):
                if issuer.gender == issuers.Gender.MALE:
                    gender = 'm'
                elif issuer.gender == issuers.Gender.FEMALE:
                    gender = 'f'

            genders[issuer.idx] = gender

        if genders:
            textfile.write_separated_lines(
                gender_path,
                genders,
                separator=' ',
                sort_by_column=0
            )

    def _write_transcriptions(self, text_path, corpus):
        """
        Write the text file.

        Utterances without a label-list ``main_label_list_idx`` are skipped.
        The values of all labels are joined with a single space.
        """
        transcriptions = {}

        for utterance in corpus.utterances.values():
            if self.main_label_list_idx in utterance.label_lists.keys():
                label_list = utterance.label_lists[self.main_label_list_idx]
                utt_idx = self._get_utt_idx(utterance)
                transcriptions[utt_idx] = ' '.join(
                    label.value for label in label_list
                )

        textfile.write_separated_lines(
            text_path,
            transcriptions,
            separator=' ',
            sort_by_column=0
        )

    def _write_utt_to_issuer_mapping(self, utt_issuer_path, corpus):
        """
        Write the utt2spk file.

        Utterances without issuer are mapped to their own idx if
        ``use_utt_idx_if_no_speaker_available`` is set, otherwise omitted.
        """
        utt_issuer_records = {}

        for utterance in corpus.utterances.values():
            utt_idx = self._get_utt_idx(utterance)

            if utterance.issuer is not None:
                utt_issuer_records[utt_idx] = utterance.issuer.idx
            elif self.use_utt_idx_if_no_speaker_available:
                utt_issuer_records[utt_idx] = utt_idx

        textfile.write_separated_lines(
            utt_issuer_path,
            utt_issuer_records,
            separator=' ',
            sort_by_column=0
        )

    def _write_features(self, path, corpus):
        """
        Export the feature-container ``main_feature_idx`` as ark/scp files.

        Does nothing if the corpus has no such feature-container.
        Utterances without an entry in the container are skipped.
        """
        if self.main_feature_idx not in corpus.feature_containers.keys():
            return

        fc = corpus.feature_containers[self.main_feature_idx]

        ark_path = os.path.join(path, '{}.ark'.format(FEATS_FILE_NAME))
        ark_path = os.path.abspath(ark_path)
        scp_path = os.path.join(path, '{}.scp'.format(FEATS_FILE_NAME))

        fc.open()

        try:
            matrices = {}

            # NOTE(review): the raw utterance idx is used as key here, while
            # the other files use _get_utt_idx() (optional speaker prefix).
            # Confirm whether the keys are supposed to match.
            for utt_id in corpus.utterances.keys():
                matrix = fc.get(utt_id)

                if matrix is not None:
                    matrices[utt_id] = matrix

            # Written while the container is still open, in case get()
            # hands out data that is only loaded on access.
            self.write_float_matrices(scp_path, ark_path, matrices)
        finally:
            # Close the container even if reading or writing failed.
            fc.close()

    @staticmethod
    def feature_scp_generator(path):
        """ Return a generator over all feature matrices defined in a scp. """
        scp_entries = textfile.read_key_value_lines(path, separator=' ')

        for utterance_id, rx_specifier in scp_entries.items():
            yield utterance_id, KaldiWriter.read_float_matrix(rx_specifier)

    @staticmethod
    def read_float_matrix(rx_specifier):
        """
        Return float matrix as np array for the given rx specifier.

        Args:
            rx_specifier (str): ``<path-to-ark>:<byte-offset>``, as written
                                by :meth:`write_float_matrices`.

        Returns:
            numpy.ndarray: Read-only float32 array of shape
            ``(num_rows, num_cols)``.

        Raises:
            ValueError: If the specifier has no offset, the data at the
                        offset is not a binary float matrix or the matrix
                        data is truncated.
        """
        # Split at the last colon, since the path itself may contain
        # colons (e.g. a drive letter).
        path, offset = rx_specifier.strip().rsplit(':', maxsplit=1)
        offset = int(offset)
        bytes_per_value = 4

        with open(path, 'rb') as f:
            f.seek(offset)

            # Marker of a binary (in contrast to text) archive entry.
            if f.read(2) != b'\x00B':
                raise ValueError(
                    'No binary archive entry at {}'.format(rx_specifier)
                )

            # Token of a float (32 bit) matrix.
            if f.read(3) != b'FM ':
                raise ValueError(
                    'No float matrix at {}'.format(rx_specifier)
                )

            # Each dimension is a size byte followed by a
            # little-endian int32.
            f.read(1)
            num_rows = struct.unpack('<i', f.read(4))[0]

            f.read(1)
            num_cols = struct.unpack('<i', f.read(4))[0]

            num_bytes = num_rows * num_cols * bytes_per_value
            data = f.read(num_bytes)

        if len(data) != num_bytes:
            raise ValueError(
                'Truncated float matrix at {}'.format(rx_specifier)
            )

        values = np.frombuffer(data, dtype='<f4')
        return np.reshape(values, (num_rows, num_cols))

    def _get_utt_idx(self, utt):
        """
        Return the idx to use for ``utt`` in the written files.

        The issuer-idx is prepended (``<issuer>-<utt>``) if prefixing is
        enabled, the utterance has an issuer and its idx doesn't already
        start with the issuer-idx.
        """
        if (self.prefix_utterances_with_speaker and
                utt.issuer is not None and
                not utt.idx.startswith(utt.issuer.idx)):
            return '{}-{}'.format(utt.issuer.idx, utt.idx)

        return utt.idx

    @staticmethod
    def write_float_matrices(scp_path, ark_path, matrices):
        """
        Write the given dict matrices (utt-id/float ndarray)
        to the given scp and ark files.

        The matrices are written sorted by utt-id. All matrices are
        validated before any file is touched.

        Args:
            scp_path (str): Path of the scp file to write.
            ark_path (str): Path of the ark file to write. It is stored
                            as given in the scp entries.
            matrices (dict): utt-id -> 2-dimensional float32 array.

        Raises:
            ValueError: If a matrix is not of dtype float32 or
                        not 2-dimensional.
        """
        validated = []

        for utterance_id in sorted(matrices):
            matrix = np.asarray(matrices[utterance_id])

            if matrix.dtype != np.float32:
                raise ValueError(
                    'Matrix of {} has dtype {}, expected float32'.format(
                        utterance_id, matrix.dtype
                    )
                )

            if matrix.ndim != 2:
                raise ValueError(
                    'Matrix of {} has {} dimensions, expected 2'.format(
                        utterance_id, matrix.ndim
                    )
                )

            validated.append((utterance_id, matrix))

        scp_entries = []

        with open(ark_path, 'wb') as f:
            for utterance_id, matrix in validated:
                num_rows, num_cols = matrix.shape

                f.write(('{} '.format(utterance_id)).encode('utf-8'))

                # The scp entry points behind the key, at the binary marker.
                offset = f.tell()

                f.write(b'\x00B')
                f.write(b'FM ')
                f.write(b'\x04')
                f.write(struct.pack('<i', num_rows))
                f.write(b'\x04')
                f.write(struct.pack('<i', num_cols))

                # Row-major, little-endian like the dimensions above.
                f.write(matrix.astype('<f4', copy=False).tobytes())

                scp_entries.append('{} {}:{}'.format(
                    utterance_id,
                    ark_path,
                    offset
                ))

        # Same encoding as used for the keys in the ark file.
        with open(scp_path, 'w', encoding='utf-8') as f:
            f.write('\n'.join(scp_entries))