Source code for audiomate.corpus.io.kaldi

import os
import struct

import numpy as np
import scipy.io.wavfile

import audiomate
from audiomate import tracks
from audiomate import annotations
from audiomate import issuers
from audiomate.utils import textfile
from . import base
from . import default

WAV_FILE_NAME = 'wav.scp'
SEGMENTS_FILE_NAME = 'segments'
UTT2SPK_FILE_NAME = 'utt2spk'
SPK2GENDER_FILE_NAME = 'spk2gender'
TRANSCRIPTION_FILE_NAME = 'text'
FEATS_FILE_NAME = 'feats'
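
# The files above follow the standard Kaldi data-directory layout. As an
# illustration (values are made up), the individual files contain
# space-separated lines like:
#
#   wav.scp:     rec1 /data/audio/rec1.wav
#   segments:    utt1 rec1 0.00 4.50
#   utt2spk:     utt1 spk1
#   spk2gender:  spk1 m
#   text:        utt1 hello world
#   feats.scp:   utt1 /data/feats.ark:42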


class KaldiReader(base.CorpusReader):
    """
    Supports reading data sets in Kaldi format.

    .. seealso::

       `Kaldi: Data preparation <http://kaldi-asr.org/doc/data_prep.html>`_
          Describes how a data set has to be structured to be understood by
          Kaldi and the format of the individual files.
    """

    def __init__(self, main_label_list_idx=audiomate.corpus.LL_WORD_TRANSCRIPT,
                 main_feature_idx='default'):
        self.main_label_list_idx = main_label_list_idx
        self.main_feature_idx = main_feature_idx

    @classmethod
    def type(cls):
        return 'kaldi'

    def _check_for_missing_files(self, path):
        necessary_files = [WAV_FILE_NAME, TRANSCRIPTION_FILE_NAME]
        missing_files = []

        for file_name in necessary_files:
            file_path = os.path.join(path, file_name)

            if not os.path.isfile(file_path):
                missing_files.append(file_name)

        return missing_files

    def _load(self, path):
        wav_file_path = os.path.join(path, WAV_FILE_NAME)
        spk2gender_path = os.path.join(path, SPK2GENDER_FILE_NAME)
        utt2spk_path = os.path.join(path, UTT2SPK_FILE_NAME)
        segments_path = os.path.join(path, SEGMENTS_FILE_NAME)
        text_path = os.path.join(path, TRANSCRIPTION_FILE_NAME)

        corpus = audiomate.Corpus(path=path)

        default.DefaultReader.read_files(wav_file_path, corpus)
        KaldiReader.read_genders(spk2gender_path, corpus)
        utt2spk = default.DefaultReader.read_utt_to_issuer_mapping(utt2spk_path, corpus)
        KaldiReader.read_utterances(segments_path, corpus, utt2spk)
        KaldiReader.read_transcriptions(text_path, corpus)

        return corpus

    @staticmethod
    def read_genders(genders_path, corpus):
        if os.path.isfile(genders_path):
            speakers = textfile.read_key_value_lines(genders_path, separator=' ')

            for speaker_idx, gender_str in speakers.items():
                if gender_str == 'm':
                    gender = issuers.Gender.MALE
                else:
                    gender = issuers.Gender.FEMALE

                speaker = issuers.Speaker(speaker_idx, gender=gender)
                corpus.import_issuers(speaker)

    @staticmethod
    def read_utterances(segments_path, corpus, utt2spk):
        # Load utterances from the segments file, if there is one.
        if os.path.isfile(segments_path):
            utterances = textfile.read_separated_lines_with_first_key(
                segments_path, separator=' ', max_columns=4
            )

            for utt_id, utt_info in utterances.items():
                start = 0
                end = float('inf')

                if len(utt_info) > 1:
                    start = float(utt_info[1])

                if len(utt_info) > 2:
                    end = float(utt_info[2])

                # An end of -1 (or a missing end) means the utterance
                # lasts until the end of the track.
                if end == -1:
                    end = float('inf')

                speaker_idx = None

                if utt_id in utt2spk.keys():
                    speaker_idx = utt2spk[utt_id].idx

                corpus.new_utterance(
                    utt_id, utt_info[0], issuer_idx=speaker_idx, start=start, end=end
                )
        else:
            # No segments file: every file is a single full-length utterance.
            for file_idx in corpus.files.keys():
                speaker_idx = None

                if file_idx in utt2spk.keys():
                    speaker_idx = utt2spk[file_idx].idx

                corpus.new_utterance(file_idx, file_idx, issuer_idx=speaker_idx)

    @staticmethod
    def read_transcriptions(text_path, corpus):
        transcriptions = textfile.read_key_value_lines(text_path, separator=' ')

        for utt_id, transcription in transcriptions.items():
            ll = annotations.LabelList.create_single(
                transcription, idx=audiomate.corpus.LL_WORD_TRANSCRIPT
            )
            corpus.utterances[utt_id].set_label_list(ll)
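

# Usage sketch (not part of the module): reading a Kaldi data directory.
# It assumes the ``load()`` entry point inherited from ``base.CorpusReader``
# and a hypothetical path.
#
#   reader = KaldiReader()
#   corpus = reader.load('/path/to/kaldi/data/train')
#   print(corpus.num_utterances)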


class KaldiWriter(base.CorpusWriter):
    """
    Supports writing data sets in Kaldi format.

    Args:
        main_label_list_idx (str): The idx of the label-list to use for
            writing the transcriptions file.
        main_feature_idx (str): The idx of the feature-container to export.
        use_utt_idx_if_no_speaker_available (bool): If ``True``, the
            utterance-idx is used as speaker-idx in the utt2spk file,
            if no speaker exists for an utterance.
        create_spk2gender (bool): If ``True``, the spk2gender file is created.
        default_gender (str): If ``create_spk2gender == True`` and the gender
            of an issuer is unknown, this default value is used (default 'm').
        prefix_utterances_with_speaker (bool): If ``True``, every
            utterance-idx is prefixed with its issuer-idx.
        use_absolute_times (bool): If ``True``, open segment ends are not
            written as -1; instead the audio is read to determine the
            absolute end time.

    .. seealso::

       `Kaldi: Data preparation <http://kaldi-asr.org/doc/data_prep.html>`_
          Describes how a data set has to be structured to be understood by
          Kaldi and the format of the individual files.
    """

    def __init__(self, main_label_list_idx=audiomate.corpus.LL_WORD_TRANSCRIPT,
                 main_feature_idx='default',
                 use_utt_idx_if_no_speaker_available=True,
                 create_spk2gender=False,
                 default_gender='m',
                 prefix_utterances_with_speaker=True,
                 use_absolute_times=False):
        self.main_label_list_idx = main_label_list_idx
        self.main_feature_idx = main_feature_idx
        self.use_utt_idx_if_no_speaker_available = use_utt_idx_if_no_speaker_available
        self.create_spk2gender = create_spk2gender
        self.default_gender = default_gender
        self.prefix_utterances_with_speaker = prefix_utterances_with_speaker
        self.use_absolute_times = use_absolute_times

    @classmethod
    def type(cls):
        return 'kaldi'

    def _save(self, corpus, path):
        wav_file_path = os.path.join(path, WAV_FILE_NAME)
        spk2gender_path = os.path.join(path, SPK2GENDER_FILE_NAME)
        utt2spk_path = os.path.join(path, UTT2SPK_FILE_NAME)
        segments_path = os.path.join(path, SEGMENTS_FILE_NAME)
        text_path = os.path.join(path, TRANSCRIPTION_FILE_NAME)

        KaldiWriter.write_tracks(wav_file_path, corpus, path)
        self._write_segments(segments_path, corpus)
        self._write_utt_to_issuer_mapping(utt2spk_path, corpus)
        self._write_transcriptions(text_path, corpus)
        self._write_features(path, corpus)

        if self.create_spk2gender:
            self._write_genders(spk2gender_path, corpus)

    @staticmethod
    def write_tracks(file_path, corpus, path):
        file_records = []
        export_path = os.path.join(path, 'audio')

        for track in corpus.tracks.values():
            if isinstance(track, tracks.FileTrack):
                file_records.append([
                    track.idx,
                    KaldiWriter.extended_filename(track)
                ])

            elif isinstance(track, tracks.ContainerTrack):
                # Tracks stored in a container have no file on disk,
                # so they are exported as 16-bit wav files to <path>/audio.
                if not os.path.isdir(export_path):
                    os.makedirs(export_path)

                target_path = os.path.join(
                    export_path,
                    '{}.wav'.format(track.idx)
                )

                max_value = np.iinfo(np.int16).max
                samples = (track.read_samples() * max_value).astype(np.int16)
                sampling_rate = track.sampling_rate

                scipy.io.wavfile.write(target_path, sampling_rate, samples)

                file_records.append([
                    track.idx,
                    target_path
                ])

        textfile.write_separated_lines(
            file_path,
            file_records,
            separator=' ',
            sort_by_column=0
        )

    @staticmethod
    def extended_filename(file_track):
        """
        Create an extended filename.
        Kaldi only supports wav files, therefore other formats are converted
        on the fly using sox.
        """
        ext = os.path.splitext(file_track.path)[1]
        abs_path = os.path.abspath(file_track.path)

        if ext == '.wav':
            return abs_path
        else:
            return 'sox {} -t wav - |'.format(abs_path)
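
    # For example (hypothetical paths): a '/data/a.wav' track is referenced
    # directly, while '/data/b.flac' is written to wav.scp as the pipe command
    # 'sox /data/b.flac -t wav - |' so Kaldi converts it on the fly.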

    def _write_segments(self, utterance_path, corpus):
        utterances = corpus.utterances.values()
        utterance_records = {}

        for u in utterances:
            utt_idx = self._get_utt_idx(u)
            track_idx = u.track.idx
            start = u.start
            end = u.end

            if end == float('inf'):
                if self.use_absolute_times:
                    end = u.end_abs
                else:
                    # -1 tells Kaldi that the segment lasts
                    # until the end of the recording.
                    end = -1

            utterance_records[utt_idx] = [track_idx, start, end]

        textfile.write_separated_lines(
            utterance_path,
            utterance_records,
            separator=' ',
            sort_by_column=0
        )

    def _write_genders(self, gender_path, corpus):
        genders = {}

        for issuer in corpus.issuers.values():
            gender = self.default_gender

            if type(issuer) == issuers.Speaker:
                if issuer.gender == issuers.Gender.MALE:
                    gender = 'm'
                elif issuer.gender == issuers.Gender.FEMALE:
                    gender = 'f'

            genders[issuer.idx] = gender

        if len(genders) > 0:
            textfile.write_separated_lines(
                gender_path,
                genders,
                separator=' ',
                sort_by_column=0
            )

    def _write_transcriptions(self, text_path, corpus):
        transcriptions = {}

        for utterance in corpus.utterances.values():
            utt_idx = self._get_utt_idx(utterance)

            if self.main_label_list_idx in utterance.label_lists.keys():
                label_list = utterance.label_lists[self.main_label_list_idx]
                transcriptions[utt_idx] = ' '.join([l.value for l in label_list])

        textfile.write_separated_lines(
            text_path,
            transcriptions,
            separator=' ',
            sort_by_column=0
        )

    def _write_utt_to_issuer_mapping(self, utt_issuer_path, corpus):
        utt_issuer_records = {}

        for utterance in corpus.utterances.values():
            utt_idx = self._get_utt_idx(utterance)

            if utterance.issuer is not None:
                utt_issuer_records[utt_idx] = utterance.issuer.idx
            elif self.use_utt_idx_if_no_speaker_available:
                utt_issuer_records[utt_idx] = utt_idx

        textfile.write_separated_lines(
            utt_issuer_path,
            utt_issuer_records,
            separator=' ',
            sort_by_column=0
        )

    def _write_features(self, path, corpus):
        if self.main_feature_idx in corpus.feature_containers.keys():
            fc = corpus.feature_containers[self.main_feature_idx]
            fc.open()

            matrices = {}

            for utt_id in corpus.utterances.keys():
                matrix = fc.get(utt_id)

                if matrix is not None:
                    matrices[utt_id] = matrix

            fc.close()

            ark_path = os.path.join(path, '{}.ark'.format(FEATS_FILE_NAME))
            ark_path = os.path.abspath(ark_path)
            scp_path = os.path.join(path, '{}.scp'.format(FEATS_FILE_NAME))

            self.write_float_matrices(scp_path, ark_path, matrices)

    @staticmethod
    def feature_scp_generator(path):
        """ Return a generator over all feature matrices defined in a scp. """

        scp_entries = textfile.read_key_value_lines(path, separator=' ')

        for utterance_id, rx_specifier in scp_entries.items():
            yield utterance_id, KaldiWriter.read_float_matrix(rx_specifier)
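
    # Usage sketch (hypothetical path): iterate all matrices referenced by a
    # feats.scp file.
    #
    #   for utt_id, mat in KaldiWriter.feature_scp_generator('/path/feats.scp'):
    #       print(utt_id, mat.shape)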

    @staticmethod
    def read_float_matrix(rx_specifier):
        """ Return the float matrix as a numpy array for the given rx specifier. """

        path, offset = rx_specifier.strip().split(':', maxsplit=1)
        offset = int(offset)
        sample_format = 4  # bytes per float32 value

        with open(path, 'rb') as f:
            # move to offset
            f.seek(offset)

            # assert binary ark
            binary = f.read(2)
            assert (binary == b'\x00B')

            # assert type float 32
            format = f.read(3)
            assert (format == b'FM ')

            # get number of frames (rows)
            f.read(1)
            num_frames = struct.unpack('<i', f.read(4))[0]

            # get feature dimension (columns)
            f.read(1)
            feature_size = struct.unpack('<i', f.read(4))[0]

            # read feature data
            data = f.read(num_frames * feature_size * sample_format)

            feature_vector = np.frombuffer(data, dtype='float32')
            feature_matrix = np.reshape(
                feature_vector,
                (num_frames, feature_size)
            )

            return feature_matrix
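
    # Binary layout assumed by read_float_matrix/write_float_matrices for one
    # ark entry (little-endian, float32 data):
    #
    #   '<utt-id> '        key and space (the scp offset points right after it)
    #   b'\x00B'           binary-mode marker
    #   b'FM '             float-matrix token
    #   b'\x04' + int32    number of rows (frames)
    #   b'\x04' + int32    number of columns (feature dimension)
    #   rows * cols float32 values, row-major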

    def _get_utt_idx(self, utt):
        if (self.prefix_utterances_with_speaker
                and utt.issuer is not None
                and not utt.idx.startswith(utt.issuer.idx)):
            return '{}-{}'.format(utt.issuer.idx, utt.idx)
        else:
            return utt.idx

    @staticmethod
    def write_float_matrices(scp_path, ark_path, matrices):
        """ Write the given dict of matrices (utt-id/float ndarray) to the given scp and ark files. """

        scp_entries = []

        with open(ark_path, 'wb') as f:
            for utterance_id in sorted(list(matrices.keys())):
                matrix = matrices[utterance_id]

                assert (matrix.dtype == np.float32)

                f.write(('{} '.format(utterance_id)).encode('utf-8'))

                offset = f.tell()

                # binary-mode marker and float32-matrix token
                f.write(b'\x00B')
                f.write(b'FM ')

                # number of rows (int32, preceded by its byte size)
                f.write(b'\x04')
                f.write(struct.pack('<i', np.size(matrix, 0)))

                # number of columns (int32, preceded by its byte size)
                f.write(b'\x04')
                f.write(struct.pack('<i', np.size(matrix, 1)))

                flattened = matrix.reshape(
                    np.size(matrix, 0) * np.size(matrix, 1)
                )
                flattened.tofile(f, sep='')

                scp_entries.append('{} {}:{}'.format(
                    utterance_id, ark_path, offset
                ))

        with open(scp_path, 'w') as f:
            f.write('\n'.join(scp_entries))
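

# Usage sketch: exporting a corpus in Kaldi format. It assumes ``save()`` is
# provided by ``base.CorpusWriter``; the paths are placeholders.
if __name__ == '__main__':
    corpus = audiomate.Corpus.load('/path/to/some/corpus')

    writer = KaldiWriter(create_spk2gender=True)
    writer.save(corpus, '/path/to/kaldi/export')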