import os
import shutil
import audiomate
from audiomate import annotations
from audiomate import issuers
from audiomate.utils import download
from audiomate.utils import textfile
from . import base
# URL of the archive holding the list of all sentences.
SENTENCE_LIST_URL = 'http://downloads.tatoeba.org/exports/sentences.tar.bz2'
# URL of the archive holding the list of sentences for which audio exists.
AUDIO_LIST_URL = 'http://downloads.tatoeba.org/exports/sentences_with_audio.tar.bz2'
# Name of the tab-separated index file written by the downloader and loaded by the reader.
META_FILENAME = 'meta.txt'
class TatoebaDownloader(base.CorpusDownloader):
    """
    Downloader for audio files from the tatoeba platform.

    .. seealso::

       `Tatoeba <https://tatoeba.org/>`_
          Website

    Args:
        include_languages (list): List of languages to download. If None all are downloaded.
        include_licenses (list): Sentences are downloaded only if their license is in this list.
                                 If None all licenses are included.
        include_empty_licence (bool): Sentences with an empty license are not meant to be reused.
                                      If False these sentences are ignored.
    """

    def __init__(self, include_languages=None, include_licenses=None, include_empty_licence=False):
        self.include_languages = include_languages
        self.include_licenses = include_licenses
        self.include_empty_licence = include_empty_licence

    @classmethod
    def type(cls):
        """Return the identifier of this downloader (``'tatoeba'``)."""
        return 'tatoeba'

    def _download(self, target_path):
        """
        Download the sentence/audio lists, write the meta file and fetch all audio files.

        The two list archives are downloaded and extracted into a ``temp`` folder
        below ``target_path``. That folder is removed again when the method
        finishes, whether it succeeds or raises.

        Args:
            target_path (str): Folder in which the meta file and the audio files are stored.
        """
        temp_path = os.path.join(target_path, 'temp')
        os.makedirs(temp_path, exist_ok=True)

        try:
            sentence_ark = os.path.join(temp_path, 'sentences.tar.bz2')
            sentence_list = os.path.join(temp_path, 'sentences.csv')
            audio_ark = os.path.join(temp_path, 'sentences_with_audio.tar.bz2')
            audio_list = os.path.join(temp_path, 'sentences_with_audio.csv')

            download.download_file(SENTENCE_LIST_URL, sentence_ark)
            download.download_file(AUDIO_LIST_URL, audio_ark)

            download.extract_tar(sentence_ark, temp_path)
            download.extract_tar(audio_ark, temp_path)

            audio_entries = self._load_audio_list(audio_list)
            sentences = self._load_sentence_list(sentence_list)

            # Only sentences that passed both the audio/license filter and the
            # language filter are kept.
            valid_sentence_ids = set(audio_entries).intersection(sentences)

            # sent-id, username, lang, transcript
            all_records = [
                (k, audio_entries[k][0], sentences[k][0], sentences[k][1])
                for k in valid_sentence_ids
            ]

            meta_path = os.path.join(target_path, META_FILENAME)
            textfile.write_separated_lines(meta_path, all_records, separator='\t', sort_by_column=0)

            self._download_audio_files(all_records, target_path)
        finally:
            # Always drop the temporary archives/lists, even if a download or
            # extraction step failed, so no partial data is left behind.
            shutil.rmtree(temp_path, ignore_errors=True)

    def _load_audio_list(self, path):
        """
        Load and filter the audio list.

        Args:
            path (str): Path to the audio list file.

        Returns:
            dict: Dictionary of filtered sentences (id : username, license, attribution-url)
        """
        result = {}

        for entry in textfile.read_separated_lines_generator(path, separator='\t', max_columns=4):
            # A literal '\N' field is treated as a missing value.
            fields = [None if value == '\\N' else value for value in entry]

            # Pad short rows so that all four columns can be addressed
            # (multiplying by a non-positive count adds nothing).
            fields.extend([None] * (4 - len(fields)))

            license_name = fields[2]

            if not self.include_empty_licence and license_name is None:
                continue

            # NOTE: an empty license (None) is also dropped here unless None is
            # itself contained in include_licenses.
            if self.include_licenses is not None and license_name not in self.include_licenses:
                continue

            result[fields[0]] = fields[1:]

        return result

    def _load_sentence_list(self, path):
        """
        Load and filter the sentence list.

        Args:
            path (str): Path to the sentence list.

        Returns:
            dict: Dictionary of sentences (id : language, transcription)
        """
        result = {}

        for entry in textfile.read_separated_lines_generator(path, separator='\t', max_columns=3):
            if self.include_languages is None or entry[1] in self.include_languages:
                result[entry[0]] = entry[1:]

        return result

    def _download_audio_files(self, records, target_path):
        """
        Download all audio files based on the given records.

        Every file is stored as ``<target_path>/audio/<language>/<sentence-id>.mp3``.

        Args:
            records (list): Records of the form (sentence-id, username, language, transcript).
            target_path (str): Folder below which the ``audio`` folder is created.
        """
        for record in records:
            sentence_id = record[0]
            language = record[2]

            audio_folder = os.path.join(target_path, 'audio', language)
            audio_file = os.path.join(audio_folder, '{}.mp3'.format(sentence_id))
            os.makedirs(audio_folder, exist_ok=True)

            download_url = 'https://audio.tatoeba.org/sentences/{}/{}.mp3'.format(language, sentence_id)
            download.download_file(download_url, audio_file)
class TatoebaReader(base.CorpusReader):
    """
    Reader for audio data downloaded with the Tatoeba downloader.
    """

    @classmethod
    def type(cls):
        """Return the identifier of this reader (``'tatoeba'``)."""
        return 'tatoeba'

    def _check_for_missing_files(self, path):
        """
        Check that the meta file exists in the given folder.

        Args:
            path (str): Folder of the corpus.

        Returns:
            list: Empty if the meta file exists, otherwise a list with the path of the meta file.
        """
        meta_file = os.path.join(path, META_FILENAME)

        if os.path.isfile(meta_file):
            return []

        return [meta_file]

    def _load(self, path):
        """
        Build a corpus from the meta file and the audio files in the given folder.

        For every record of the meta file one file, one utterance and
        (if not yet present) one speaker are added to the corpus.

        Args:
            path (str): Folder of the corpus.

        Returns:
            The loaded corpus.
        """
        corpus = audiomate.Corpus(path=path)

        meta_file = os.path.join(path, META_FILENAME)
        records = textfile.read_separated_lines_generator(meta_file, separator='\t', max_columns=4)

        for record in records:
            # Column layout of the meta file: sent-id, username, lang, transcript
            idx = record[0]
            speaker_idx = record[1]
            language = record[2]
            transcript = record[3]

            file_path = os.path.join(path, 'audio', language, '{}.mp3'.format(idx))
            corpus.new_file(file_path, idx)

            # A speaker is registered only the first time it is encountered.
            if speaker_idx not in corpus.issuers:
                issuer = issuers.Speaker(speaker_idx)
                corpus.import_issuers(issuer)

            # The utterance uses the sentence id both as its own id and as file id.
            utterance = corpus.new_utterance(idx, idx, speaker_idx)
            utterance.set_label_list(
                annotations.LabelList.create_single(
                    transcript,
                    idx=audiomate.corpus.LL_WORD_TRANSCRIPT_RAW
                )
            )

        return corpus