@@ -141,7 +141,6 @@ class DownloadOpenSubtitlesData :
         max_sents_to_download = train_sentences + dev_sentences
         for language in loaded_yaml_file :
             output_file_train = open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w")
-            output_file_words = open(f"{out_dirname}/tokenized_in_words/{language}.one_sentence_per_line", "w")
             output_file_dev = open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w")
             espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
             backend = EspeakBackend(language=espeak_language_id, language_switch="remove-utterance")
@@ -154,9 +153,6 @@ class DownloadOpenSubtitlesData :
                 sent = sent.strip()
                 # tokenization by phoneme
                 sent_phonemes = " ".join(phon for word in sent.split("@") for phon in word.split("$") if phon)
-                sent_words = " ".join("".join(word.strip().split("$")) for word in sent.split("@"))
-                output_file_words.write(sent_words + "\n")
-
                 if sent_phonemes not in added_sents :
                     added_sents.add(sent_phonemes)
                     self.total_sents += 1
@@ -203,8 +199,6 @@ if __name__ == "__main__" :
     out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
     if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_train"):
         os.makedirs(f"{out_dirname}/tokenized_in_phonemes_train")
-    if not os.path.exists(f"{out_dirname}/tokenized_in_words"):
-        os.makedirs(f"{out_dirname}/tokenized_in_words")
     if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_dev"):
        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
     languages_to_download_informations = yaml.safe_load(open(args.yaml_file))