Browse Source

OpenSubtitles downloader: write only phoneme-tokenized output (drop word-tokenized files)

yaya-sy 1 year ago
parent
commit
24feb5e9fd
1 changed file with 0 additions and 6 deletions
  1. 0 6
      code/download_opensubtitles_corpora.py

+ 0 - 6
code/download_opensubtitles_corpora.py

@@ -141,7 +141,6 @@ class DownloadOpenSubtitlesData :
         max_sents_to_download = train_sentences + dev_sentences
         for language in loaded_yaml_file :
             output_file_train = open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w")
-            output_file_words = open(f"{out_dirname}/tokenized_in_words/{language}.one_sentence_per_line", "w")
             output_file_dev = open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w")
             espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
             backend = EspeakBackend(language=espeak_language_id, language_switch="remove-utterance")
@@ -154,9 +153,6 @@ class DownloadOpenSubtitlesData :
                 sent = sent.strip()
                 # tokenization by phoneme
                 sent_phonemes = " ".join(phon for word in sent.split("@") for phon in word.split("$") if phon)
-                sent_words = " ".join("".join(word.strip().split("$")) for word in sent.split("@"))
-                output_file_words.write(sent_words + "\n")
-
                 if sent_phonemes not in added_sents :
                     added_sents.add(sent_phonemes)
                     self.total_sents += 1
@@ -203,8 +199,6 @@ if __name__ == "__main__" :
     out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
     if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_train"):
         os.makedirs(f"{out_dirname}/tokenized_in_phonemes_train")
-    if not os.path.exists(f"{out_dirname}/tokenized_in_words"):
-        os.makedirs(f"{out_dirname}/tokenized_in_words")
     if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_dev"):
         os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
     languages_to_download_informations = yaml.safe_load(open(args.yaml_file))