|
@@ -0,0 +1,200 @@
|
|
|
"""This module implements a class that downloads data from the \
OPUS NLP website.

    The data on the website can be very large, especially for \
    high-resourced languages. Moreover, we are only interested in a subset \
    of the data for each language. Instead of downloading all the data, \
    the class implemented in this module iterates over small chunks \
    and extracts only the necessary number of sentences from these chunks.
"""
|
|
|
+import os
|
|
|
+from random import shuffle
|
|
|
+import re
|
|
|
+from io import BytesIO
|
|
|
+import gzip
|
|
|
+import yaml
|
|
|
+import requests
|
|
|
+from tqdm import tqdm
|
|
|
+import string
|
|
|
+from phonemizer.backend import EspeakBackend
|
|
|
+from phonemizer.separator import Separator
|
|
|
+from typing import Iterator
|
|
|
+import random
|
|
|
+random.seed(80)
|
|
|
+
|
|
|
class DownloadOpenSubtitlesData:
    """
    Class that downloads sentences from OpenSubtitles on opus.nlpl.eu.

    Attributes
    ----------
    - base_url : str
        The opus.nlpl url where the data is stored for each language;
        the language code and ".gz" suffix are appended per request.
    - separator : Separator
        Phonemizer separator: '$' between phones, '@' between words.
    - total_sents : int
        Counter of unique sentences collected for the current language.
    """

    def __init__(self, version="2018"):
        """
        Parameters
        ----------
        - version : str
            The version of the OpenSubtitles release on the opus.nlpl website.
        """
        self.base_url = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
        self.separator = Separator(phone='$', word='@')
        self.total_sents = 0

    def _remove_ponctuations(self, sentence: str) -> str:
        """
        Remove punctuation characters from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence for which punctuations need to be removed.

        Returns
        -------
        - str :
            The sentence without punctuations.
        """
        # str.translate does the removal in a single C-level pass.
        return sentence.translate(str.maketrans('', '', string.punctuation))

    def _remove_brackets(self, sentence: str) -> str:
        """
        Remove bracketed spans (brackets included) from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence for which brackets need to be removed.

        Returns
        -------
        - str :
            The sentence without bracketed spans.
        """
        # Non-greedy match so each bracketed group is removed independently.
        return re.sub(r"[\(\[].*?[\)\]]", "", sentence)

    def get_sentences(self, language: str, max_sents_to_download: int, chunk: int = 128) -> Iterator[tuple]:
        """
        Yield sentences from OpenSubtitles for a given language.

        Parameters
        ----------
        - language : str
            The language for which to retrieve the sentences.
        - max_sents_to_download : int
            The number of sentences to retrieve.
        - chunk : int
            Multiplier for the streamed chunk size; should grow with
            max_sents_to_download.

        Returns
        -------
        - Iterator :
            Iterator over (sentence, progress_bar) pairs; the caller
            updates the progress bar when it accepts a sentence.
        """
        # Stream so the (potentially huge) archive is never fully in memory;
        # the context manager guarantees the connection is released.
        with requests.get(f"{self.base_url}{language}.gz", stream=True) as response:
            # "chunk_size" grows with the number of requested sentences so the
            # first chunk usually contains enough data on its own.
            chunk_size = chunk * max_sents_to_download
            chunks = response.iter_content(chunk_size=chunk_size)
            with tqdm(total=max_sents_to_download) as progress_bar:
                progress_bar.set_description(f"Language={language}")
                while self.total_sents < max_sents_to_download:
                    # PEP 479: StopIteration must not leak out of a generator,
                    # so stream exhaustion is handled explicitly.
                    try:
                        data = next(chunks)
                    except StopIteration:
                        break
                    try:
                        for sent in gzip.open(BytesIO(data), "rt"):
                            if self.total_sents >= max_sents_to_download:
                                break
                            yield sent, progress_bar
                    # Only the first chunk starts with a gzip header, so a
                    # decompression/decoding failure means the chunk size was
                    # too small to serve the whole request from one chunk.
                    except (OSError, EOFError, UnicodeDecodeError):
                        print(f"The chunk size is too small for {max_sents_to_download} sentences to download")
                        break

    def __call__(self, loaded_yaml_file, train_sentences, dev_sentences, chunk, out_dirname) -> None:
        """
        Collect, phonemize and save the sentences for all languages.

        Parameters
        ----------
        - loaded_yaml_file : dict
            This dictionary contains all informations relevant for this study,
            for each language, including the espeak id used for phonemization.
        - train_sentences : int
            Number of sentences to download for train corpora. This number is
            the same for all languages.
        - dev_sentences : int
            Number of sentences to download for dev corpora. This number is
            the same for all languages.
        - chunk : int
            Chunk-size multiplier forwarded to get_sentences.
        - out_dirname : str
            The folder where the outputs will be saved.
        """
        max_sents_to_download = train_sentences + dev_sentences
        for language in loaded_yaml_file:
            espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
            backend = EspeakBackend(language=espeak_language_id)
            # Context managers close the three per-language output files
            # (the original open() calls leaked the file handles).
            with open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w") as output_file_train, \
                 open(f"{out_dirname}/tokenized_in_words/{language}.one_sentence_per_line", "w") as output_file_words, \
                 open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w") as output_file_dev:
                added_sents = set()
                for sent, progress_bar in self.get_sentences(language, max_sents_to_download, chunk=chunk):
                    sent = self._remove_ponctuations(sent)
                    sent = backend.phonemize([sent], separator=self.separator, strip=True)
                    # phonemizer will sometimes return string with brackets,
                    # we have to remove them.
                    sent = self._remove_brackets(*sent)
                    sent = sent.strip()
                    # tokenization by phoneme ('$' separates phones, '@' words)
                    sent_phonemes = " ".join(phon for word in sent.split("@") for phon in word.split("$") if phon)
                    sent_words = " ".join("".join(word.strip().split("$")) for word in sent.split("@"))
                    output_file_words.write(sent_words + "\n")

                    # only unique phonemized sentences count toward the target
                    if sent_phonemes not in added_sents:
                        added_sents.add(sent_phonemes)
                        self.total_sents += 1
                        progress_bar.update(1)
                added_sents = list(added_sents)
                shuffle(added_sents)
                train = added_sents[:train_sentences]
                dev = added_sents[train_sentences:max_sents_to_download]
                output_file_train.writelines(sent_train + "\n" for sent_train in train)
                output_file_dev.writelines(sent_dev + "\n" for sent_dev in dev)
            # reset the counter before processing the next language
            self.total_sents = 0
|
|
|
+
|
|
|
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--yaml_file",
                        help="YAML File containing for each language, all relevant information for downloading the data.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    # type=int is required on the numeric options: without it argparse hands
    # back strings, and e.g. chunk * max_sents_to_download would repeat a
    # string instead of multiplying numbers.
    parser.add_argument("--chunk",
                        help="For the chunk size. This number should grow as much as you want to download many sentences.\
                            256 is a good number when you want to get 1_000_000 or less sentences",
                        type=int,
                        default=1024,
                        required=False)
    parser.add_argument("--train_sentences",
                        help="Number of sent for the train corpora.",
                        type=int,
                        default=500_000,
                        required=False)
    parser.add_argument("--dev_sentences",
                        help="Number of sent for the dev copora.",
                        type=int,
                        default=10_000,
                        required=False)
    args = parser.parse_args()

    # normalize a trailing slash so path joining below stays clean
    out_dirname = args.out_dirname
    out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    for subdir in ("tokenized_in_phonemes_train", "tokenized_in_words", "tokenized_in_phonemes_dev"):
        os.makedirs(f"{out_dirname}/{subdir}", exist_ok=True)

    # close the YAML file deterministically (the original leaked the handle)
    with open(args.yaml_file) as yaml_stream:
        languages_to_download_informations = yaml.safe_load(yaml_stream)

    downloader = DownloadOpenSubtitlesData()
    downloader(languages_to_download_informations, args.train_sentences, args.dev_sentences, args.chunk, out_dirname)
|