"""This module implements a class that download the data from \ opus nlp website. The data from the website can be very huge, especially for high \ ressourced languages. Moreover, we are only interested on a subset \ of the data for each language. For the class implemented in this module,\ instead of downloading all the data, it iterates over small chunks \ and only extract the extract the necessary number of sentences only on these chunks. """ import os import re from typing import Iterator import string import random from io import BytesIO from random import shuffle import gzip import yaml import requests from tqdm import tqdm from phonemizer.backend import EspeakBackend from phonemizer.separator import Separator random.seed(80) class DownloadOpenSubtitlesData : """ Class that download sentences from OpenSubtitles. Atributes --------- - version : str The version of the OpenSubtitles page of the opus.nlpl website. - base_url : str The opus.nlpl url where the data is stored for each language - total_sents : int Counter of sentences. """ def __init__(self, version="2018") : self.base_url = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw." self.separator = Separator(phone='$', word='@') self.total_sents = 0 def _remove_ponctuations(self, sentence: str) -> str : """ Method that removes ponctuations from a given sentence. Parameters ---------- - sent : str The sentence for which punctuations need to be removed Returns ------- - str : The sentence without punctuations. """ return sentence.translate(str.maketrans('', '', string.punctuation)) def _remove_brackets(self, sentence: str) -> str: """ Method that removes brackets from a given sentence. Parameters ---------- - sentence : str The sentence for which brackets need to be removed. Returns ------- - str : The sentence without brackets. """ return re.sub(r"[\(\[].*?[\)\]]", "", sentence) def get_sentences(self, language: str, max_sents_to_download: int, chunk: int=128) -> Iterator[tuple]: """ Function for getting sentences from opensubtitles for a given language\ and a number of sentences. Parameters ---------- - language : str The language for which to retrieve the sentences. - max_sents_to_process : str The number of sentences to retrieve. Returns ------- - Iterator : Iterator over sentences and progressbars """ # stream in order to not load all on memory response = requests.get(f"{self.base_url}{language}.gz", stream=True) # "chunk" increase as max_sents_to_process increases chunk_size = chunk * max_sents_to_download # iterator over chunks chunks = response.iter_content(chunk_size=chunk_size) with tqdm(total=max_sents_to_download) as progress_bar : progress_bar.set_description(f"Language={language}") while self.total_sents < max_sents_to_download : chunk = next(chunks) try : for sent in gzip.open(BytesIO(chunk), "rt") : if self.total_sents >= max_sents_to_download : break else : yield sent, progress_bar except : # if exception, this means the chunk size is too small for gzip print(f"The chunk size is to small for {max_sents_to_download}\ sentences to download") break def __call__(self, loaded_yaml_file, train_sentences, dev_sentences, chunk, out_dirname) -> None: """ Collect the sentences for all languages. Paramaters ---------- - loaded_yaml_file : dict This dictionary contains all informations relevant\ for this study, for each language. This dictionary also\ contains informations about espeak ids for the languages,\ and this is relevant for phonemization. 
        - train_sentences : int
            Number of sentences to download for the train corpora. This
            number is the same for all languages.
        - dev_sentences : int
            Number of sentences to download for the dev corpora. This
            number is the same for all languages.
        - chunk : int
            Multiplier controlling the size of each downloaded chunk.
        - out_dirname : str
            The folder where the outputs will be saved.
        """
        max_sents_to_download = train_sentences + dev_sentences
        for language in loaded_yaml_file:
            espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
            backend = EspeakBackend(language=espeak_language_id,
                                    language_switch="remove-utterance")
            added_sents = set()
            for sent, progress_bar in self.get_sentences(language,
                                                         max_sents_to_download,
                                                         chunk=chunk):
                sent = self._remove_punctuation(sent)
                sent = backend.phonemize([sent], separator=self.separator,
                                         strip=True)
                # The phonemizer will sometimes return a string containing
                # brackets; we have to remove them.
                sent = self._remove_brackets(*sent)
                sent = sent.strip()
                # Tokenization by phoneme.
                sent_phonemes = " ".join(phon for word in sent.split("@")
                                         for phon in word.split("$") if phon)
                # Skip empty phonemizations and duplicates.
                if sent_phonemes and sent_phonemes not in added_sents:
                    added_sents.add(sent_phonemes)
                    self.total_sents += 1
                    progress_bar.update(1)
            added_sents = list(added_sents)
            random.shuffle(added_sents)
            train = added_sents[:train_sentences]
            dev = added_sents[train_sentences:max_sents_to_download]
            with open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w") as output_file_train:
                for sent_train in train:
                    output_file_train.write(sent_train + "\n")
            with open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w") as output_file_dev:
                for sent_dev in dev:
                    output_file_dev.write(sent_dev + "\n")
            self.total_sents = 0


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--yaml_file",
                        help="YAML file containing, for each language, all "
                             "the information relevant for downloading the data.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    parser.add_argument("--chunk",
                        type=int,
                        help="Chunk size multiplier. Increase this number as "
                             "the number of sentences to download grows. 256 "
                             "is a good value when downloading 1,000,000 "
                             "sentences or fewer.",
                        default=1024,
                        required=False)
    parser.add_argument("--train_sentences",
                        type=int,
                        help="Number of sentences for the training corpora.",
                        default=200_000,
                        required=False)
    parser.add_argument("--dev_sentences",
                        type=int,
                        help="Number of sentences for the dev or test corpora.",
                        default=10_000,
                        required=False)
    args = parser.parse_args()

    out_dirname = args.out_dirname
    out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
    os.makedirs(f"{out_dirname}/tokenized_in_phonemes_train", exist_ok=True)
    os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev", exist_ok=True)

    with open(args.yaml_file) as yaml_file:
        languages_to_download_informations = yaml.safe_load(yaml_file)

    downloader = DownloadOpenSubtitlesData()
    downloader(languages_to_download_informations,
               args.train_sentences,
               args.dev_sentences,
               args.chunk,
               out_dirname)
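
# A hypothetical invocation (the script filename and paths below are
# assumptions for illustration; the flags are the ones defined above):
#
#   python download_opensubtitles_data.py \
#       --yaml_file languages.yaml \
#       --out_dirname data \
#       --chunk 1024 \
#       --train_sentences 200000 \
#       --dev_sentences 10000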