"""This module implements a class that downloads data from the
OPUS (opus.nlpl.eu) website.

The data hosted there can be very large, especially for
high-resourced languages, and we are only interested in a subset
of the data for each language. Instead of downloading everything,
the class implemented in this module iterates over small chunks
of the remote file and extracts only the required number of
sentences from those chunks.
"""
- import os
- import re
- from typing import Iterator
- import string
- import random
- from io import BytesIO
- from random import shuffle
- import gzip
- import yaml
- import requests
- from tqdm import tqdm
- from phonemizer.backend import EspeakBackend
- from phonemizer.separator import Separator
- random.seed(80)
class DownloadOpenSubtitlesData:
    """
    Download sentences from the OpenSubtitles corpus on opus.nlpl.eu.

    The remote corpus is streamed in chunks; only the requested number
    of sentences is decompressed and kept, so huge corpora are never
    fully downloaded.

    Attributes
    ----------
    - base_url : str
        The opus.nlpl.eu URL prefix where the monolingual data is stored;
        the language code and ".gz" suffix are appended per request.
    - separator : Separator
        Phonemizer separator: '$' marks phone boundaries, '@' word boundaries.
    - total_sents : int
        Counter of sentences collected so far for the current language.
    """

    def __init__(self, version="2018"):
        self.base_url = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
        self.separator = Separator(phone='$', word='@')
        self.total_sents = 0

    def _remove_ponctuations(self, sentence: str) -> str:
        """
        Remove punctuation characters from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence from which punctuation must be removed.

        Returns
        -------
        - str :
            The sentence without punctuation.
        """
        return sentence.translate(str.maketrans('', '', string.punctuation))

    def _remove_brackets(self, sentence: str) -> str:
        """
        Remove bracketed and parenthesized spans from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence from which bracketed spans must be removed.

        Returns
        -------
        - str :
            The sentence without bracketed spans.
        """
        return re.sub(r"[\(\[].*?[\)\]]", "", sentence)

    def get_sentences(self, language: str,
                      max_sents_to_download: int,
                      chunk: int = 128) -> Iterator[tuple]:
        """
        Yield sentences from OpenSubtitles for a given language.

        Parameters
        ----------
        - language : str
            The language for which to retrieve the sentences.
        - max_sents_to_download : int
            The number of sentences to retrieve.
        - chunk : int
            Multiplier controlling the size of each streamed chunk.

        Yields
        ------
        - tuple :
            Pairs of (sentence, progress bar).
        """
        # Stream so the (possibly huge) corpus is never fully held in memory.
        response = requests.get(f"{self.base_url}{language}.gz", stream=True)
        # Chunk size grows with the number of sentences requested.
        chunk_size = chunk * max_sents_to_download
        chunks = response.iter_content(chunk_size=chunk_size)
        with tqdm(total=max_sents_to_download) as progress_bar:
            progress_bar.set_description(f"Language={language}")
            while self.total_sents < max_sents_to_download:
                # next(..., None) avoids StopIteration inside this generator,
                # which PEP 479 would turn into a RuntimeError.
                data = next(chunks, None)
                if data is None:
                    # Stream exhausted before reaching the requested count.
                    break
                try:
                    for sent in gzip.open(BytesIO(data), "rt"):
                        if self.total_sents >= max_sents_to_download:
                            break
                        yield sent, progress_bar
                # gzip raises OSError/BadGzipFile/EOFError when the chunk is
                # too small to contain a decodable gzip stream.
                except (OSError, EOFError):
                    print(f"The chunk size is too small for {max_sents_to_download} sentences to download")
                    break

    def __call__(self,
                 loaded_yaml_file,
                 train_sentences,
                 dev_sentences,
                 chunk,
                 out_dirname) -> None:
        """
        Collect the sentences for all languages.

        Parameters
        ----------
        - loaded_yaml_file : dict
            Per-language information relevant for this study, including
            the espeak language ids needed for phonemization.
        - train_sentences : int
            Number of sentences to download for the train corpora
            (the same for all languages).
        - dev_sentences : int
            Number of sentences to download for the dev corpora
            (the same for all languages).
        - chunk : int
            Chunk size multiplier forwarded to get_sentences.
        - out_dirname : str
            The folder where the outputs will be saved.
        """
        max_sents_to_download = train_sentences + dev_sentences
        for language in loaded_yaml_file:
            espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
            backend = EspeakBackend(language=espeak_language_id,
                                    language_switch="remove-utterance")
            added_sents = set()
            # Context managers ensure the three output files are closed
            # (and flushed) even if phonemization or the download fails.
            with open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w") as output_file_train, \
                 open(f"{out_dirname}/tokenized_in_words/{language}.one_sentence_per_line", "w") as output_file_words, \
                 open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w") as output_file_dev:
                for sent, progress_bar in self.get_sentences(language,
                                                             max_sents_to_download,
                                                             chunk=chunk):
                    sent = self._remove_ponctuations(sent)
                    sent = backend.phonemize([sent], separator=self.separator, strip=True)
                    # Phonemizer sometimes returns strings with brackets;
                    # remove them.
                    sent = self._remove_brackets(*sent)
                    sent = sent.strip()
                    # Tokenization by phoneme: split words on '@', phones on '$'.
                    sent_phonemes = " ".join(phon for word in sent.split("@")
                                             for phon in word.split("$") if phon)
                    sent_words = " ".join("".join(word.strip().split("$"))
                                          for word in sent.split("@"))
                    output_file_words.write(sent_words + "\n")
                    # Only unique phonemized sentences count toward the quota.
                    if sent_phonemes not in added_sents:
                        added_sents.add(sent_phonemes)
                        self.total_sents += 1
                        progress_bar.update(1)
                added_sents = list(added_sents)
                shuffle(added_sents)
                train = added_sents[:train_sentences]
                dev = added_sents[train_sentences:max_sents_to_download]
                output_file_train.writelines(sent_train + "\n" for sent_train in train)
                output_file_dev.writelines(sent_dev + "\n" for sent_dev in dev)
            # Reset the counter before moving on to the next language.
            self.total_sents = 0
if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--yaml_file",
                        help="YAML File containing for each language,\
                        all relevant information for downloading the data.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    # type=int is required: without it, values given on the command line
    # arrive as strings, making chunk * n perform string repetition and
    # train_sentences + dev_sentences perform string concatenation.
    parser.add_argument("--chunk",
                        help="For the chunk size. This number should\
                        grow as much as you want to download many sentences.\
                        256 is a good number when you want to get 1_000_000 or less sentences",
                        type=int,
                        default=1024,
                        required=False)
    parser.add_argument("--train_sentences",
                        help="Number of sent for the training corpora.",
                        type=int,
                        default=200_000,
                        required=False)
    parser.add_argument("--dev_sentences",
                        help="Number of sent for the dev or test corpora.",
                        type=int,
                        default=10_000,
                        required=False)
    args = parser.parse_args()
    out_dirname = args.out_dirname
    # Normalize away a single trailing slash so path joins stay clean.
    out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    for subdir in ("tokenized_in_phonemes_train",
                   "tokenized_in_words",
                   "tokenized_in_phonemes_dev"):
        os.makedirs(f"{out_dirname}/{subdir}", exist_ok=True)
    # Close the YAML file handle deterministically.
    with open(args.yaml_file) as yaml_stream:
        languages_to_download_informations = yaml.safe_load(yaml_stream)
    downloader = DownloadOpenSubtitlesData()
    downloader(languages_to_download_informations,
               args.train_sentences,
               args.dev_sentences,
               args.chunk,
               out_dirname)