download_opensubtitles_corpora.py

  1. """This module implements a class that download the data from \
  2. opus nlp website.
  3. The data from the website can be very huge, especially for high \
  4. ressourced languages. Moreover, we are only interested on a subset \
  5. of the data for each language. For the class implemented in this module,\
  6. instead of downloading all the data, it iterates over small chunks \
  7. and only extract the extract the necessary number of sentences only on these chunks.
  8. """
import os
from random import shuffle
import re
from io import BytesIO
import gzip
import yaml
import requests
from tqdm import tqdm
import string
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
from typing import Iterator
import random

random.seed(80)


class DownloadOpenSubtitlesData:
    """
    Class that downloads sentences from OpenSubtitles.

    Attributes
    ----------
    - version : str
        The version of the OpenSubtitles release on the opus.nlpl.eu website.
    - base_url : str
        The opus.nlpl.eu URL where the data is stored for each language.
    - total_sents : int
        Counter of sentences.
    """

    def __init__(self, version="2018"):
        self.base_url = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
        self.separator = Separator(phone='$', word='@')
        self.total_sents = 0
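        # For example, with version "2018" and language "fr", get_sentences
        # below requests (language code "fr" is only illustrative):
        # https://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fr.gz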

    def _remove_ponctuations(self, sentence: str) -> str:
        """
        Remove punctuation from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence from which punctuation should be removed.

        Returns
        -------
        - str :
            The sentence without punctuation.
        """
        return sentence.translate(str.maketrans('', '', string.punctuation))
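        # Illustrative behaviour (the example string is ours, not from the corpus):
        #   "Hello, world!" -> "Hello world"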

    def _remove_brackets(self, sentence: str) -> str:
        """
        Remove bracketed text from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence from which bracketed text should be removed.

        Returns
        -------
        - str :
            The sentence without bracketed text.
        """
        return re.sub(r"[\(\[].*?[\)\]]", "", sentence)
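        # Illustrative behaviour (the example string is ours, not from the corpus):
        #   "(sighs) He left [music]" -> " He left "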

    def get_sentences(self, language: str, max_sents_to_download: int, chunk: int = 128) -> Iterator[tuple]:
        """
        Get sentences from OpenSubtitles for a given language, up to a given
        number of sentences.

        Parameters
        ----------
        - language : str
            The language for which to retrieve the sentences.
        - max_sents_to_download : int
            The number of sentences to retrieve.
        - chunk : int
            Multiplier used to compute the size of the downloaded chunks.

        Returns
        -------
        - Iterator :
            Iterator over (sentence, progress bar) pairs.
        """
        # Stream the response so the whole archive is never loaded into memory.
        response = requests.get(f"{self.base_url}{language}.gz", stream=True)
        # The chunk size grows with the number of sentences to download.
        chunk_size = chunk * max_sents_to_download
        # Iterator over the raw, compressed chunks of the remote file.
        chunks = response.iter_content(chunk_size=chunk_size)
        with tqdm(total=max_sents_to_download) as progress_bar:
            progress_bar.set_description(f"Language={language}")
            while self.total_sents < max_sents_to_download:
                chunk_bytes = next(chunks, None)
                if chunk_bytes is None:
                    # The remote file was exhausted before reaching the target.
                    break
                try:
                    for sent in gzip.open(BytesIO(chunk_bytes), "rt"):
                        if self.total_sents >= max_sents_to_download:
                            break
                        else:
                            yield sent, progress_bar
                except Exception:
                    # If decompression fails, the chunk size is too small for
                    # the requested number of sentences.
                    print(f"The chunk size is too small to download {max_sents_to_download} sentences")
                    break
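    # Minimal usage sketch for this generator alone (values are illustrative):
    #   downloader = DownloadOpenSubtitlesData()
    #   for sent, bar in downloader.get_sentences("fr", max_sents_to_download=1000):
    #       ...  # `sent` is a raw OpenSubtitles line; the caller updates `bar`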

    def __call__(self, loaded_yaml_file, train_sentences, dev_sentences, chunk, out_dirname) -> None:
        """
        Collect the sentences for all languages.

        Parameters
        ----------
        - loaded_yaml_file : dict
            Dictionary containing, for each language, all the information
            relevant to this study, including the espeak ids needed for
            phonemization.
        - train_sentences : int
            Number of sentences to download for the train corpora. This number
            is the same for all languages.
        - dev_sentences : int
            Number of sentences to download for the dev corpora. This number
            is the same for all languages.
        - chunk : int
            Chunk size multiplier passed on to get_sentences.
        - out_dirname : str
            The folder where the outputs will be saved.
        """
        max_sents_to_download = train_sentences + dev_sentences
        for language in loaded_yaml_file:
            output_file_train = open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w")
            output_file_words = open(f"{out_dirname}/tokenized_in_words/{language}.one_sentence_per_line", "w")
            output_file_dev = open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w")
            espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
            backend = EspeakBackend(language=espeak_language_id, language_switch="remove-utterance")
            added_sents = set()
            for sent, progress_bar in self.get_sentences(language, max_sents_to_download, chunk=chunk):
                sent = self._remove_ponctuations(sent)
                sent = backend.phonemize([sent], separator=self.separator, strip=True)
                # The phonemizer sometimes returns strings containing brackets;
                # they have to be removed.
                sent = self._remove_brackets(*sent)
                sent = sent.strip()
                # Tokenization by phoneme and by word.
                sent_phonemes = " ".join(phon for word in sent.split("@") for phon in word.split("$") if phon)
                sent_words = " ".join("".join(word.strip().split("$")) for word in sent.split("@"))
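                # Illustrative example of the two tokenizations, assuming the
                # phonemizer returned "h$ə$l$oʊ@w$ɜː$l$d" for "hello world":
                #   sent_phonemes -> "h ə l oʊ w ɜː l d"
                #   sent_words    -> "həloʊ wɜːld"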
                output_file_words.write(sent_words + "\n")
                if sent_phonemes not in added_sents:
                    added_sents.add(sent_phonemes)
                    self.total_sents += 1
                    progress_bar.update(1)
            added_sents = list(added_sents)
            shuffle(added_sents)
            train = added_sents[:train_sentences]
            dev = added_sents[train_sentences:max_sents_to_download]
            for sent_train in train:
                output_file_train.write(sent_train + "\n")
            for sent_dev in dev:
                output_file_dev.write(sent_dev + "\n")
            # Close the per-language output files so the data is flushed to disk.
            output_file_train.close()
            output_file_words.close()
            output_file_dev.close()
            self.total_sents = 0
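    # Files produced per language by __call__ (paths relative to out_dirname,
    # as written by the code above):
    #   tokenized_in_phonemes_train/<language>.one_sentence_per_line
    #   tokenized_in_words/<language>.one_sentence_per_line
    #   tokenized_in_phonemes_dev/<language>.one_sentence_per_line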


if __name__ == "__main__":
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("--yaml_file",
                        help="YAML file containing, for each language, all the information needed to download the data.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    parser.add_argument("--chunk",
                        help="Chunk size multiplier. Increase it when downloading more sentences; 256 is a good value for 1_000_000 sentences or fewer.",
                        type=int,
                        default=1024,
                        required=False)
    parser.add_argument("--train_sentences",
                        help="Number of sentences for the train corpora.",
                        type=int,
                        default=200_000,
                        required=False)
    parser.add_argument("--dev_sentences",
                        help="Number of sentences for the dev corpora.",
                        type=int,
                        default=10_000,
                        required=False)
    args = parser.parse_args()
    yaml_file = args.yaml_file
    chunk = args.chunk
    out_dirname = args.out_dirname
    out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
    if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_train"):
        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_train")
    if not os.path.exists(f"{out_dirname}/tokenized_in_words"):
        os.makedirs(f"{out_dirname}/tokenized_in_words")
    if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_dev"):
        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
    with open(args.yaml_file) as yaml_stream:
        languages_to_download_informations = yaml.safe_load(yaml_stream)
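    # The YAML file is expected to map each language code to its metadata;
    # a minimal sketch consistent with how it is read above (the language
    # codes and espeak ids below are only illustrative):
    #   fr:
    #     espeak_language_id: fr-fr
    #   de:
    #     espeak_language_id: de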
    downloader = DownloadOpenSubtitlesData()
    downloader(languages_to_download_informations, args.train_sentences, args.dev_sentences, chunk, out_dirname)