download_opensubtitles_corpora.py

  1. """This module implements a class that download the data from \
  2. opus nlp website.
  3. The data from the website can be very huge, especially for high \
  4. ressourced languages. Moreover, we are only interested on a subset \
  5. of the data for each language. For the class implemented in this module,\
  6. instead of downloading all the data, it iterates over small chunks \
  7. and only extract the extract the necessary number of sentences only on these chunks.
  8. """
import os
import re
import gzip
import string
import random
from io import BytesIO
from random import shuffle
from typing import Iterator

import yaml
import requests
from tqdm import tqdm
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator

random.seed(80)


class DownloadOpenSubtitlesData:
    """
    Class that downloads sentences from OpenSubtitles.

    Attributes
    ----------
    - version : str
        The version of the OpenSubtitles page on the opus.nlpl.eu website.
    - base_url : str
        The opus.nlpl.eu URL where the data is stored for each language.
    - total_sents : int
        Counter of downloaded sentences.
    """

    def __init__(self, version="2018"):
        self.base_url = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
        self.separator = Separator(phone='$', word='@')
        self.total_sents = 0

    def _remove_punctuation(self, sentence: str) -> str:
        """
        Remove punctuation from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence from which punctuation is to be removed.

        Returns
        -------
        - str :
            The sentence without punctuation.
        """
        return sentence.translate(str.maketrans('', '', string.punctuation))

    def _remove_brackets(self, sentence: str) -> str:
        """
        Remove bracketed text from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence from which bracketed text is to be removed.

        Returns
        -------
        - str :
            The sentence without brackets or their contents.
        """
        return re.sub(r"[\(\[].*?[\)\]]", "", sentence)

    def get_sentences(self, language: str,
                      max_sents_to_download: int,
                      chunk: int = 128) -> Iterator[tuple]:
        """
        Get sentences from OpenSubtitles for a given language
        and a given number of sentences.

        Parameters
        ----------
        - language : str
            The language for which to retrieve the sentences.
        - max_sents_to_download : int
            The number of sentences to retrieve.
        - chunk : int
            Multiplier that controls the size of the streamed chunks.

        Yields
        ------
        - tuple :
            A sentence and the progress bar.
        """
        # stream in order to not load everything into memory
        response = requests.get(f"{self.base_url}{language}.gz", stream=True)
        # the chunk size increases as max_sents_to_download increases
        chunk_size = chunk * max_sents_to_download
        # iterator over chunks
        chunks = response.iter_content(chunk_size=chunk_size)
        with tqdm(total=max_sents_to_download) as progress_bar:
            progress_bar.set_description(f"Language={language}")
            while self.total_sents < max_sents_to_download:
                data = next(chunks, None)
                if data is None:  # the stream is exhausted
                    break
                try:
                    for sent in gzip.open(BytesIO(data), "rt"):
                        if self.total_sents >= max_sents_to_download:
                            break
                        yield sent, progress_bar
                except Exception:
                    # if decompression fails, the chunk size is too small
                    print(f"The chunk size is too small for "
                          f"{max_sents_to_download} sentences to download")
                    break

    def __call__(self,
                 loaded_yaml_file,
                 train_sentences,
                 dev_sentences,
                 chunk,
                 out_dirname) -> None:
        """
        Collect the sentences for all languages.

        Parameters
        ----------
        - loaded_yaml_file : dict
            This dictionary contains all information relevant to this study
            for each language, including the espeak ids of the languages,
            which are needed for phonemization.
        - train_sentences : int
            Number of sentences to download for the train corpora.
            This number is the same for all languages.
        - dev_sentences : int
            Number of sentences to download for the dev corpora.
            This number is the same for all languages.
        - chunk : int
            Multiplier that controls the size of the streamed chunks.
        - out_dirname : str
            The folder where the outputs will be saved.
        """
        max_sents_to_download = train_sentences + dev_sentences
        for language in loaded_yaml_file:
            output_file_train = open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w")
            output_file_dev = open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w")
            espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
            backend = EspeakBackend(language=espeak_language_id, language_switch="remove-utterance")
            added_sents = set()
            for sent, progress_bar in self.get_sentences(language, max_sents_to_download, chunk=chunk):
                sent = self._remove_punctuation(sent)
                sent = backend.phonemize([sent], separator=self.separator, strip=True)
                # phonemizer sometimes returns strings with brackets; remove them
                sent = self._remove_brackets(*sent)
                sent = sent.strip()
                # tokenization by phoneme: split on word and phone separators
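                # e.g. with Separator(phone='$', word='@'), a phonemized string
                # roughly like "h$ə$l$oʊ@w$ɜː$l$d" becomes "h ə l oʊ w ɜː l d"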
                sent_phonemes = " ".join(phon for word in sent.split("@") for phon in word.split("$") if phon)
                if sent_phonemes not in added_sents:
                    added_sents.add(sent_phonemes)
                    self.total_sents += 1
                    progress_bar.update(1)
            added_sents = list(added_sents)
            shuffle(added_sents)
            train = added_sents[:train_sentences]
            dev = added_sents[train_sentences:max_sents_to_download]
            for sent_train in train:
                output_file_train.write(sent_train + "\n")
            for sent_dev in dev:
                output_file_dev.write(sent_dev + "\n")
            output_file_train.close()
            output_file_dev.close()
            self.total_sents = 0


if __name__ == "__main__":
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--yaml_file",
                        help="YAML file containing, for each language, \
                        all information relevant for downloading the data.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    parser.add_argument("--chunk",
                        help="Chunk size multiplier. Increase this number \
                        when downloading more sentences; 256 is a good value \
                        for up to 1_000_000 sentences.",
                        type=int,
                        default=1024,
                        required=False)
    parser.add_argument("--train_sentences",
                        help="Number of sentences for the training corpora.",
                        type=int,
                        default=200_000,
                        required=False)
    parser.add_argument("--dev_sentences",
                        help="Number of sentences for the dev or test corpora.",
                        type=int,
                        default=10_000,
                        required=False)
    args = parser.parse_args()
    chunk = args.chunk
    out_dirname = args.out_dirname
    out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
    if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_train"):
        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_train")
    if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_dev"):
        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
    with open(args.yaml_file) as yaml_stream:
        languages_to_download_informations = yaml.safe_load(yaml_stream)
    downloader = DownloadOpenSubtitlesData()
    downloader(languages_to_download_informations,
               args.train_sentences,
               args.dev_sentences,
               chunk,
               out_dirname)
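
# Example invocation (illustrative; the YAML file name is an assumption):
#   python download_opensubtitles_corpora.py --yaml_file languages.yaml \
#       --out_dirname data --train_sentences 200000 --dev_sentences 10000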