download_opensubtitles_corpora.py

  1. """This module implements a class that download the data from \
  2. opus nlp website.
  3. The data from the website can be very huge, especially for high \
  4. ressourced languages. Moreover, we are only interested on a subset \
  5. of the data for each language. For the class implemented in this module,\
  6. instead of downloading all the data, it iterates over small chunks \
  7. and only extract the extract the necessary number of sentences only on these chunks.
  8. """
import os
from random import shuffle
import re
from io import BytesIO
import gzip
import yaml
import requests
from tqdm import tqdm
import string
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
from typing import Iterator
import random

random.seed(80)


class DownloadOpenSubtitlesData:
    """
    Class that downloads sentences from OpenSubtitles.

    Attributes
    ----------
    - version : str
        The version of the OpenSubtitles release on the opus.nlpl.eu website.
    - base_url : str
        The opus.nlpl.eu URL where the data is stored for each language.
    - total_sents : int
        Counter of sentences.
    """

    def __init__(self, version="2018"):
        self.base_url = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
        self.separator = Separator(phone='$', word='@')
        self.total_sents = 0
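        # For example, with version "2018" and language "fr", get_sentences
        # below requests (language code "fr" is only illustrative):
        # https://opus.nlpl.eu/download.php?f=OpenSubtitles/v2018/mono/OpenSubtitles.raw.fr.gz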

    def _remove_ponctuations(self, sentence: str) -> str:
        """
        Remove punctuation from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence from which punctuation should be removed.

        Returns
        -------
        - str :
            The sentence without punctuation.
        """
        return sentence.translate(str.maketrans('', '', string.punctuation))
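        # Illustrative behaviour (the example string is ours, not from the corpus):
        #   "Hello, world!" -> "Hello world"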

    def _remove_brackets(self, sentence: str) -> str:
        """
        Remove bracketed text from a given sentence.

        Parameters
        ----------
        - sentence : str
            The sentence from which bracketed text should be removed.

        Returns
        -------
        - str :
            The sentence without bracketed text.
        """
        return re.sub(r"[\(\[].*?[\)\]]", "", sentence)
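        # Illustrative behaviour (the example string is ours, not from the corpus):
        #   "(sighs) He left [music]" -> " He left "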

    def get_sentences(self, language: str, max_sents_to_download: int, chunk: int = 128) -> Iterator[tuple]:
        """
        Get sentences from OpenSubtitles for a given language, up to a given
        number of sentences.

        Parameters
        ----------
        - language : str
            The language for which to retrieve the sentences.
        - max_sents_to_download : int
            The number of sentences to retrieve.
        - chunk : int
            Multiplier used to compute the size of the downloaded chunks.

        Returns
        -------
        - Iterator :
            Iterator over (sentence, progress bar) pairs.
        """
        # Stream the response so the whole archive is never loaded into memory.
        response = requests.get(f"{self.base_url}{language}.gz", stream=True)
        # The chunk size grows with the number of sentences to download.
        chunk_size = chunk * max_sents_to_download
        # Iterator over the raw, compressed chunks of the remote file.
        chunks = response.iter_content(chunk_size=chunk_size)
        with tqdm(total=max_sents_to_download) as progress_bar:
            progress_bar.set_description(f"Language={language}")
            while self.total_sents < max_sents_to_download:
                chunk_bytes = next(chunks, None)
                if chunk_bytes is None:
                    # The remote file was exhausted before reaching the target.
                    break
                try:
                    for sent in gzip.open(BytesIO(chunk_bytes), "rt"):
                        if self.total_sents >= max_sents_to_download:
                            break
                        else:
                            yield sent, progress_bar
                except Exception:
                    # If decompression fails, the chunk size is too small for
                    # the requested number of sentences.
                    print(f"The chunk size is too small to download {max_sents_to_download} sentences")
                    break
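    # Minimal usage sketch for this generator alone (values are illustrative):
    #   downloader = DownloadOpenSubtitlesData()
    #   for sent, bar in downloader.get_sentences("fr", max_sents_to_download=1000):
    #       ...  # `sent` is a raw OpenSubtitles line; the caller updates `bar`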

    def __call__(self, loaded_yaml_file, train_sentences, dev_sentences, chunk, out_dirname) -> None:
        """
        Collect the sentences for all languages.

        Parameters
        ----------
        - loaded_yaml_file : dict
            Dictionary containing, for each language, all the information
            relevant to this study, including the espeak ids needed for
            phonemization.
        - train_sentences : int
            Number of sentences to download for the train corpora. This number
            is the same for all languages.
        - dev_sentences : int
            Number of sentences to download for the dev corpora. This number
            is the same for all languages.
        - chunk : int
            Chunk size multiplier passed on to get_sentences.
        - out_dirname : str
            The folder where the outputs will be saved.
        """
        max_sents_to_download = train_sentences + dev_sentences
        for language in loaded_yaml_file:
            output_file_train = open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w")
            output_file_words = open(f"{out_dirname}/tokenized_in_words/{language}.one_sentence_per_line", "w")
            output_file_dev = open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w")
            espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
            backend = EspeakBackend(language=espeak_language_id, language_switch="remove-utterance")
            added_sents = set()
            for sent, progress_bar in self.get_sentences(language, max_sents_to_download, chunk=chunk):
                sent = self._remove_ponctuations(sent)
                sent = backend.phonemize([sent], separator=self.separator, strip=True)
                # The phonemizer sometimes returns strings containing brackets;
                # they have to be removed.
                sent = self._remove_brackets(*sent)
                sent = sent.strip()
                # Tokenization by phoneme and by word.
                sent_phonemes = " ".join(phon for word in sent.split("@") for phon in word.split("$") if phon)
                sent_words = " ".join("".join(word.strip().split("$")) for word in sent.split("@"))
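                # Illustrative example of the two tokenizations, assuming the
                # phonemizer returned "h$ə$l$oʊ@w$ɜː$l$d" for "hello world":
                #   sent_phonemes -> "h ə l oʊ w ɜː l d"
                #   sent_words    -> "həloʊ wɜːld"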
                output_file_words.write(sent_words + "\n")
                if sent_phonemes not in added_sents:
                    added_sents.add(sent_phonemes)
                    self.total_sents += 1
                    progress_bar.update(1)
            added_sents = list(added_sents)
            shuffle(added_sents)
            train = added_sents[:train_sentences]
            dev = added_sents[train_sentences:max_sents_to_download]
            for sent_train in train:
                output_file_train.write(sent_train + "\n")
            for sent_dev in dev:
                output_file_dev.write(sent_dev + "\n")
            # Close the per-language output files so the data is flushed to disk.
            output_file_train.close()
            output_file_words.close()
            output_file_dev.close()
            self.total_sents = 0
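    # Files produced per language by __call__ (paths relative to out_dirname,
    # as written by the code above):
    #   tokenized_in_phonemes_train/<language>.one_sentence_per_line
    #   tokenized_in_words/<language>.one_sentence_per_line
    #   tokenized_in_phonemes_dev/<language>.one_sentence_per_line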


if __name__ == "__main__":
    from argparse import ArgumentParser
    parser = ArgumentParser()
    parser.add_argument("--yaml_file",
                        help="YAML file containing, for each language, all the information needed to download the data.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    parser.add_argument("--chunk",
                        help="Chunk size multiplier. Increase it when downloading more sentences; 256 is a good value for 1_000_000 sentences or fewer.",
                        type=int,
                        default=1024,
                        required=False)
    parser.add_argument("--train_sentences",
                        help="Number of sentences for the train corpora.",
                        type=int,
                        default=200_000,
                        required=False)
    parser.add_argument("--dev_sentences",
                        help="Number of sentences for the dev corpora.",
                        type=int,
                        default=10_000,
                        required=False)
    args = parser.parse_args()
    yaml_file = args.yaml_file
    chunk = args.chunk
    out_dirname = args.out_dirname
    out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
    if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_train"):
        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_train")
    if not os.path.exists(f"{out_dirname}/tokenized_in_words"):
        os.makedirs(f"{out_dirname}/tokenized_in_words")
    if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_dev"):
        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
    with open(args.yaml_file) as yaml_stream:
        languages_to_download_informations = yaml.safe_load(yaml_stream)
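    # The YAML file is expected to map each language code to its metadata;
    # a minimal sketch consistent with how it is read above (the language
    # codes and espeak ids below are only illustrative):
    #   fr:
    #     espeak_language_id: fr-fr
    #   de:
    #     espeak_language_id: de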
    downloader = DownloadOpenSubtitlesData()
    downloader(languages_to_download_informations, args.train_sentences, args.dev_sentences, chunk, out_dirname)