
re-downloaded opensubtitles data

yaya-sy 1 year ago
commit 9fef24bf15
62 changed files with 225 additions and 25 deletions
  1. 200 0
      code/download_opensubtitles_corpora.py
  2. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/da.one_sentence_per_line
  3. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/de.one_sentence_per_line
  4. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/en.one_sentence_per_line
  5. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/es.one_sentence_per_line
  6. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/et.one_sentence_per_line
  7. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/eu.one_sentence_per_line
  8. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/fr.one_sentence_per_line
  9. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/ja.one_sentence_per_line
  10. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/pl.one_sentence_per_line
  11. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/pt.one_sentence_per_line
  12. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/sr.one_sentence_per_line
  13. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/tr.one_sentence_per_line
  14. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/da.one_sentence_per_line
  15. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/de.one_sentence_per_line
  16. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/en.one_sentence_per_line
  17. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/es.one_sentence_per_line
  18. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/et.one_sentence_per_line
  19. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/eu.one_sentence_per_line
  20. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/fr.one_sentence_per_line
  21. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/ja.one_sentence_per_line
  22. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/pl.one_sentence_per_line
  23. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/pt.one_sentence_per_line
  24. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/sr.one_sentence_per_line
  25. 1 0
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/tr.one_sentence_per_line
  26. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line
  27. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line
  28. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line
  29. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line
  30. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line
  31. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line
  32. 1 0
      datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line
  33. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line
  34. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line
  35. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line
  36. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line
  37. 0 0
      datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line
  38. 0 1
      datasets/train_dev_opensubtitles/dev/da.one_sentence_per_line
  39. 0 1
      datasets/train_dev_opensubtitles/dev/de.one_sentence_per_line
  40. 0 1
      datasets/train_dev_opensubtitles/dev/en.one_sentence_per_line
  41. 0 1
      datasets/train_dev_opensubtitles/dev/es.one_sentence_per_line
  42. 0 1
      datasets/train_dev_opensubtitles/dev/et.one_sentence_per_line
  43. 0 1
      datasets/train_dev_opensubtitles/dev/eu.one_sentence_per_line
  44. 0 1
      datasets/train_dev_opensubtitles/dev/fr.one_sentence_per_line
  45. 0 1
      datasets/train_dev_opensubtitles/dev/ja.one_sentence_per_line
  46. 0 1
      datasets/train_dev_opensubtitles/dev/pl.one_sentence_per_line
  47. 0 1
      datasets/train_dev_opensubtitles/dev/pt.one_sentence_per_line
  48. 0 1
      datasets/train_dev_opensubtitles/dev/sr.one_sentence_per_line
  49. 0 1
      datasets/train_dev_opensubtitles/dev/tr.one_sentence_per_line
  50. 0 1
      datasets/train_dev_opensubtitles/train/da.one_sentence_per_line
  51. 0 1
      datasets/train_dev_opensubtitles/train/de.one_sentence_per_line
  52. 0 1
      datasets/train_dev_opensubtitles/train/en.one_sentence_per_line
  53. 0 1
      datasets/train_dev_opensubtitles/train/es.one_sentence_per_line
  54. 0 1
      datasets/train_dev_opensubtitles/train/et.one_sentence_per_line
  55. 0 1
      datasets/train_dev_opensubtitles/train/eu.one_sentence_per_line
  56. 0 1
      datasets/train_dev_opensubtitles/train/fr.one_sentence_per_line
  57. 0 1
      datasets/train_dev_opensubtitles/train/ja.one_sentence_per_line
  58. 0 1
      datasets/train_dev_opensubtitles/train/pl.one_sentence_per_line
  59. 0 1
      datasets/train_dev_opensubtitles/train/pt.one_sentence_per_line
  60. 0 1
      datasets/train_dev_opensubtitles/train/sr.one_sentence_per_line
  61. 0 1
      datasets/train_dev_opensubtitles/train/tr.one_sentence_per_line
  62. 0 1
      datasets/train_dev_opensubtitles/words/fr.one_sentence_per_line

+ 200 - 0
code/download_opensubtitles_corpora.py

@@ -0,0 +1,200 @@
+"""This module implements a class that download the data from \
+    opus nlp website.
+
+    The data from the website can be very huge, especially for high \
+    ressourced languages. Moreover, we are only interested on a subset \
+    of the data for each language. For the class implemented in this module,\
+    instead of downloading all the data, it iterates over small chunks \
+    and only extract the extract the necessary number of sentences only on these chunks.
+"""
+import os
+from random import shuffle
+import re
+from io import BytesIO
+import gzip
+import yaml
+import requests
+from tqdm import tqdm
+import string
+from phonemizer.backend import EspeakBackend
+from phonemizer.separator import Separator
+from typing import Iterator
+import random
+random.seed(80)
+
+class DownloadOpenSubtitlesData :
+    """
+    Class that downloads sentences from OpenSubtitles.
+
+    Attributes
+    ----------
+    - version : str
+        The version of the OpenSubtitles release on the opus.nlpl.eu website.
+    - base_url : str
+        The opus.nlpl.eu URL where the data is stored for each language.
+    - total_sents : int
+        Counter of sentences.
+    """
+
+    def __init__(self, version="2018") :
+        self.base_url  = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
+        self.separator = Separator(phone='$', word='@')
+        self.total_sents = 0
+    
+    def _remove_ponctuations(self, sentence: str) -> str :
+        """
+        Method that removes punctuation from a given sentence.
+
+        Parameters
+        ----------
+        - sentence : str
+            The sentence from which punctuation needs to be removed.
+
+        Returns
+        -------
+        - str :
+            The sentence without punctuation.
+        """
+        return sentence.translate(str.maketrans('', '', string.punctuation))
+
+    def _remove_brackets(self, sentence: str) -> str:
+        """
+        Method that removes brackets from a given sentence.
+        
+        Parameters
+        ----------
+        - sentence : str
+            The sentence from which brackets need to be removed.
+        
+        Returns
+        -------
+        - str :
+            The sentence without brackets.
+        """
+        return re.sub(r"[\(\[].*?[\)\]]", "", sentence)
+        
+    def get_sentences(self, language: str, max_sents_to_download: int, chunk: int=128) -> Iterator[tuple]:
+        """
+        Method that retrieves sentences from OpenSubtitles for a given language, \
+        up to a given maximum number of sentences.
+
+        Parameters
+        ----------
+        - language : str
+            The language for which to retrieve the sentences.
+        - max_sents_to_download : int
+            The maximum number of sentences to retrieve.
+        
+        Returns
+        -------
+        - Iterator : 
+            Iterator over (sentence, progress bar) pairs.
+        """
+        # stream so that we do not load everything into memory
+        response = requests.get(f"{self.base_url}{language}.gz", stream=True)
+        # "chunk" increase as max_sents_to_process increases
+        chunk_size = chunk * max_sents_to_download
+        # iterator over chunks
+        chunks = response.iter_content(chunk_size=chunk_size)
+        with tqdm(total=max_sents_to_download) as progress_bar :
+            progress_bar.set_description(f"Language={language}")
+            while self.total_sents < max_sents_to_download :
+                chunk = next(chunks)
+                try :
+                    for sent in gzip.open(BytesIO(chunk), "rt") :
+                        if self.total_sents >= max_sents_to_download : 
+                            break
+                        else :
+                            yield sent, progress_bar
+                except Exception : # an exception here means the chunk ends before gzip can decode enough sentences
+                    print(f"The chunk size is too small to download {max_sents_to_download} sentences")
+                    break
+    
+    def __call__(self, loaded_yaml_file, train_sentences, dev_sentences, chunk, out_dirname) -> None:
+        """
+        Collect the sentences for all languages.
+
+        Parameters
+        ----------
+        - loaded_yaml_file : dict
+            This dictionary contains, for each language, all the information relevant to this study. \
+            In particular, it contains the espeak language id of each language, which is needed \
+            for phonemization.
+        - train_sentences : int
+            Number of sentences to download for train corpora. This number is the same for all languages.
+        - dev_sentences : int
+            Number of sentences to download for dev corpora. This number is the same for all languages.
+        - out_dirname : str
+            The folder where the outputs will be saved.
+        """
+        max_sents_to_download = train_sentences + dev_sentences
+        for language in loaded_yaml_file :
+            output_file_train = open(f"{out_dirname}/tokenized_in_phonemes_train/{language}.one_sentence_per_line", "w")
+            output_file_words = open(f"{out_dirname}/tokenized_in_words/{language}.one_sentence_per_line", "w")
+            output_file_dev = open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w")
+            espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
+            backend = EspeakBackend(language=espeak_language_id)
+            added_sents = set()
+            for sent, progress_bar in self.get_sentences(language, max_sents_to_download, chunk=chunk) :
+                sent = self._remove_ponctuations(sent)
+                sent = backend.phonemize([sent], separator=self.separator, strip=True)
+                # the phonemizer sometimes returns strings containing brackets; remove them.
+                sent = self._remove_brackets(*sent)
+                sent = sent.strip()
+                # tokenization by phoneme
+                sent_phonemes = " ".join(phon for word in sent.split("@") for phon in word.split("$") if phon)
+                sent_words = " ".join("".join(word.strip().split("$")) for word in sent.split("@"))
+                output_file_words.write(sent_words + "\n")
+
+                if sent_phonemes not in added_sents :
+                    added_sents.add(sent_phonemes)
+                    self.total_sents += 1
+                    progress_bar.update(1)
+            added_sents = list(added_sents)
+            shuffle(added_sents)
+            train = added_sents[:train_sentences]
+            dev = added_sents[train_sentences:max_sents_to_download]
+            for sent_train in train : 
+                output_file_train.write(sent_train + "\n")
+            for sent_dev in dev :
+                output_file_dev.write(sent_dev + "\n")
+            self.total_sents = 0
+
+if __name__ == "__main__" :
+    from argparse import ArgumentParser
+    parser = ArgumentParser()
+
+    parser.add_argument("--yaml_file",
+                        help="YAML File containing for each language, all relevant information for downloading the data.",
+                        required=True)
+    parser.add_argument("--out_dirname",
+                        help="The directory where outputs will be stored.",
+                        required=True)
+    parser.add_argument("--chunk",
+                        help="For the chunk size. This number should grow as much as you want to download many sentences.\
+                            256 is a good number when you want to get 1_000_000 or less sentences",
+                        default=1024,
+                        required=False)
+    parser.add_argument("--train_sentences",
+                        help="Number of sent for the train corpora.",
+                        default=500_000,
+                        required=False)
+    parser.add_argument("--dev_sentences",
+                        help="Number of sent for the dev copora.",
+                        default=10_000,
+                        required=False)
+    args = parser.parse_args()
+    yaml_file = args.yaml_file
+    chunk = args.chunk
+    out_dirname = args.out_dirname
+    out_dirname = out_dirname[:-1] if out_dirname.endswith("/") else out_dirname
+    if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_train"):
+        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_train")
+    if not os.path.exists(f"{out_dirname}/tokenized_in_words"):
+        os.makedirs(f"{out_dirname}/tokenized_in_words")
+    if not os.path.exists(f"{out_dirname}/tokenized_in_phonemes_dev"):
+        os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
+    languages_to_download_informations = yaml.safe_load(open(args.yaml_file))
+    downloader = DownloadOpenSubtitlesData()
+    downloader(languages_to_download_informations, args.train_sentences, args.dev_sentences, chunk, out_dirname)
+
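For reference, a minimal sketch (not part of the commit) of how the new script can be driven programmatically. The YAML layout is inferred from the only key the code reads, loaded_yaml_file[language]["espeak_language_id"]; the language codes and espeak ids below are illustrative assumptions, and the three output directories are assumed to exist, as created by the __main__ block.

    from download_opensubtitles_corpora import DownloadOpenSubtitlesData

    # Assumed structure of the YAML file once loaded with yaml.safe_load:
    # one entry per language, each carrying the espeak id used for phonemization.
    languages_to_download_informations = {
        "fr": {"espeak_language_id": "fr-fr"},
        "en": {"espeak_language_id": "en-us"},
    }

    downloader = DownloadOpenSubtitlesData(version="2018")
    # Writes one file per language under <out_dirname>/tokenized_in_phonemes_train,
    # tokenized_in_phonemes_dev and tokenized_in_words (directories must already exist).
    downloader(languages_to_download_informations,
               train_sentences=500_000,
               dev_sentences=10_000,
               chunk=1024,
               out_dirname="datasets/opensubtitles_corpora")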

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/da.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/22/q6/MD5E-s620529--3e4179641f7a348d39505a9a1333c26e/MD5E-s620529--3e4179641f7a348d39505a9a1333c26e

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/de.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/p6/Kp/MD5E-s689485--d4f53c518c1d1dd09307030cdbdb0151/MD5E-s689485--d4f53c518c1d1dd09307030cdbdb0151

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/en.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/JZ/87/MD5E-s625387--a6f26de5279b172fd418d24b29a2b059/MD5E-s625387--a6f26de5279b172fd418d24b29a2b059

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/es.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/JV/2g/MD5E-s608100--dc406486e645c5aeeac92e84fd0eec85/MD5E-s608100--dc406486e645c5aeeac92e84fd0eec85

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/et.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/Qm/WG/MD5E-s604630--bee32118984c8a3cde226ec022de0797/MD5E-s604630--bee32118984c8a3cde226ec022de0797

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/eu.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/Zp/8m/MD5E-s646644--f43bb88e1c39fc23a7f0490ce2d4260a/MD5E-s646644--f43bb88e1c39fc23a7f0490ce2d4260a

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/fr.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/JM/4Q/MD5E-s513721--bc168f268a7a7eb87c9621abf76c858a/MD5E-s513721--bc168f268a7a7eb87c9621abf76c858a

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/ja.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/8x/zf/MD5E-s1625087--e594546108226554dac59cdd0ab2ca2e/MD5E-s1625087--e594546108226554dac59cdd0ab2ca2e

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/pl.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/8m/k1/MD5E-s659395--ab04c68cbdf80baec54b754ec18bfbc9/MD5E-s659395--ab04c68cbdf80baec54b754ec18bfbc9

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/pt.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/qV/6F/MD5E-s699709--b798071d8c259c4688d455f2d6f09ffc/MD5E-s699709--b798071d8c259c4688d455f2d6f09ffc

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/sr.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/56/7m/MD5E-s602585--c472364aa49ebd8287926311425fd9aa/MD5E-s602585--c472364aa49ebd8287926311425fd9aa

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/tr.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/wG/6Z/MD5E-s667909--4b1dacd17b2eb82471d553414475cf54/MD5E-s667909--4b1dacd17b2eb82471d553414475cf54

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/da.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/16/Fj/MD5E-s31107032--e698d686ce611047a1ce737a619056de/MD5E-s31107032--e698d686ce611047a1ce737a619056de

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/de.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/K4/x9/MD5E-s34412970--bcea547d24542c2d907adb45c217bd52/MD5E-s34412970--bcea547d24542c2d907adb45c217bd52

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/en.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/29/58/MD5E-s31351782--9a33006b69492c7d114a9a7456869a98/MD5E-s31351782--9a33006b69492c7d114a9a7456869a98

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/es.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/9k/7w/MD5E-s30321219--2d8998016a695d15b67609150a988316/MD5E-s30321219--2d8998016a695d15b67609150a988316

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/et.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/xW/J8/MD5E-s30304598--8fed9d9b82dd421f28b51acd464823a6/MD5E-s30304598--8fed9d9b82dd421f28b51acd464823a6

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/eu.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/Xp/8G/MD5E-s32429278--e6e85a8b62e2214622487ff408ef4e22/MD5E-s32429278--e6e85a8b62e2214622487ff408ef4e22

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/fr.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/1q/JJ/MD5E-s25810878--d85276e4c8eff319b9211f274d008ca0/MD5E-s25810878--d85276e4c8eff319b9211f274d008ca0

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/ja.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/F5/K1/MD5E-s81417420--a774ba1fd6507464c964f081839bf39a/MD5E-s81417420--a774ba1fd6507464c964f081839bf39a

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/pl.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/g0/41/MD5E-s33219564--169b868d2c3648b0393b6e0c3a044b17/MD5E-s33219564--169b868d2c3648b0393b6e0c3a044b17

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/pt.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/2f/4m/MD5E-s35272955--ba7dcb3f711316f6af9bf357d95d9e98/MD5E-s35272955--ba7dcb3f711316f6af9bf357d95d9e98

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/sr.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/x1/4p/MD5E-s30061053--5620fe89e6d26e074bb379bbd95e36fa/MD5E-s30061053--5620fe89e6d26e074bb379bbd95e36fa

+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/tr.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/WW/Mm/MD5E-s33685177--d95e01d72ee0dd6f0790fca4dbca48de/MD5E-s33685177--d95e01d72ee0dd6f0790fca4dbca48de

datasets/train_dev_opensubtitles/words/da.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line


datasets/train_dev_opensubtitles/words/de.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line


datasets/train_dev_opensubtitles/words/en.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line


datasets/train_dev_opensubtitles/words/es.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line


datasets/train_dev_opensubtitles/words/et.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line


datasets/train_dev_opensubtitles/words/eu.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line


+ 1 - 0
datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line

@@ -0,0 +1 @@
+../../../.git/annex/objects/Wq/xq/MD5E-s38046127--184b33f856068727f9405bf33a97371f/MD5E-s38046127--184b33f856068727f9405bf33a97371f

datasets/train_dev_opensubtitles/words/ja.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line


datasets/train_dev_opensubtitles/words/pl.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line


datasets/train_dev_opensubtitles/words/pt.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line


datasets/train_dev_opensubtitles/words/sr.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line


datasets/train_dev_opensubtitles/words/tr.one_sentence_per_line → datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line


+ 0 - 1
datasets/train_dev_opensubtitles/dev/da.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/7X/wF/MD5E-s628135--09500a5c30f324db1ac655fdcd698057/MD5E-s628135--09500a5c30f324db1ac655fdcd698057

+ 0 - 1
datasets/train_dev_opensubtitles/dev/de.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/WG/KJ/MD5E-s688449--67c5d38a3483c32089345dbd1f9b8841/MD5E-s688449--67c5d38a3483c32089345dbd1f9b8841

+ 0 - 1
datasets/train_dev_opensubtitles/dev/en.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/7W/1W/MD5E-s625461--d999aa024465427f7747514778b9cd31/MD5E-s625461--d999aa024465427f7747514778b9cd31

+ 0 - 1
datasets/train_dev_opensubtitles/dev/es.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/fQ/gq/MD5E-s611453--160fbd2fd42e4676a5a0dd8284529f7b/MD5E-s611453--160fbd2fd42e4676a5a0dd8284529f7b

+ 0 - 1
datasets/train_dev_opensubtitles/dev/et.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/KP/JZ/MD5E-s606641--dbf47a6784f1a02aeb641901981ca620/MD5E-s606641--dbf47a6784f1a02aeb641901981ca620

+ 0 - 1
datasets/train_dev_opensubtitles/dev/eu.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Zk/X9/MD5E-s646723--030d017208f278d06ef8073090cfe20c/MD5E-s646723--030d017208f278d06ef8073090cfe20c

+ 0 - 1
datasets/train_dev_opensubtitles/dev/fr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/FZ/g8/MD5E-s513871--3ef2f0efe98abffab417682447b3f226/MD5E-s513871--3ef2f0efe98abffab417682447b3f226

+ 0 - 1
datasets/train_dev_opensubtitles/dev/ja.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/19/4m/MD5E-s1611113--135e9eda094b10ab4194865f9965f886/MD5E-s1611113--135e9eda094b10ab4194865f9965f886

+ 0 - 1
datasets/train_dev_opensubtitles/dev/pl.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/V3/5f/MD5E-s662618--9b7c74396bd7086fe5ff2a0efb7b7f9f/MD5E-s662618--9b7c74396bd7086fe5ff2a0efb7b7f9f

+ 0 - 1
datasets/train_dev_opensubtitles/dev/pt.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/xf/Q6/MD5E-s704164--907624e97f8f81335629f133d6fed6e8/MD5E-s704164--907624e97f8f81335629f133d6fed6e8

+ 0 - 1
datasets/train_dev_opensubtitles/dev/sr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/0z/91/MD5E-s599488--3f73eb78ff33b2313351ba75b40e76f8/MD5E-s599488--3f73eb78ff33b2313351ba75b40e76f8

+ 0 - 1
datasets/train_dev_opensubtitles/dev/tr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/8x/3p/MD5E-s677913--f5b8f8f467ffd6a9a6b518832abc35ca/MD5E-s677913--f5b8f8f467ffd6a9a6b518832abc35ca

+ 0 - 1
datasets/train_dev_opensubtitles/train/da.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/xG/Qq/MD5E-s31099426--217a8f30125f412b2b62b886b04805d3/MD5E-s31099426--217a8f30125f412b2b62b886b04805d3

+ 0 - 1
datasets/train_dev_opensubtitles/train/de.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/6m/GZ/MD5E-s34414006--99464e797d913dd95e26aaaa56953603/MD5E-s34414006--99464e797d913dd95e26aaaa56953603

+ 0 - 1
datasets/train_dev_opensubtitles/train/en.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/WV/J0/MD5E-s31351708--b737f19c2dababb98835ba4b4f0e523a/MD5E-s31351708--b737f19c2dababb98835ba4b4f0e523a

+ 0 - 1
datasets/train_dev_opensubtitles/train/es.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/g5/vK/MD5E-s30317866--f8606c40818dae33f2afcb0411d30649/MD5E-s30317866--f8606c40818dae33f2afcb0411d30649

+ 0 - 1
datasets/train_dev_opensubtitles/train/et.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/w6/J4/MD5E-s30302587--2df9e11b5757fb9e929a0e7aed5a246a/MD5E-s30302587--2df9e11b5757fb9e929a0e7aed5a246a

+ 0 - 1
datasets/train_dev_opensubtitles/train/eu.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Kq/7m/MD5E-s32429199--d34a59b568354a7a32d9290f3c53fefa/MD5E-s32429199--d34a59b568354a7a32d9290f3c53fefa

+ 0 - 1
datasets/train_dev_opensubtitles/train/fr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/0W/26/MD5E-s25810117--38651bddeab03ca251207953a6d1f054/MD5E-s25810117--38651bddeab03ca251207953a6d1f054

+ 0 - 1
datasets/train_dev_opensubtitles/train/ja.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Vg/v3/MD5E-s81431394--4d7d472cbea6cd74093005fb02d3790f/MD5E-s81431394--4d7d472cbea6cd74093005fb02d3790f

+ 0 - 1
datasets/train_dev_opensubtitles/train/pl.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Qp/J3/MD5E-s33216341--cd691f241e1401ec2d2148f7922457e8/MD5E-s33216341--cd691f241e1401ec2d2148f7922457e8

+ 0 - 1
datasets/train_dev_opensubtitles/train/pt.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/3K/Vz/MD5E-s35268500--cb9c6784b5d9c397b1acf85fe62a069f/MD5E-s35268500--cb9c6784b5d9c397b1acf85fe62a069f

+ 0 - 1
datasets/train_dev_opensubtitles/train/sr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/17/zW/MD5E-s30064150--a2cd6891c5356f12afdfc1187f89a036/MD5E-s30064150--a2cd6891c5356f12afdfc1187f89a036

+ 0 - 1
datasets/train_dev_opensubtitles/train/tr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Kg/k2/MD5E-s33675173--5de00ab7397c3cf808d7dcb6ab2f5115/MD5E-s33675173--5de00ab7397c3cf808d7dcb6ab2f5115

+ 0 - 1
datasets/train_dev_opensubtitles/words/fr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Xw/4g/MD5E-s38045436--ba10d5a625c23d52e3a006acff8e5f93/MD5E-s38045436--ba10d5a625c23d52e3a006acff8e5f93