
importing conda environment

yaya-sy committed 1 year ago
commit 202ad44f37
34 changed files with 254 additions and 166 deletions
  1. code/__pycache__/make_noiser.cpython-310.pyc (BIN)
  2. code/download_childes_corpora.py (+3 -1)
  3. code/download_opensubtitles_corpora.py (+44 -28)
  4. code/evaluate_language_models.py (+85 -0)
  5. code/make_noiser.py (+47 -36)
  6. code/test_on_all_languages.py (+61 -87)
  7. datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line (+0 -1)
  8. datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line (+0 -1)
  9. datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line (+0 -1)
  10. datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line (+0 -1)
  11. datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line (+0 -1)
  12. datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line (+0 -1)
  13. datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line (+0 -1)
  14. datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line (+0 -1)
  15. datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line (+0 -1)
  16. datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line (+0 -1)
  17. datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line (+0 -1)
  18. datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line (+0 -1)
  19. environment.yml (+1 -0)
  20. estimated/da.one_sentence_per_line.arpa (+1 -0)
  21. estimated/de.one_sentence_per_line.arpa (+1 -0)
  22. estimated/en.one_sentence_per_line.arpa (+1 -0)
  23. estimated/es.one_sentence_per_line.arpa (+1 -0)
  24. estimated/et.one_sentence_per_line.arpa (+1 -0)
  25. estimated/eu.one_sentence_per_line.arpa (+1 -0)
  26. estimated/fr.one_sentence_per_line.arpa (+1 -0)
  27. estimated/ja.one_sentence_per_line.arpa (+1 -0)
  28. estimated/pl.one_sentence_per_line.arpa (+1 -0)
  29. estimated/pt.one_sentence_per_line.arpa (+1 -0)
  30. estimated/sr.one_sentence_per_line.arpa (+1 -0)
  31. estimated/tr.one_sentence_per_line.arpa (+1 -0)
  32. results/evaluation.csv (+1 -0)
  33. results/results_for_study2_datalad.csv (+0 -1)
  34. ter (+0 -1)

BIN
code/__pycache__/make_noiser.cpython-310.pyc


+ 3 - 1
code/download_childes_corpora.py

@@ -246,7 +246,9 @@ if __name__ == "__main__" :
     parser.add_argument("--markers_json",
                         help="Json markers that serve for cleaning.",
                         required=True)
-    parser.add_argument("--phonemize_child", action=BooleanOptionalAction)
+    parser.add_argument("--phonemize_child",
+        help="Whether phonemize child utterances or not.",
+        action=BooleanOptionalAction)
     args = parser.parse_args()
     phonemize_child_or_not = args.phonemize_child
     yaml_file = args.yaml_file

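For reference, BooleanOptionalAction (in argparse since Python 3.9) is what gives the new --phonemize_child option its paired negative form. A minimal sketch, independent of the script above:

    from argparse import ArgumentParser, BooleanOptionalAction

    parser = ArgumentParser()
    parser.add_argument("--phonemize_child",
                        help="Whether phonemize child utterances or not.",
                        action=BooleanOptionalAction)

    # --phonemize_child sets the value to True, --no-phonemize_child to False,
    # and omitting both leaves it at None.
    print(parser.parse_args(["--phonemize_child"]).phonemize_child)
    print(parser.parse_args(["--no-phonemize_child"]).phonemize_child)
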
+ 44 - 28
code/download_opensubtitles_corpora.py

@@ -8,18 +8,18 @@
     and only extract the extract the necessary number of sentences only on these chunks.
 """
 import os
-from random import shuffle
 import re
+from typing import Iterator
+import string
+import random
 from io import BytesIO
+from random import shuffle
 import gzip
 import yaml
 import requests
 from tqdm import tqdm
-import string
 from phonemizer.backend import EspeakBackend
 from phonemizer.separator import Separator
-from typing import Iterator
-import random
 random.seed(80)
 
 class DownloadOpenSubtitlesData :
@@ -40,11 +40,11 @@ class DownloadOpenSubtitlesData :
         self.base_url  = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
         self.separator = Separator(phone='$', word='@')
         self.total_sents = 0
-    
+
     def _remove_ponctuations(self, sentence: str) -> str :
         """
         Method that removes ponctuations from a given sentence.
-        
+
         Parameters
         ----------
         - sent : str
@@ -60,20 +60,22 @@ class DownloadOpenSubtitlesData :
     def _remove_brackets(self, sentence: str) -> str:
         """
         Method that removes brackets from a given sentence.
-        
+
         Parameters
         ----------
         - sentence : str
             The sentence for which brackets need to be removed.
-        
+
         Returns
         -------
         - str :
             The sentence without brackets.
         """
         return re.sub(r"[\(\[].*?[\)\]]", "", sentence)
-        
-    def get_sentences(self, language: str, max_sents_to_download: int, chunk: int=128) -> Iterator[tuple]:
+
+    def get_sentences(self, language: str,
+                        max_sents_to_download: int,
+                        chunk: int=128) -> Iterator[tuple]:
         """
         Function for getting sentences from opensubtitles for a given language\
         and a number of sentences.
@@ -84,10 +86,10 @@ class DownloadOpenSubtitlesData :
             The language for which to retrieve the sentences.
         - max_sents_to_process : str
             The number of sentences to retrieve.
-        
+
         Returns
         -------
-        - Iterator : 
+        - Iterator :
             Iterator over sentences and progressbars
         """
         # stream in order to not load all on memory
@@ -102,28 +104,37 @@ class DownloadOpenSubtitlesData :
                 chunk = next(chunks)
                 try :
                     for sent in gzip.open(BytesIO(chunk), "rt") :
-                        if self.total_sents >= max_sents_to_download : 
+                        if self.total_sents >= max_sents_to_download :
                             break
                         else :
                             yield sent, progress_bar
                 except : # if exception, this means the chunk size is too small for gzip
-                    print(f"The chunk size is to small for {max_sents_to_download} sentences to download")
+                    print(f"The chunk size is to small for {max_sents_to_download}\
+                            sentences to download")
                     break
-    
-    def __call__(self, loaded_yaml_file, train_sentences, dev_sentences, chunk, out_dirname) -> None:
+
+    def __call__(self,
+                loaded_yaml_file,
+                train_sentences,
+                dev_sentences,
+                chunk,
+                out_dirname) -> None:
         """
         Collect the sentences for all languages.
 
         Paramaters
         ----------
         - loaded_yaml_file : dict
-            This dictionary contains all informations relevant for this study, for each language. \
-            This dictionary also contains informations about espeak ids for the languages, and this is relevant
-            for phonemization.
+            This dictionary contains all informations relevant\
+            for this study, for each language. This dictionary also\
+            contains informations about espeak ids for the languages,\
+            and this is relevant for phonemization.
         - train_sentences : int
-            Number of sentences to download for train corpora. This number is the same for all languages.
+            Number of sentences to download for train corpora.\
+            This number is the same for all languages.
         - dev_sentences : int
-            Number of sentences to download for dev corpora. This number is the same for all languages.
+            Number of sentences to download for dev corpora.\
+            This number is the same for all languages.
         - out_dirname : str
             The folder where the outputs will be saved.
         """
@@ -154,7 +165,7 @@ class DownloadOpenSubtitlesData :
             shuffle(added_sents)
             train = added_sents[:train_sentences]
             dev = added_sents[train_sentences:max_sents_to_download]
-            for sent_train in train : 
+            for sent_train in train :
                 output_file_train.write(sent_train + "\n")
             for sent_dev in dev :
                 output_file_dev.write(sent_dev + "\n")
@@ -165,22 +176,24 @@ if __name__ == "__main__" :
     parser = ArgumentParser()
 
     parser.add_argument("--yaml_file",
-                        help="YAML File containing for each language, all relevant information for downloading the data.",
+                        help="YAML File containing for each language,\
+                            all relevant information for downloading the data.",
                         required=True)
     parser.add_argument("--out_dirname",
                         help="The directory where outputs will be stored.",
                         required=True)
     parser.add_argument("--chunk",
-                        help="For the chunk size. This number should grow as much as you want to download many sentences.\
+                        help="For the chunk size. This number should\
+                            grow as much as you want to download many sentences.\
                             256 is a good number when you want to get 1_000_000 or less sentences",
                         default=1024,
                         required=False)
     parser.add_argument("--train_sentences",
-                        help="Number of sent for the train corpora.",
+                        help="Number of sent for the training corpora.",
                         default=200_000,
                         required=False)
     parser.add_argument("--dev_sentences",
-                        help="Number of sent for the dev copora.",
+                        help="Number of sent for the dev or test copora.",
                         default=10_000,
                         required=False)
     args = parser.parse_args()
@@ -196,5 +209,8 @@ if __name__ == "__main__" :
         os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
     languages_to_download_informations = yaml.safe_load(open(args.yaml_file))
     downloader = DownloadOpenSubtitlesData()
-    downloader(languages_to_download_informations, args.train_sentences, args.dev_sentences, chunk, out_dirname)
-
+    downloader(languages_to_download_informations,
+                args.train_sentences,
+                args.dev_sentences,
+                chunk,
+                out_dirname)

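The downloader presumably feeds the collected sentences through the phonemizer backend imported above, with phones joined by '$' and words by '@'. A minimal sketch of that separator in isolation; the language code and sentence are examples, not values taken from this commit:

    from phonemizer.backend import EspeakBackend
    from phonemizer.separator import Separator

    # Same separator as in DownloadOpenSubtitlesData.__init__
    separator = Separator(phone='$', word='@')
    backend = EspeakBackend("fr-fr")

    # phonemize() takes a list of utterances and returns a list of phonemized strings,
    # with '$' between phones and '@' between words.
    print(backend.phonemize(["bonjour tout le monde"], separator=separator))
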
+ 85 - 0
code/evaluate_language_models.py

@@ -0,0 +1,85 @@
+"""This module implements a function that\
+    evaluate the trained language moedels"""
+import os
+from math import log
+import random
+import pandas as pd
+import kenlm
+random.seed(1023)
+
+LANGUAGES_TYPOLOGIES = {
+    'da' : ("Danish", "fusional"),
+    'de' : ("German", "fusional"),
+    'en' : ("English", "fusional"),
+    'es' : ("Spanish", "fusional"),
+    'et' : ("Estonian", "agglutinative"),
+    'eu' : ("Basque", "agglutinative"),
+    'fr' : ("French", "fusional"),
+    'ja' : ("Japanese", "agglutinative"),
+    'pl' : ("Polish", "fusional"),
+    'pt' : ("Portuguese", "fusional"),
+    'sr' : ("Serbian", "fusional"),
+    'tr' : ("Turkish", "agglutinative")}
+
+def evaluate(train_files_directory: str,
+                dev_files_directory: str,
+                models_directory: str) -> pd.DataFrame:
+    """
+    This function will compute the entropies of\
+    test files for all languages.
+
+    Parameters
+    ----------
+    - train_files_directory: str
+        The path to the directory containing training files.
+    - dev_files_directory: str
+        The path to the directory containing testing/development files.
+    - models_directory: str
+        The path to the directory containing training trained\
+        language models.
+    """
+    triplets_files_model = zip(sorted(os.listdir(train_files_directory)),
+                                sorted(os.listdir(dev_files_directory)),
+                                sorted(os.listdir(models_directory)))
+    columns = ["language", "train_entropy", "dev_entropy"]
+    evaluation = pd.DataFrame(columns=columns, index=None)
+    for train_filename, dev_filename, model_filename in triplets_files_model :
+        language, _ = train_filename.split(".")
+        model = model = kenlm.Model(f"{models_directory}/{model_filename}")
+        train_sents = "\n".join(sent.strip() for sent in open(f"{train_files_directory}/{train_filename}"))
+        train_entropy = log(model.perplexity(train_sents))
+        dev_sents = "\n".join(sent.strip() for sent in open(f"{dev_files_directory}/{dev_filename}"))
+        dev_entropy = log(model.perplexity(dev_sents))
+        new_row = {
+            "language" : LANGUAGES_TYPOLOGIES[language][0],
+            "train_entropy" : train_entropy,
+            "dev_entropy" : dev_entropy
+        }
+        evaluation = evaluation.append(new_row, ignore_index=True)
+    return evaluation
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--train_files_directory',
+        required=True,
+        help="The directory containing the OpenSubtitles training files"
+        )
+    parser.add_argument('--dev_files_directory',
+        required=True,
+        help="The directory containing the OpenSubtitles test files"
+        )
+    parser.add_argument('--models_directory',
+        required=True,
+        help="The directory containing the trained language models"
+        )
+
+    args = parser.parse_args()
+    train_files = args.train_files_directory
+    dev_files = args.dev_files_directory
+    models_directory = args.models_directory
+    if not os.path.exists("results"):
+        os.makedirs("results")
+    evaluate(train_files,
+                dev_files,
+                models_directory).to_csv("results/evaluation.csv")

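The new evaluation script reduces each score to the natural log of a KenLM perplexity computed over a whole corpus file. A self-contained sketch of that core step; the .arpa model is one of the files added in this commit, while the dev-corpus path is a placeholder:

    from math import log
    import kenlm

    model = kenlm.Model("estimated/fr.one_sentence_per_line.arpa")
    # Placeholder dev corpus: one phonemized sentence per line.
    dev_sents = "\n".join(line.strip() for line in open("dev/fr.one_sentence_per_line"))

    # Entropy as computed in evaluate(): log of the corpus-level perplexity.
    dev_entropy = log(model.perplexity(dev_sents))
    print(f"fr dev entropy: {dev_entropy:.3f}")

Note that evaluate() builds its table with DataFrame.append, which was removed in pandas 2.0, so the imported conda environment presumably pins an older pandas.
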
+ 47 - 36
code/make_noiser.py

@@ -9,32 +9,38 @@ random.seed(80)
 
 class Noise :
     """
-    This class simulate noise in the data. Crucially, noise can be made on three points :\
-    (1) The noise of phonemes order of a given sequence by making the order of the sequence more aribitrary,\
-    (2) Replacement of some phonemes of a given sequence by arbitrary sampled phonemes from a vocabulary and\
+    This class simulate noise in the data. Crucially,\
+    noise can be made on three points :\
+    (1) The noise of phonemes order of a given sequence\
+    by making the order of the sequence more aribitrary,\
+    (2) Replacement of some phonemes of a given sequence\
+    by arbitrary sampled phonemes from a vocabulary and\
     (3) By arbitrary interverting some sequences of two different speakers.
 
     Atributes
     ---------
     - phonemes_order_noise :
-        Parameter for controling the degree of noise at the level of phonemes order. See the point 1 mentioned above.
+        Parameter for controling the degree of noise at the level\
+        of phonemes order. See the point 1 mentioned above.
     - speakers_noise :
-        Parameters for controling the degree of noise at the level of speakers. See the point 3 mentioned above.
+        Parameters for controling the degree of noise at the level\
+        of speakers. See the point 3 mentioned above.
     - phonemes_noise :
-        Parameter for controling the degree of noise at the level of phonemes. See the point 2 mentioned above.
+        Parameter for controling the degree of noise at the level of phonemes.
+        See the point 2 mentioned above.
     """
 
     def __init__(self,
                     most_probable_phonemes: list,
-                    phonemes_order_noise=0.3,
-                    speakers_noise=(0.5, 0.5),
-                    phonemes_noise=0.5) :
+                    phonemes_order_noise_value=0.3,
+                    speakers_noise_values=(0.5, 0.5),
+                    phonemes_noise_value=0.5) :
         self.most_probable_phonemes = most_probable_phonemes
-        self.phonemes_order_noise = phonemes_order_noise
-        self.speakers_noise = speakers_noise
-        self.phonemes_noise = phonemes_noise
-    
-    def _order_noise(self, sequence: List[str]) -> str :
+        self.phonemes_order_noise_value = phonemes_order_noise_value
+        self.speakers_noise_values = speakers_noise_values
+        self.phonemes_noise_value = phonemes_noise_value
+
+    def order_noise(self, sequence: List[str]) -> str :
         """
         Making noise the order of the phonemes in a given sequence
 
@@ -42,24 +48,24 @@ class Noise :
         ----------
         - sequence : list
             The sequence for which the phonemes order must be noised.
-        
+
         Returns
         -------
         - str :
             The sequence with the order of phonemes noised.
         """
         # number of phonemes to noise in the sequence = len(sequence) / nb_phonemes_to_noise
-        phonemes_to_noise = round(len(sequence) * self.phonemes_order_noise)
+        phonemes_to_noise = round(len(sequence) * self.phonemes_order_noise_value)
         # sample nb_phonemes_to_noise positions in the sequence
         positions_sampled = list(sample(range(len(sequence)), k=phonemes_to_noise))
         copied_positions = copy.deepcopy(positions_sampled)
         shuffle(copied_positions)
-        # change the positions of the sampled phonemes 
+        # change the positions of the sampled phonemes
         for original_position, new_position in zip(positions_sampled, copied_positions):
             sequence[original_position] = sequence[new_position]
         return " ".join(sequence)
 
-    def _phonemes_noise(self, sequence: List[str]) -> str :
+    def phonemes_noise(self, sequence: List[str]) -> str :
         """
         Makinng noise the phonemes of the sequence by replacing\
         some phonemes of the sequence by arbitrary sampled phonemes\
@@ -69,13 +75,14 @@ class Noise :
         ----------
         - sequence : list
             The sequence for which the phonemes must be noised.
-        
+
         Returns
         -------
         - str :
             The sequence with noised phonemes.
         """
-        phonemes_to_noise = round(len(sequence) * self.phonemes_noise)
+        phonemes_to_noise = round(len(sequence) * self.phonemes_noise_value)
+        assert phonemes_to_noise < len(sequence), "Number of phoneme to noise greather that sequence's length"
         indexes = choices(range(len(sequence)), k=phonemes_to_noise)
         # choose new phonemes only from the most probable phonemes.
         phonemes = choices(self.most_probable_phonemes, k=phonemes_to_noise)
@@ -83,8 +90,8 @@ class Noise :
         for idx, phonemes in zip(indexes, phonemes) :
             sequence[idx] = phonemes
         return " ".join(sequence)
-    
-    def _speakers_noise(self, speakers_sequences: Dict[str, set]) -> Dict[str, set] :
+
+    def speakers_noise(self, speakers_sequences: Dict[str, set]) -> Dict[str, set] :
         """
         Making noise in the speaker's statements.
 
@@ -92,14 +99,15 @@ class Noise :
         ----------
         - speakers_sequences : dict
             Dictionary containing the utterances for each speaker.
-        
+
         Returns
         -------
         - dict :
-            The dictionary containing the few statements interchanged between the two speakers.
+            The dictionary containing the few statements\
+            interchanged between the two speakers.
         """
         first_speaker, second_speaker = "Target_Child", "Adult"
-        noise_first_speaker, noise_second_speaker = self.speakers_noise
+        noise_first_speaker, noise_second_speaker = self.speakers_noise_values
         speakers_sequences[second_speaker] = set(speakers_sequences[second_speaker])
         speakers_sequences[first_speaker] = set(speakers_sequences[first_speaker])
         # sample some percentage of utterances from each speaker
@@ -115,9 +123,9 @@ class Noise :
         # set to list
         speakers_sequences[first_speaker] = list(speakers_sequences[first_speaker])
         speakers_sequences[second_speaker] = list(speakers_sequences[second_speaker])
-        
+
         return speakers_sequences
-    
+
     def __call__(self, loaded_dataset: dict) -> dict:
         """
         Apply the three types of noise.
@@ -125,8 +133,9 @@ class Noise :
         Parameters
         ----------
         loaded_dataset : dict
-            The dictionary containing the utterances for each family, at each and for each speaker.
-        
+            The dictionary containing the utterances for each family,\
+            at each and for each speaker.
+
         Returns
         -------
         dict :
@@ -137,15 +146,17 @@ class Noise :
                 if "Adult" in loaded_dataset[family][age] :
                     for idx, utterance in enumerate(loaded_dataset[family][age]["Adult"]) :
                         splitted_utterance = utterance.split(" ")
-                        loaded_dataset[family][age]["Adult"][idx] = self._order_noise(splitted_utterance)
-                        loaded_dataset[family][age]["Adult"][idx] = self._phonemes_noise(splitted_utterance)
+                        loaded_dataset[family][age]["Adult"][idx] = self.order_noise(splitted_utterance)
+                        loaded_dataset[family][age]["Adult"][idx] = self.phonemes_noise(splitted_utterance)
                 if "Target_Child" in loaded_dataset[family][age] :
                     for idx, utterance in enumerate(loaded_dataset[family][age]["Target_Child"]) :
                         splitted_utterance = utterance.split(" ")
-                        loaded_dataset[family][age]["Target_Child"][idx] = self._order_noise(splitted_utterance)
-                        loaded_dataset[family][age]["Target_Child"][idx] = self._phonemes_noise(splitted_utterance)
-                if "Target_Child" in loaded_dataset[family][age] and "Adult" in loaded_dataset[family][age] and all(self.speakers_noise):
-                    noised_speaker = self._speakers_noise(loaded_dataset[family][age])
+                        loaded_dataset[family][age]["Target_Child"][idx] = self.order_noise(splitted_utterance)
+                        loaded_dataset[family][age]["Target_Child"][idx] = self.phonemes_noise(splitted_utterance)
+                if("Target_Child" in loaded_dataset[family][age]
+                    and "Adult" in loaded_dataset[family][age]
+                    and all(self.speakers_noise_values)):
+                    noised_speaker = self.speakers_noise(loaded_dataset[family][age])
                     loaded_dataset[family][age]["Target_Child"] = noised_speaker["Target_Child"]
                     loaded_dataset[family][age]["Adult"] = noised_speaker["Adult"]
-        return loaded_dataset
+        return loaded_dataset

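A small usage sketch of the renamed public methods, with a toy phoneme inventory standing in for the output of get_most_probable_phonemes:

    from make_noiser import Noise

    # Toy inventory; the real list comes from get_most_probable_phonemes().
    most_probable_phonemes = ["a", "e", "i", "o", "u", "p", "t", "k"]

    noise = Noise(most_probable_phonemes,
                  phonemes_order_noise_value=0.3,
                  speakers_noise_values=(0.5, 0.5),
                  phonemes_noise_value=0.5)

    sequence = "p a t a k a".split(" ")
    print(noise.order_noise(list(sequence)))     # about 30% of positions reshuffled
    print(noise.phonemes_noise(list(sequence)))  # about 50% of phonemes resampled from the inventory
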
+ 61 - 87
code/test_on_all_languages.py

@@ -1,20 +1,14 @@
 import os
-import sys
+import random
 import json
-sys.path.append("./")
-sys.path.append("../")
-sys.path.append(".../")
+from math import log
+from typing import Iterable
 from itertools import product
 from tqdm import tqdm
 import kenlm
-from math import log
-import numpy as np
 from make_noiser import Noise
 import pandas as pd
-import sys
 from get_most_probable_phonemes import get_most_probable_phonemes
-import random
-from collections import Counter
 random.seed(1023)
 
 
@@ -26,29 +20,29 @@ LANGUAGES_TYPOLOGIES = {
     'et' : ("Estonian", "agglutinative"),
     'eu' : ("Basque", "agglutinative"),
     'fr' : ("French", "fusional"),
-    'ja' : ("Japanese", "agglutinative"), 
-    'pl' : ("Polish", "fusional"), 
-    'pt' : ("Portuguese", "fusional"), 
-    'sr' : ("Serbian", "fusional"), 
+    'ja' : ("Japanese", "agglutinative"),
+    'pl' : ("Polish", "fusional"),
+    'pt' : ("Portuguese", "fusional"),
+    'sr' : ("Serbian", "fusional"),
     'tr' : ("Turkish", "agglutinative")}
 
-def compute_word_frequencies(word_train_corpus, pct=0.95) :
-    frequencies = Counter()
-    for line in word_train_corpus :
-        line = line.strip()
-        if not line : continue
-        # line = line.strip()
-        frequencies.update(Counter(line.split(" ")))
-    return dict(frequencies)
-
-
-def statistics_word(utterances, word_frequencies, model) :
+def statistics_word(utterances: list, model: kenlm.Model) -> dict:
+    """
+    This function will test a given language model\
+    on a given list of utterances.\
+    The function will also compute some statistics; MLU, TTR, etc
+
+    Parameters
+    ----------
+    - model
+        The estimated language model
+    - utterances: list
+        The utterances to test
+    """
     phoneme_utterances = []
     unique_words = set()
-    nb_unk = 0
     mlu_w = 0.0
     mlu_p = 0.0
-    mean_word_frequencies = 0
     nb_utterances = 0
     nb_words = 0
 
@@ -68,13 +62,6 @@ def statistics_word(utterances, word_frequencies, model) :
         nb_words += len(utterance_words)
         unique_words |= set(utterance_words)
 
-        for word in utterance_words :
-            word = word.strip()
-            if word in word_frequencies :
-                mean_word_frequencies += word_frequencies[word]
-            else : 
-                nb_unk += 1
-    
     mlu_w /= nb_utterances
     mlu_p /= nb_utterances
     ttr_w = len(unique_words) / nb_words
@@ -87,60 +74,62 @@ def statistics_word(utterances, word_frequencies, model) :
     statistics["mlu_w"] = mlu_w
     statistics["mlu_p"] = mlu_p
     statistics["ttr_w"] = ttr_w
-    statistics["mean_word_frequencies"] = mean_word_frequencies
-    statistics["nb_unk"] = nb_unk
 
     return statistics
 
-def create_sparse_combinantions(values) :
+def create_sparse_combinantions(values: Iterable) -> set:
+    """
+    This function will create combinantions for noising.
+    Each item in the returned set contains four values corresponding\
+    to (1) phoneme noise, (2) noise of from adult to child utterances,\
+    (3) noise of from child to adult utterances and (4) noise of
+    These combinantions are sparse because we only noise one value at time.
+    For example, an item can be (0.0, 0.0, 0.0, 0.25), which means that we only
+    noise 25 percent of the phonemes, and nothing else is affected.
+    See the file make_noiser.py for more infomrations.
+    """
     sparse_combinantions = []
     for value in values :
-        for idx in range(len(values)) : 
+        for idx in range(len(values)) :
             sparse_values = [0.0] * len(values)
             sparse_values[idx] = value
             sparse_combinantions.append(tuple(sparse_values))
     return set(sparse_combinantions)
 
-def test(json_files_directory, models_directory, phoneme_train_files, word_train_files, add_noise=False) :
+def test(json_files_directory, models_directory, train_files, add_noise=True) :
     """
+    This function will test the language models on CHILDES corpora
     """
     columns = ["language", "typology", "family", "speaker",\
                 "age", "perplexity", "entropy", "mlu", "mlu_without_repetition",\
                 "phonemes_order_noise", "speakers_noise_adult",\
                 "speakers_noise_child", "phonemes_noise"]
     results = pd.DataFrame(columns=columns, index=None)
-    all_combinations = list(product((0.0, 0.25, 0.5, 0.75), repeat=4)) if add_noise else [((0.0, 0.0, 0.0, 0.0))]
-    # sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
+    # all_combinations = (list(product((0.0, 0.25, 0.5, 0.75), repeat=4))
+    #                       if add_noise else [((0.0, 0.0, 0.0, 0.0))])
+    sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
     # noise_values = np.linspace(0.0, 1.0, num=6)
-    for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(all_combinations, total=len(all_combinations)) :
+    for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
         for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
             lg_iso, _ = test_filename.split(".")
             model_lg = model_filename.split(".")[0]
-            if lg_iso != model_lg : continue
-            print(lg_iso, model_lg)
-            most_probable_phonemes = get_most_probable_phonemes(f"{phoneme_train_files}/{lg_iso}.one_sentence_per_line")
-            word_frequencies = compute_word_frequencies(f"{word_train_files}/{lg_iso}.one_sentence_per_line")
+            if lg_iso != model_lg :
+                continue
+            most_probable_phonemes = get_most_probable_phonemes(f"{train_files}/{lg_iso}.one_sentence_per_line")
             loaded_json = json.load(open(f"{json_files_directory}/{test_filename}"))
             if add_noise :
                 noise = Noise(most_probable_phonemes,
-                                phonemes_order_noise=phonemes_order_noise,
-                                speakers_noise=(speakers_noise_child, speakers_noise_adult),
-                                phonemes_noise=phonemes_noise)
+                                phonemes_order_noise_value=phonemes_order_noise,
+                                speakers_noise_values=(speakers_noise_child, speakers_noise_adult),
+                                phonemes_noise_value=phonemes_noise)
                 loaded_json = noise(loaded_json)
             model = kenlm.Model(f"{models_directory}/{model_filename}")
             for family in loaded_json :
-                for age in loaded_json[family] : 
+                for age in loaded_json[family] :
                     if age == "None" : print(family, lg_iso, age); continue
                     for speaker in loaded_json[family][age] :
                         if speaker not in ["Adult", "Target_Child"] : continue
-                        # test_utterances = "\n".join(loaded_json[family][age][speaker])
-                        # utterances = [utterance.split(" ") for utterance in loaded_json[family][age][speaker]]
-                        # mlu = np.mean([len(utterance) for utterance in utterances])
-                        # mlu_without_repetition = np.mean([len(set(utterance)) for utterance in utterances])
-                        # ppl = model.perplexity(test_utterances)
-                        # entropy = log(ppl)
-
-                        results_statistics = statistics_word(loaded_json[family][age][speaker], word_frequencies, model)
+                        results_statistics = statistics_word(loaded_json[family][age][speaker], model)
                         language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                         new_row =  {"language" : language,
                                     "typology" : typology,
@@ -152,8 +141,6 @@ def test(json_files_directory, models_directory, phoneme_train_files, word_train
                                     "mlu_w" : results_statistics["mlu_w"],
                                     "mlu_p" : results_statistics["mlu_p"],
                                     "ttr_w" : results_statistics["ttr_w"],
-                                    "mean_word_frequencies" : results_statistics["mean_word_frequencies"],
-                                    "nb_unk" : results_statistics["nb_unk"],
                                     "phonemes_order_noise" : phonemes_order_noise,
                                     "speakers_noise_adult" : speakers_noise_adult,
                                     "speakers_noise_child" : speakers_noise_child,
@@ -164,46 +151,33 @@ if __name__ == "__main__":
     from argparse import ArgumentParser, BooleanOptionalAction
 
     parser = ArgumentParser()
-    parser.add_argument('--phoneme_train_directory',
-        required=True,
-        help="Dataset containing the train files in phonemes (dot one_sentence_per_line) "
-        )
-    parser.add_argument('--word_train_directory',
+    parser.add_argument('--train_directory',
        required=True,
-        help="Dataset containing the train files in words (dot one_sentence_per_line) "
+        help="The directory containing the train files tokenized in phonemes."
        )
-    parser.add_argument('--models_directory', 
+    parser.add_argument('--models_directory',
        required=True,
-        help="Folder containing the estimated parameters"
+        help="The directory containing the trained language models."
        )
-    
+
    parser.add_argument('--json_files_directory',
        required=True,
-        help="Directory containing json files for test"
+        help="The directory containing CHILDES utterances in json format for each language"
        )
-    
-    parser.add_argument('--out_dirname',
-        required=True,
-        help="Out directory"
-        )
-    parser.add_argument('--out_filename',
-            required=True,
-            help="Out filename"
-            )
-    parser.add_argument("--add_noise", action=BooleanOptionalAction)
+
+    parser.add_argument("--add_noise",
+        help="Whether noise the CHILDES utterances or not",
+        action=BooleanOptionalAction)
 
     args = parser.parse_args()
     add_noise = args.add_noise
     json_files_directory = args.json_files_directory
-    phoneme_train_files, word_train_files = args.phoneme_train_directory, args.word_train_directory
+    phoneme_train_files = args.train_directory
     models_directory = args.models_directory
-    out_dirname = args.out_dirname
-    out_filename = args.out_filename
 
     if not os.path.exists("results"):
         os.makedirs("results")
-    test(json_files_directory, 
-        models_directory, 
-        phoneme_train_files,
-        word_train_files, 
-        add_noise).to_csv(f"{out_dirname}/{out_filename}.csv")
+    test(json_files_directory,
+            models_directory,
+            phoneme_train_files,
+            add_noise=add_noise).to_csv("results/results.csv")

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Xf/J3/MD5E-s15872283--589ddd6867dae20a40048aaed160f751/MD5E-s15872283--589ddd6867dae20a40048aaed160f751

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/PK/VG/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/k5/41/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Vj/Zq/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/kK/m3/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Z5/WJ/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/xQ/Mz/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Fx/P8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/8k/jP/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/pq/Gx/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/VK/59/MD5E-s15029711--76eb34c274a6e5ab100575ce2fa60ab0/MD5E-s15029711--76eb34c274a6e5ab100575ce2fa60ab0

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/FM/Vk/MD5E-s22131015--b159f0714684e9fc60945904418a1240/MD5E-s22131015--b159f0714684e9fc60945904418a1240

+ 1 - 0
environment.yml

@@ -0,0 +1 @@
+.git/annex/objects/V8/56/MD5E-s5478--6ce1a1dfc33f3c2aee2a0c4f0f11aa02.yml/MD5E-s5478--6ce1a1dfc33f3c2aee2a0c4f0f11aa02.yml

+ 1 - 0
estimated/da.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/5x/74/MD5E-s28028840--71fbf9fb169884d736da26c047e16f4e.arpa/MD5E-s28028840--71fbf9fb169884d736da26c047e16f4e.arpa

+ 1 - 0
estimated/de.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Z2/0W/MD5E-s22540364--11e64685c900b25e47a7c2a137dd7a9b.arpa/MD5E-s22540364--11e64685c900b25e47a7c2a137dd7a9b.arpa

+ 1 - 0
estimated/en.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/KG/5q/MD5E-s31436879--847b2a7d2e5210d87f638963a8764808.arpa/MD5E-s31436879--847b2a7d2e5210d87f638963a8764808.arpa

+ 1 - 0
estimated/es.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Zq/pj/MD5E-s10061705--b466f7fc80c31c74891f85256d324c43.arpa/MD5E-s10061705--b466f7fc80c31c74891f85256d324c43.arpa

+ 1 - 0
estimated/et.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/w4/9Q/MD5E-s18873182--89176dfdd746dd62fe277cf760489709.arpa/MD5E-s18873182--89176dfdd746dd62fe277cf760489709.arpa

+ 1 - 0
estimated/eu.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/vZ/2G/MD5E-s12176188--ae20d403fb51fef0b7572521b95d47a9.arpa/MD5E-s12176188--ae20d403fb51fef0b7572521b95d47a9.arpa

+ 1 - 0
estimated/fr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/QG/ff/MD5E-s20901089--1873e4fa871af748a4028e962a941b74.arpa/MD5E-s20901089--1873e4fa871af748a4028e962a941b74.arpa

+ 1 - 0
estimated/ja.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/6W/kM/MD5E-s8026445--d320df753b865052827e96c0be67e418.arpa/MD5E-s8026445--d320df753b865052827e96c0be67e418.arpa

+ 1 - 0
estimated/pl.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/5j/46/MD5E-s23833364--0c4492ab80d3c7f37ff923288dc88d80.arpa/MD5E-s23833364--0c4492ab80d3c7f37ff923288dc88d80.arpa

+ 1 - 0
estimated/pt.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/ZF/pz/MD5E-s22346672--1a9f56836b07f9a0d981e329ce47e1c9.arpa/MD5E-s22346672--1a9f56836b07f9a0d981e329ce47e1c9.arpa

+ 1 - 0
estimated/sr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/6M/xg/MD5E-s20755431--b4f26a89a36c9c4a61bb39a00c83c116.arpa/MD5E-s20755431--b4f26a89a36c9c4a61bb39a00c83c116.arpa

+ 1 - 0
estimated/tr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Gf/70/MD5E-s18935056--4fe9ce073a5c9cb9e601fa1424524c3a.arpa/MD5E-s18935056--4fe9ce073a5c9cb9e601fa1424524c3a.arpa

+ 1 - 0
results/evaluation.csv

@@ -0,0 +1 @@
+../.git/annex/objects/5g/Gj/MD5E-s607--3a8fb15dbd039d29e12a0e126c73112d.csv/MD5E-s607--3a8fb15dbd039d29e12a0e126c73112d.csv

+ 0 - 1
results/results_for_study2_datalad.csv

@@ -1 +0,0 @@
-../.git/annex/objects/gg/pJ/MD5E-s1271727--76f1f59ac68fe619ae34bd03ede011a8.csv/MD5E-s1271727--76f1f59ac68fe619ae34bd03ede011a8.csv

+ 0 - 1
ter

@@ -1 +0,0 @@
-.git/annex/objects/K5/g1/MD5E-s11843--f7a613986d426f7c5667fc59487dd559/MD5E-s11843--f7a613986d426f7c5667fc59487dd559