Browse Source

importing conda environment

yaya-sy 1 year ago
parent
commit
202ad44f37
34 changed files with 254 additions and 166 deletions
  1. BIN
      code/__pycache__/make_noiser.cpython-310.pyc
  2. 3 1
      code/download_childes_corpora.py
  3. 44 28
      code/download_opensubtitles_corpora.py
  4. 85 0
      code/evaluate_language_models.py
  5. 47 36
      code/make_noiser.py
  6. 61 87
      code/test_on_all_languages.py
  7. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line
  8. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line
  9. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line
  10. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line
  11. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line
  12. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line
  13. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line
  14. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line
  15. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line
  16. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line
  17. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line
  18. 0 1
      datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line
  19. 1 0
      environment.yml
  20. 1 0
      estimated/da.one_sentence_per_line.arpa
  21. 1 0
      estimated/de.one_sentence_per_line.arpa
  22. 1 0
      estimated/en.one_sentence_per_line.arpa
  23. 1 0
      estimated/es.one_sentence_per_line.arpa
  24. 1 0
      estimated/et.one_sentence_per_line.arpa
  25. 1 0
      estimated/eu.one_sentence_per_line.arpa
  26. 1 0
      estimated/fr.one_sentence_per_line.arpa
  27. 1 0
      estimated/ja.one_sentence_per_line.arpa
  28. 1 0
      estimated/pl.one_sentence_per_line.arpa
  29. 1 0
      estimated/pt.one_sentence_per_line.arpa
  30. 1 0
      estimated/sr.one_sentence_per_line.arpa
  31. 1 0
      estimated/tr.one_sentence_per_line.arpa
  32. 1 0
      results/evaluation.csv
  33. 0 1
      results/results_for_study2_datalad.csv
  34. 0 1
      ter

BIN
code/__pycache__/make_noiser.cpython-310.pyc


+ 3 - 1
code/download_childes_corpora.py

@@ -246,7 +246,9 @@ if __name__ == "__main__" :
     parser.add_argument("--markers_json",
                         help="Json markers that serve for cleaning.",
                         required=True)
-    parser.add_argument("--phonemize_child", action=BooleanOptionalAction)
+    parser.add_argument("--phonemize_child",
+        help="Whether phonemize child utterances or not.",
+        action=BooleanOptionalAction)
     args = parser.parse_args()
     phonemize_child_or_not = args.phonemize_child
     yaml_file = args.yaml_file

+ 44 - 28
code/download_opensubtitles_corpora.py

@@ -8,18 +8,18 @@
     and only extract the extract the necessary number of sentences only on these chunks.
 """
 import os
-from random import shuffle
 import re
+from typing import Iterator
+import string
+import random
 from io import BytesIO
+from random import shuffle
 import gzip
 import yaml
 import requests
 from tqdm import tqdm
-import string
 from phonemizer.backend import EspeakBackend
 from phonemizer.separator import Separator
-from typing import Iterator
-import random
 random.seed(80)
 
 class DownloadOpenSubtitlesData :
@@ -40,11 +40,11 @@ class DownloadOpenSubtitlesData :
         self.base_url  = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
         self.separator = Separator(phone='$', word='@')
         self.total_sents = 0
-    
+
     def _remove_ponctuations(self, sentence: str) -> str :
         """
         Method that removes ponctuations from a given sentence.
-        
+
         Parameters
         ----------
         - sent : str
@@ -60,20 +60,22 @@ class DownloadOpenSubtitlesData :
     def _remove_brackets(self, sentence: str) -> str:
         """
         Method that removes brackets from a given sentence.
-        
+
         Parameters
         ----------
         - sentence : str
             The sentence for which brackets need to be removed.
-        
+
         Returns
         -------
         - str :
             The sentence without brackets.
         """
         return re.sub(r"[\(\[].*?[\)\]]", "", sentence)
-        
-    def get_sentences(self, language: str, max_sents_to_download: int, chunk: int=128) -> Iterator[tuple]:
+
+    def get_sentences(self, language: str,
+                        max_sents_to_download: int,
+                        chunk: int=128) -> Iterator[tuple]:
         """
         Function for getting sentences from opensubtitles for a given language\
         and a number of sentences.
@@ -84,10 +86,10 @@ class DownloadOpenSubtitlesData :
             The language for which to retrieve the sentences.
         - max_sents_to_process : str
             The number of sentences to retrieve.
-        
+
         Returns
         -------
-        - Iterator : 
+        - Iterator :
             Iterator over sentences and progressbars
         """
         # stream in order to not load all on memory
@@ -102,28 +104,37 @@ class DownloadOpenSubtitlesData :
                 chunk = next(chunks)
                 try :
                     for sent in gzip.open(BytesIO(chunk), "rt") :
-                        if self.total_sents >= max_sents_to_download : 
+                        if self.total_sents >= max_sents_to_download :
                             break
                         else :
                             yield sent, progress_bar
                 except : # if exception, this means the chunk size is too small for gzip
-                    print(f"The chunk size is to small for {max_sents_to_download} sentences to download")
+                    print(f"The chunk size is to small for {max_sents_to_download}\
+                            sentences to download")
                     break
-    
-    def __call__(self, loaded_yaml_file, train_sentences, dev_sentences, chunk, out_dirname) -> None:
+
+    def __call__(self,
+                loaded_yaml_file,
+                train_sentences,
+                dev_sentences,
+                chunk,
+                out_dirname) -> None:
         """
         Collect the sentences for all languages.
 
         Paramaters
         ----------
         - loaded_yaml_file : dict
-            This dictionary contains all informations relevant for this study, for each language. \
-            This dictionary also contains informations about espeak ids for the languages, and this is relevant
-            for phonemization.
+            This dictionary contains all information relevant\
+            for this study, for each language. This dictionary also\
+            contains information about espeak ids for the languages,\
+            and this is relevant for phonemization.
         - train_sentences : int
-            Number of sentences to download for train corpora. This number is the same for all languages.
+            Number of sentences to download for train corpora.\
+            This number is the same for all languages.
         - dev_sentences : int
-            Number of sentences to download for dev corpora. This number is the same for all languages.
+            Number of sentences to download for dev corpora.\
+            This number is the same for all languages.
         - out_dirname : str
             The folder where the outputs will be saved.
         """
@@ -154,7 +165,7 @@ class DownloadOpenSubtitlesData :
             shuffle(added_sents)
             train = added_sents[:train_sentences]
             dev = added_sents[train_sentences:max_sents_to_download]
-            for sent_train in train : 
+            for sent_train in train :
                 output_file_train.write(sent_train + "\n")
             for sent_dev in dev :
                 output_file_dev.write(sent_dev + "\n")
@@ -165,22 +176,24 @@ if __name__ == "__main__" :
     parser = ArgumentParser()
 
     parser.add_argument("--yaml_file",
-                        help="YAML File containing for each language, all relevant information for downloading the data.",
+                        help="YAML File containing for each language,\
+                            all relevant information for downloading the data.",
                         required=True)
     parser.add_argument("--out_dirname",
                         help="The directory where outputs will be stored.",
                         required=True)
     parser.add_argument("--chunk",
-                        help="For the chunk size. This number should grow as much as you want to download many sentences.\
+                        help="For the chunk size. This number should\
+                            grow as much as you want to download many sentences.\
                             256 is a good number when you want to get 1_000_000 or less sentences",
                         default=1024,
                         required=False)
     parser.add_argument("--train_sentences",
-                        help="Number of sent for the train corpora.",
+                        help="Number of sent for the training corpora.",
                         default=200_000,
                         required=False)
     parser.add_argument("--dev_sentences",
-                        help="Number of sent for the dev copora.",
+                        help="Number of sent for the dev or test copora.",
                         default=10_000,
                         required=False)
     args = parser.parse_args()
@@ -196,5 +209,8 @@ if __name__ == "__main__" :
         os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
     languages_to_download_informations = yaml.safe_load(open(args.yaml_file))
     downloader = DownloadOpenSubtitlesData()
-    downloader(languages_to_download_informations, args.train_sentences, args.dev_sentences, chunk, out_dirname)
-
+    downloader(languages_to_download_informations,
+                args.train_sentences,
+                args.dev_sentences,
+                chunk,
+                out_dirname)

+ 85 - 0
code/evaluate_language_models.py

@@ -0,0 +1,85 @@
+"""This module implements a function that\
+    evaluates the trained language models"""
+import os
+from math import log
+import random
+import pandas as pd
+import kenlm
+random.seed(1023)
+
+LANGUAGES_TYPOLOGIES = {
+    'da' : ("Danish", "fusional"),
+    'de' : ("German", "fusional"),
+    'en' : ("English", "fusional"),
+    'es' : ("Spanish", "fusional"),
+    'et' : ("Estonian", "agglutinative"),
+    'eu' : ("Basque", "agglutinative"),
+    'fr' : ("French", "fusional"),
+    'ja' : ("Japanese", "agglutinative"),
+    'pl' : ("Polish", "fusional"),
+    'pt' : ("Portuguese", "fusional"),
+    'sr' : ("Serbian", "fusional"),
+    'tr' : ("Turkish", "agglutinative")}
+
+def evaluate(train_files_directory: str,
+                dev_files_directory: str,
+                models_directory: str) -> pd.DataFrame:
+    """
+    This function will compute the entropies of the\
+    training and development files for all languages.
+
+    Parameters
+    ----------
+    - train_files_directory: str
+        The path to the directory containing training files.
+    - dev_files_directory: str
+        The path to the directory containing testing/development files.
+    - models_directory: str
+        The path to the directory containing the trained\
+        language models.
+    """
+    triplets_files_model = zip(sorted(os.listdir(train_files_directory)),
+                                sorted(os.listdir(dev_files_directory)),
+                                sorted(os.listdir(models_directory)))
+    columns = ["language", "train_entropy", "dev_entropy"]
+    evaluation = pd.DataFrame(columns=columns, index=None)
+    for train_filename, dev_filename, model_filename in triplets_files_model :
+        language, _ = train_filename.split(".")
+        model = model = kenlm.Model(f"{models_directory}/{model_filename}")
+        train_sents = "\n".join(sent.strip() for sent in open(f"{train_files_directory}/{train_filename}"))
+        train_entropy = log(model.perplexity(train_sents))
+        dev_sents = "\n".join(sent.strip() for sent in open(f"{dev_files_directory}/{dev_filename}"))
+        dev_entropy = log(model.perplexity(dev_sents))
+        new_row = {
+            "language" : LANGUAGES_TYPOLOGIES[language][0],
+            "train_entropy" : train_entropy,
+            "dev_entropy" : dev_entropy
+        }
+        evaluation = evaluation.append(new_row, ignore_index=True)
+    return evaluation
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--train_files_directory',
+        required=True,
+        help="The directory containing the OpenSubtitles training files"
+        )
+    parser.add_argument('--dev_files_directory',
+        required=True,
+        help="The directory containing the OpenSubtitles test files"
+        )
+    parser.add_argument('--models_directory',
+        required=True,
+        help="The directory containing the trained language models"
+        )
+
+    args = parser.parse_args()
+    train_files = args.train_files_directory
+    dev_files = args.dev_files_directory
+    models_directory = args.models_directory
+    if not os.path.exists("results"):
+        os.makedirs("results")
+    evaluate(train_files,
+                dev_files,
+                models_directory).to_csv("results/evaluation.csv")

+ 47 - 36
code/make_noiser.py

@@ -9,32 +9,38 @@ random.seed(80)
 
 class Noise :
     """
-    This class simulate noise in the data. Crucially, noise can be made on three points :\
-    (1) The noise of phonemes order of a given sequence by making the order of the sequence more aribitrary,\
-    (2) Replacement of some phonemes of a given sequence by arbitrary sampled phonemes from a vocabulary and\
+    This class simulates noise in the data. Crucially,\
+    noise can be made on three points :\
+    (1) The noise of phonemes order of a given sequence\
+    by making the order of the sequence more arbitrary,\
+    (2) Replacement of some phonemes of a given sequence\
+    by arbitrarily sampled phonemes from a vocabulary and\
     (3) By arbitrary interverting some sequences of two different speakers.
 
     Atributes
     ---------
     - phonemes_order_noise :
-        Parameter for controling the degree of noise at the level of phonemes order. See the point 1 mentioned above.
+        Parameter for controlling the degree of noise at the level\
+        of phonemes order. See the point 1 mentioned above.
     - speakers_noise :
-        Parameters for controling the degree of noise at the level of speakers. See the point 3 mentioned above.
+        Parameters for controlling the degree of noise at the level\
+        of speakers. See the point 3 mentioned above.
     - phonemes_noise :
-        Parameter for controling the degree of noise at the level of phonemes. See the point 2 mentioned above.
+        Parameter for controlling the degree of noise at the level of phonemes.
+        See the point 2 mentioned above.
     """
 
     def __init__(self,
                     most_probable_phonemes: list,
-                    phonemes_order_noise=0.3,
-                    speakers_noise=(0.5, 0.5),
-                    phonemes_noise=0.5) :
+                    phonemes_order_noise_value=0.3,
+                    speakers_noise_values=(0.5, 0.5),
+                    phonemes_noise_value=0.5) :
         self.most_probable_phonemes = most_probable_phonemes
-        self.phonemes_order_noise = phonemes_order_noise
-        self.speakers_noise = speakers_noise
-        self.phonemes_noise = phonemes_noise
-    
-    def _order_noise(self, sequence: List[str]) -> str :
+        self.phonemes_order_noise_value = phonemes_order_noise_value
+        self.speakers_noise_values = speakers_noise_values
+        self.phonemes_noise_value = phonemes_noise_value
+
+    def order_noise(self, sequence: List[str]) -> str :
         """
         Making noise the order of the phonemes in a given sequence
 
@@ -42,24 +48,24 @@ class Noise :
         ----------
         - sequence : list
             The sequence for which the phonemes order must be noised.
-        
+
         Returns
         -------
         - str :
             The sequence with the order of phonemes noised.
         """
         # number of phonemes to noise in the sequence = len(sequence) / nb_phonemes_to_noise
-        phonemes_to_noise = round(len(sequence) * self.phonemes_order_noise)
+        phonemes_to_noise = round(len(sequence) * self.phonemes_order_noise_value)
         # sample nb_phonemes_to_noise positions in the sequence
         positions_sampled = list(sample(range(len(sequence)), k=phonemes_to_noise))
         copied_positions = copy.deepcopy(positions_sampled)
         shuffle(copied_positions)
-        # change the positions of the sampled phonemes 
+        # change the positions of the sampled phonemes
         for original_position, new_position in zip(positions_sampled, copied_positions):
             sequence[original_position] = sequence[new_position]
         return " ".join(sequence)
 
-    def _phonemes_noise(self, sequence: List[str]) -> str :
+    def phonemes_noise(self, sequence: List[str]) -> str :
         """
         Makinng noise the phonemes of the sequence by replacing\
         some phonemes of the sequence by arbitrary sampled phonemes\
@@ -69,13 +75,14 @@ class Noise :
         ----------
         - sequence : list
             The sequence for which the phonemes must be noised.
-        
+
         Returns
         -------
         - str :
             The sequence with noised phonemes.
         """
-        phonemes_to_noise = round(len(sequence) * self.phonemes_noise)
+        phonemes_to_noise = round(len(sequence) * self.phonemes_noise_value)
+        assert phonemes_to_noise < len(sequence), "Number of phoneme to noise greather that sequence's length"
         indexes = choices(range(len(sequence)), k=phonemes_to_noise)
         # choose new phonemes only from the most probable phonemes.
         phonemes = choices(self.most_probable_phonemes, k=phonemes_to_noise)
@@ -83,8 +90,8 @@ class Noise :
         for idx, phonemes in zip(indexes, phonemes) :
             sequence[idx] = phonemes
         return " ".join(sequence)
-    
-    def _speakers_noise(self, speakers_sequences: Dict[str, set]) -> Dict[str, set] :
+
+    def speakers_noise(self, speakers_sequences: Dict[str, set]) -> Dict[str, set] :
         """
         Making noise in the speaker's statements.
 
@@ -92,14 +99,15 @@ class Noise :
         ----------
         - speakers_sequences : dict
             Dictionary containing the utterances for each speaker.
-        
+
         Returns
         -------
         - dict :
-            The dictionary containing the few statements interchanged between the two speakers.
+            The dictionary containing the few statements\
+            interchanged between the two speakers.
         """
         first_speaker, second_speaker = "Target_Child", "Adult"
-        noise_first_speaker, noise_second_speaker = self.speakers_noise
+        noise_first_speaker, noise_second_speaker = self.speakers_noise_values
         speakers_sequences[second_speaker] = set(speakers_sequences[second_speaker])
         speakers_sequences[first_speaker] = set(speakers_sequences[first_speaker])
         # sample some percentage of utterances from each speaker
@@ -115,9 +123,9 @@ class Noise :
         # set to list
         speakers_sequences[first_speaker] = list(speakers_sequences[first_speaker])
         speakers_sequences[second_speaker] = list(speakers_sequences[second_speaker])
-        
+
         return speakers_sequences
-    
+
     def __call__(self, loaded_dataset: dict) -> dict:
         """
         Apply the three types of noise.
@@ -125,8 +133,9 @@ class Noise :
         Parameters
         ----------
         loaded_dataset : dict
-            The dictionary containing the utterances for each family, at each and for each speaker.
-        
+            The dictionary containing the utterances for each family,\
+            at each age and for each speaker.
+
         Returns
         -------
         dict :
@@ -137,15 +146,17 @@ class Noise :
                 if "Adult" in loaded_dataset[family][age] :
                     for idx, utterance in enumerate(loaded_dataset[family][age]["Adult"]) :
                         splitted_utterance = utterance.split(" ")
-                        loaded_dataset[family][age]["Adult"][idx] = self._order_noise(splitted_utterance)
-                        loaded_dataset[family][age]["Adult"][idx] = self._phonemes_noise(splitted_utterance)
+                        loaded_dataset[family][age]["Adult"][idx] = self.order_noise(splitted_utterance)
+                        loaded_dataset[family][age]["Adult"][idx] = self.phonemes_noise(splitted_utterance)
                 if "Target_Child" in loaded_dataset[family][age] :
                     for idx, utterance in enumerate(loaded_dataset[family][age]["Target_Child"]) :
                         splitted_utterance = utterance.split(" ")
-                        loaded_dataset[family][age]["Target_Child"][idx] = self._order_noise(splitted_utterance)
-                        loaded_dataset[family][age]["Target_Child"][idx] = self._phonemes_noise(splitted_utterance)
-                if "Target_Child" in loaded_dataset[family][age] and "Adult" in loaded_dataset[family][age] and all(self.speakers_noise):
-                    noised_speaker = self._speakers_noise(loaded_dataset[family][age])
+                        loaded_dataset[family][age]["Target_Child"][idx] = self.order_noise(splitted_utterance)
+                        loaded_dataset[family][age]["Target_Child"][idx] = self.phonemes_noise(splitted_utterance)
+                if("Target_Child" in loaded_dataset[family][age]
+                    and "Adult" in loaded_dataset[family][age]
+                    and all(self.speakers_noise_values)):
+                    noised_speaker = self.speakers_noise(loaded_dataset[family][age])
                     loaded_dataset[family][age]["Target_Child"] = noised_speaker["Target_Child"]
                     loaded_dataset[family][age]["Adult"] = noised_speaker["Adult"]
-        return loaded_dataset
+        return loaded_dataset

+ 61 - 87
code/test_on_all_languages.py

@@ -1,20 +1,14 @@
 import os
-import sys
+import random
 import json
-sys.path.append("./")
-sys.path.append("../")
-sys.path.append(".../")
+from math import log
+from typing import Iterable
 from itertools import product
 from tqdm import tqdm
 import kenlm
-from math import log
-import numpy as np
 from make_noiser import Noise
 import pandas as pd
-import sys
 from get_most_probable_phonemes import get_most_probable_phonemes
-import random
-from collections import Counter
 random.seed(1023)
 
 
@@ -26,29 +20,29 @@ LANGUAGES_TYPOLOGIES = {
     'et' : ("Estonian", "agglutinative"),
     'eu' : ("Basque", "agglutinative"),
     'fr' : ("French", "fusional"),
-    'ja' : ("Japanese", "agglutinative"), 
-    'pl' : ("Polish", "fusional"), 
-    'pt' : ("Portuguese", "fusional"), 
-    'sr' : ("Serbian", "fusional"), 
+    'ja' : ("Japanese", "agglutinative"),
+    'pl' : ("Polish", "fusional"),
+    'pt' : ("Portuguese", "fusional"),
+    'sr' : ("Serbian", "fusional"),
     'tr' : ("Turkish", "agglutinative")}
 
-def compute_word_frequencies(word_train_corpus, pct=0.95) :
-    frequencies = Counter()
-    for line in word_train_corpus :
-        line = line.strip()
-        if not line : continue
-        # line = line.strip()
-        frequencies.update(Counter(line.split(" ")))
-    return dict(frequencies)
-
-
-def statistics_word(utterances, word_frequencies, model) :
+def statistics_word(utterances: list, model: kenlm.Model) -> dict:
+    """
+    This function will test a given language model\
+    on a given list of utterances.\
+    The function will also compute some statistics; MLU, TTR, etc
+
+    Parameters
+    ----------
+    - model
+        The estimated language model
+    - utterances: list
+        The utterances to test
+    """
     phoneme_utterances = []
     unique_words = set()
-    nb_unk = 0
     mlu_w = 0.0
     mlu_p = 0.0
-    mean_word_frequencies = 0
     nb_utterances = 0
     nb_words = 0
 
@@ -68,13 +62,6 @@ def statistics_word(utterances, word_frequencies, model) :
         nb_words += len(utterance_words)
         unique_words |= set(utterance_words)
 
-        for word in utterance_words :
-            word = word.strip()
-            if word in word_frequencies :
-                mean_word_frequencies += word_frequencies[word]
-            else : 
-                nb_unk += 1
-    
     mlu_w /= nb_utterances
     mlu_p /= nb_utterances
     ttr_w = len(unique_words) / nb_words
@@ -87,60 +74,62 @@ def statistics_word(utterances, word_frequencies, model) :
     statistics["mlu_w"] = mlu_w
     statistics["mlu_p"] = mlu_p
     statistics["ttr_w"] = ttr_w
-    statistics["mean_word_frequencies"] = mean_word_frequencies
-    statistics["nb_unk"] = nb_unk
 
     return statistics
 
-def create_sparse_combinantions(values) :
+def create_sparse_combinantions(values: Iterable) -> set:
+    """
+    This function will create combinations for noising.
+    Each item in the returned set contains four values corresponding\
+    to (1) phoneme noise, (2) noise from adult to child utterances,\
+    (3) noise from child to adult utterances and (4) phoneme order noise.
+    These combinations are sparse because we only noise one value at a time.
+    For example, an item can be (0.25, 0.0, 0.0, 0.0), which means that we only
+    noise 25 percent of the phonemes, and nothing else is affected.
+    See the file make_noiser.py for more information.
+    """
     sparse_combinantions = []
     for value in values :
-        for idx in range(len(values)) : 
+        for idx in range(len(values)) :
             sparse_values = [0.0] * len(values)
             sparse_values[idx] = value
             sparse_combinantions.append(tuple(sparse_values))
     return set(sparse_combinantions)
 
-def test(json_files_directory, models_directory, phoneme_train_files, word_train_files, add_noise=False) :
+def test(json_files_directory, models_directory, train_files, add_noise=True) :
     """
+    This function will test the language models on CHILDES corpora
     """
     columns = ["language", "typology", "family", "speaker",\
                 "age", "perplexity", "entropy", "mlu", "mlu_without_repetition",\
                 "phonemes_order_noise", "speakers_noise_adult",\
                 "speakers_noise_child", "phonemes_noise"]
     results = pd.DataFrame(columns=columns, index=None)
-    all_combinations = list(product((0.0, 0.25, 0.5, 0.75), repeat=4)) if add_noise else [((0.0, 0.0, 0.0, 0.0))]
-    # sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
+    # all_combinations = (list(product((0.0, 0.25, 0.5, 0.75), repeat=4))
+    #                       if add_noise else [((0.0, 0.0, 0.0, 0.0))])
+    sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
     # noise_values = np.linspace(0.0, 1.0, num=6)
-    for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(all_combinations, total=len(all_combinations)) :
+    for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
         for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
             lg_iso, _ = test_filename.split(".")
             model_lg = model_filename.split(".")[0]
-            if lg_iso != model_lg : continue
-            print(lg_iso, model_lg)
-            most_probable_phonemes = get_most_probable_phonemes(f"{phoneme_train_files}/{lg_iso}.one_sentence_per_line")
-            word_frequencies = compute_word_frequencies(f"{word_train_files}/{lg_iso}.one_sentence_per_line")
+            if lg_iso != model_lg :
+                continue
+            most_probable_phonemes = get_most_probable_phonemes(f"{train_files}/{lg_iso}.one_sentence_per_line")
             loaded_json = json.load(open(f"{json_files_directory}/{test_filename}"))
             if add_noise :
                 noise = Noise(most_probable_phonemes,
-                                phonemes_order_noise=phonemes_order_noise,
-                                speakers_noise=(speakers_noise_child, speakers_noise_adult),
-                                phonemes_noise=phonemes_noise)
+                                phonemes_order_noise_value=phonemes_order_noise,
+                                speakers_noise_values=(speakers_noise_child, speakers_noise_adult),
+                                phonemes_noise_value=phonemes_noise)
                 loaded_json = noise(loaded_json)
             model = kenlm.Model(f"{models_directory}/{model_filename}")
             for family in loaded_json :
-                for age in loaded_json[family] : 
+                for age in loaded_json[family] :
                     if age == "None" : print(family, lg_iso, age); continue
                     for speaker in loaded_json[family][age] :
                         if speaker not in ["Adult", "Target_Child"] : continue
-                        # test_utterances = "\n".join(loaded_json[family][age][speaker])
-                        # utterances = [utterance.split(" ") for utterance in loaded_json[family][age][speaker]]
-                        # mlu = np.mean([len(utterance) for utterance in utterances])
-                        # mlu_without_repetition = np.mean([len(set(utterance)) for utterance in utterances])
-                        # ppl = model.perplexity(test_utterances)
-                        # entropy = log(ppl)
-
-                        results_statistics = statistics_word(loaded_json[family][age][speaker], word_frequencies, model)
+                        results_statistics = statistics_word(loaded_json[family][age][speaker], model)
                         language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                         new_row =  {"language" : language,
                                     "typology" : typology,
@@ -152,8 +141,6 @@ def test(json_files_directory, models_directory, phoneme_train_files, word_train
                                     "mlu_w" : results_statistics["mlu_w"],
                                     "mlu_p" : results_statistics["mlu_p"],
                                     "ttr_w" : results_statistics["ttr_w"],
-                                    "mean_word_frequencies" : results_statistics["mean_word_frequencies"],
-                                    "nb_unk" : results_statistics["nb_unk"],
                                     "phonemes_order_noise" : phonemes_order_noise,
                                     "speakers_noise_adult" : speakers_noise_adult,
                                     "speakers_noise_child" : speakers_noise_child,
@@ -164,46 +151,33 @@ if __name__ == "__main__":
     from argparse import ArgumentParser, BooleanOptionalAction
 
     parser = ArgumentParser()
-    parser.add_argument('--phoneme_train_directory',
-        required=True,
-        help="Dataset containing the train files in phonemes (dot one_sentence_per_line) "
-        )
-    parser.add_argument('--word_train_directory',
+    parser.add_argument('--train_directory',
         required=True,
-        help="Dataset containing the train files in words (dot one_sentence_per_line) "
+        help="The directory containing the train files tokenized in phonemes."
         )
-    parser.add_argument('--models_directory', 
+    parser.add_argument('--models_directory',
         required=True,
-        help="Folder containing the estimated parameters"
+        help="The directory containing the trained language models."
         )
-    
+
     parser.add_argument('--json_files_directory',
         required=True,
-        help="Directory containing json files for test"
+        help="The directory containing CHILDES utterances in json format for each language"
         )
-    
-    parser.add_argument('--out_dirname',
-        required=True,
-        help="Out directory"
-        )
-    parser.add_argument('--out_filename',
-            required=True,
-            help="Out filename"
-            )
-    parser.add_argument("--add_noise", action=BooleanOptionalAction)
+
+    parser.add_argument("--add_noise",
+        help="Whether noise the CHILDES utterances or not",
+        action=BooleanOptionalAction)
 
     args = parser.parse_args()
     add_noise = args.add_noise
     json_files_directory = args.json_files_directory
-    phoneme_train_files, word_train_files = args.phoneme_train_directory, args.word_train_directory
+    phoneme_train_files = args.train_directory
     models_directory = args.models_directory
-    out_dirname = args.out_dirname
-    out_filename = args.out_filename
 
     if not os.path.exists("results"):
         os.makedirs("results")
-    test(json_files_directory, 
-        models_directory, 
-        phoneme_train_files,
-        word_train_files, 
-        add_noise).to_csv(f"{out_dirname}/{out_filename}.csv")
+    test(json_files_directory,
+            models_directory,
+            phoneme_train_files,
+            add_noise=add_noise).to_csv("results/results.csv")

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Xf/J3/MD5E-s15872283--589ddd6867dae20a40048aaed160f751/MD5E-s15872283--589ddd6867dae20a40048aaed160f751

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/PK/VG/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/k5/41/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Vj/Zq/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/kK/m3/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Z5/WJ/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/xQ/Mz/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/Fx/P8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/8k/jP/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/pq/Gx/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/VK/59/MD5E-s15029711--76eb34c274a6e5ab100575ce2fa60ab0/MD5E-s15029711--76eb34c274a6e5ab100575ce2fa60ab0

+ 0 - 1
datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line

@@ -1 +0,0 @@
-../../../.git/annex/objects/FM/Vk/MD5E-s22131015--b159f0714684e9fc60945904418a1240/MD5E-s22131015--b159f0714684e9fc60945904418a1240

+ 1 - 0
environment.yml

@@ -0,0 +1 @@
+.git/annex/objects/V8/56/MD5E-s5478--6ce1a1dfc33f3c2aee2a0c4f0f11aa02.yml/MD5E-s5478--6ce1a1dfc33f3c2aee2a0c4f0f11aa02.yml

+ 1 - 0
estimated/da.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/5x/74/MD5E-s28028840--71fbf9fb169884d736da26c047e16f4e.arpa/MD5E-s28028840--71fbf9fb169884d736da26c047e16f4e.arpa

+ 1 - 0
estimated/de.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Z2/0W/MD5E-s22540364--11e64685c900b25e47a7c2a137dd7a9b.arpa/MD5E-s22540364--11e64685c900b25e47a7c2a137dd7a9b.arpa

+ 1 - 0
estimated/en.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/KG/5q/MD5E-s31436879--847b2a7d2e5210d87f638963a8764808.arpa/MD5E-s31436879--847b2a7d2e5210d87f638963a8764808.arpa

+ 1 - 0
estimated/es.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Zq/pj/MD5E-s10061705--b466f7fc80c31c74891f85256d324c43.arpa/MD5E-s10061705--b466f7fc80c31c74891f85256d324c43.arpa

+ 1 - 0
estimated/et.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/w4/9Q/MD5E-s18873182--89176dfdd746dd62fe277cf760489709.arpa/MD5E-s18873182--89176dfdd746dd62fe277cf760489709.arpa

+ 1 - 0
estimated/eu.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/vZ/2G/MD5E-s12176188--ae20d403fb51fef0b7572521b95d47a9.arpa/MD5E-s12176188--ae20d403fb51fef0b7572521b95d47a9.arpa

+ 1 - 0
estimated/fr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/QG/ff/MD5E-s20901089--1873e4fa871af748a4028e962a941b74.arpa/MD5E-s20901089--1873e4fa871af748a4028e962a941b74.arpa

+ 1 - 0
estimated/ja.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/6W/kM/MD5E-s8026445--d320df753b865052827e96c0be67e418.arpa/MD5E-s8026445--d320df753b865052827e96c0be67e418.arpa

+ 1 - 0
estimated/pl.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/5j/46/MD5E-s23833364--0c4492ab80d3c7f37ff923288dc88d80.arpa/MD5E-s23833364--0c4492ab80d3c7f37ff923288dc88d80.arpa

+ 1 - 0
estimated/pt.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/ZF/pz/MD5E-s22346672--1a9f56836b07f9a0d981e329ce47e1c9.arpa/MD5E-s22346672--1a9f56836b07f9a0d981e329ce47e1c9.arpa

+ 1 - 0
estimated/sr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/6M/xg/MD5E-s20755431--b4f26a89a36c9c4a61bb39a00c83c116.arpa/MD5E-s20755431--b4f26a89a36c9c4a61bb39a00c83c116.arpa

+ 1 - 0
estimated/tr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Gf/70/MD5E-s18935056--4fe9ce073a5c9cb9e601fa1424524c3a.arpa/MD5E-s18935056--4fe9ce073a5c9cb9e601fa1424524c3a.arpa

+ 1 - 0
results/evaluation.csv

@@ -0,0 +1 @@
+../.git/annex/objects/5g/Gj/MD5E-s607--3a8fb15dbd039d29e12a0e126c73112d.csv/MD5E-s607--3a8fb15dbd039d29e12a0e126c73112d.csv

+ 0 - 1
results/results_for_study2_datalad.csv

@@ -1 +0,0 @@
-../.git/annex/objects/gg/pJ/MD5E-s1271727--76f1f59ac68fe619ae34bd03ede011a8.csv/MD5E-s1271727--76f1f59ac68fe619ae34bd03ede011a8.csv

+ 0 - 1
ter

@@ -1 +0,0 @@
-.git/annex/objects/K5/g1/MD5E-s11843--f7a613986d426f7c5667fc59487dd559/MD5E-s11843--f7a613986d426f7c5667fc59487dd559