Browse Source

add rmarkdown code

yaya-sy 1 year ago
parent
commit
b4fa96e143
50 changed files with 120 additions and 66 deletions
  1. 8 3
      README.md
  2. BIN
      code/__pycache__/get_most_probable_phonemes.cpython-310.pyc
  3. BIN
      code/__pycache__/make_noiser.cpython-310.pyc
  4. BIN
      code/__pycache__/utterances_cleaner.cpython-310.pyc
  5. 3 1
      code/download_childes_corpora.py
  6. 2 3
      code/get_most_probable_phonemes.py
  7. 5 6
      code/make_noiser.py
  8. 2 1
      code/one_utterance_per_line_to_json.py
  9. 18 21
      code/test_on_all_languages.py
  10. 43 18
      code/utterances_cleaner.py
  11. 0 1
      commands_reproduction.txt
  12. 1 1
      datasets/childes_json_corpora/da.json
  13. 1 1
      datasets/childes_json_corpora/de.json
  14. 1 1
      datasets/childes_json_corpora/en.json
  15. 1 1
      datasets/childes_json_corpora/es.json
  16. 1 1
      datasets/childes_json_corpora/et.json
  17. 1 1
      datasets/childes_json_corpora/eu.json
  18. 1 1
      datasets/childes_json_corpora/fr.json
  19. 1 1
      datasets/childes_json_corpora/ja.json
  20. 1 1
      datasets/childes_json_corpora/pl.json
  21. 1 1
      datasets/childes_json_corpora/pt.json
  22. 1 1
      datasets/childes_json_corpora/sr.json
  23. 1 1
      datasets/childes_json_corpora/tr.json
  24. 1 0
      datasets/childes_one_utterance_per_line_files/da.one_utterance_per_line
  25. 1 0
      datasets/childes_one_utterance_per_line_files/de.one_utterance_per_line
  26. 1 0
      datasets/childes_one_utterance_per_line_files/en.one_utterance_per_line
  27. 1 0
      datasets/childes_one_utterance_per_line_files/es.one_utterance_per_line
  28. 1 0
      datasets/childes_one_utterance_per_line_files/et.one_utterance_per_line
  29. 1 0
      datasets/childes_one_utterance_per_line_files/eu.one_utterance_per_line
  30. 1 0
      datasets/childes_one_utterance_per_line_files/fr.one_utterance_per_line
  31. 1 0
      datasets/childes_one_utterance_per_line_files/ja.one_utterance_per_line
  32. 1 0
      datasets/childes_one_utterance_per_line_files/not_downloaded_data.txt
  33. 1 0
      datasets/childes_one_utterance_per_line_files/pl.one_utterance_per_line
  34. 1 0
      datasets/childes_one_utterance_per_line_files/pt.one_utterance_per_line
  35. 1 0
      datasets/childes_one_utterance_per_line_files/sr.one_utterance_per_line
  36. 1 0
      datasets/childes_one_utterance_per_line_files/tr.one_utterance_per_line
  37. 1 0
      estimated/da.one_sentence_per_line.arpa
  38. 1 0
      estimated/de.one_sentence_per_line.arpa
  39. 1 0
      estimated/en.one_sentence_per_line.arpa
  40. 1 0
      estimated/es.one_sentence_per_line.arpa
  41. 1 0
      estimated/et.one_sentence_per_line.arpa
  42. 1 0
      estimated/eu.one_sentence_per_line.arpa
  43. 1 0
      estimated/fr.one_sentence_per_line.arpa
  44. 1 0
      estimated/ja.one_sentence_per_line.arpa
  45. 1 0
      estimated/pl.one_sentence_per_line.arpa
  46. 1 0
      estimated/pt.one_sentence_per_line.arpa
  47. 1 0
      estimated/sr.one_sentence_per_line.arpa
  48. 1 0
      estimated/tr.one_sentence_per_line.arpa
  49. 1 0
      final_results_analysis.Rmd
  50. 1 0
      results/results.csv

+ 8 - 3
README.md

@@ -9,6 +9,8 @@ and activate it :
 
 
 ```conda activate measuring_cld```
 ```conda activate measuring_cld```
 
 
+We provide all the data already pre-processed and phonemized. However, if you want to re-download the raw data and pre-process it again entirely, you will need to install phonemizer (https://github.com/bootphon/phonemizer) with the espeak backend.
+
 ## Folder structure
 ## Folder structure
 
 
 - All source code is located in `code/`
 - All source code is located in `code/`
@@ -67,9 +69,12 @@ This will output a `evalution.csv` file in a `results` folder.
 We can now compute the entropies on the CHILDES utterances with the script `code/test_on_all_languages.py`. This script takes the following arguments:
 We can now compute the entropies on the CHILDES utterances with the script `code/test_on_all_languages.py`. This script takes the following arguments:
 
 
 > `--train_directory` : The directory containing the train files tokenized in phonemes.
 > `--train_directory` : The directory containing the train files tokenized in phonemes.
-> `--models_directory`: The directory containing the trained language models.
->  --json_files_directory: The directory containing CHILDES utterances in json format for each language.
->  --add_noise, --no-add_noise: Whether noise the CHILDES utterances or not.
+
+> `--models_directory` : The directory containing the trained language models.
+
+> `--json_files_directory`: The directory containing CHILDES utterances in json format for each language.
+
+> `--add_noise`, `--no-add_noise` : Whether or not to add noise to the CHILDES utterances.
 
 
 If you stored the language models in the `estimated/` folder, then you can run the script like that :
 If you stored the language models in the `estimated/` folder, then you can run the script like that :
 
 

BIN
code/__pycache__/get_most_probable_phonemes.cpython-310.pyc


BIN
code/__pycache__/make_noiser.cpython-310.pyc


BIN
code/__pycache__/utterances_cleaner.cpython-310.pyc


+ 3 - 1
code/download_childes_corpora.py

@@ -174,7 +174,6 @@ class DownloadChildCorpora :
         - phonemize_child: bool
         - phonemize_child: bool
 
 
         """
         """
-        print(language, languages_to_download_informations[language]["ort_tier"])
         participants_to_consider = languages_to_download_informations[language]["participants"]
         participants_to_consider = languages_to_download_informations[language]["participants"]
         downloading_file = open(f"{out_dirname}/{language}.one_utterance_per_line", 
         downloading_file = open(f"{out_dirname}/{language}.one_utterance_per_line", 
                                 "w", encoding="UTF-8")
                                 "w", encoding="UTF-8")
@@ -206,6 +205,9 @@ class DownloadChildCorpora :
                                                                     participant,
                                                                     participant,
                                                                     backend,
                                                                     backend,
                                                                     phonemize_child=phonemize_child) :
                                                                     phonemize_child=phonemize_child) :
+                        utterance = utterance.replace("$", " ").replace("@", " ")
+                        utterance = self.utterances_cleaner.remove_multiple_spaces(utterance)
+                        utterance = utterance.strip()
                         if not utterance :
                         if not utterance :
                             continue
                             continue
                         family_name = "_".join((corpus, family))
                         family_name = "_".join((corpus, family))

+ 2 - 3
code/get_most_probable_phonemes.py

@@ -7,9 +7,8 @@ def get_most_probable_phonemes(one_sentence_per_line_file, p=0.007) :
     counts = defaultdict(int)
     counts = defaultdict(int)
     for sentence in open(one_sentence_per_line_file) :
     for sentence in open(one_sentence_per_line_file) :
         sentence = sentence.rstrip()
         sentence = sentence.rstrip()
-        for word in sentence.split("@") :
-            for phoneme in word.split("$") :
-                counts[phoneme] += 1
+        for phoneme in sentence.split(" ") :
+            counts[phoneme] += 1
     total = sum(counts.values())
     total = sum(counts.values())
     for phoneme in counts :
     for phoneme in counts :
         counts[phoneme] /= total
         counts[phoneme] /= total

+ 5 - 6
code/make_noiser.py

@@ -9,8 +9,8 @@ random.seed(80)
 
 
 class Noise :
 class Noise :
     """
     """
-    This class simulate noise in the data. Crucially,\
-    noise can be made on three points :\
+    This class simulates noise in the data. Crucially,\
+    noise can be made on three cases :\
     (1) The noise of phonemes order of a given sequence\
     (1) The noise of phonemes order of a given sequence\
     by making the order of the sequence more aribitrary,\
     by making the order of the sequence more aribitrary,\
     (2) Replacement of some phonemes of a given sequence\
     (2) Replacement of some phonemes of a given sequence\
@@ -22,16 +22,16 @@ class Noise :
     - phonemes_order_noise :
     - phonemes_order_noise :
         Parameter for controling the degree of noise at the level\
         Parameter for controling the degree of noise at the level\
         of phonemes order. See the point 1 mentioned above.
         of phonemes order. See the point 1 mentioned above.
-    - speakers_noise :
+    - speakers_noise_values :
         Parameters for controling the degree of noise at the level\
         Parameters for controling the degree of noise at the level\
         of speakers. See the point 3 mentioned above.
         of speakers. See the point 3 mentioned above.
-    - phonemes_noise :
+    - phonemes_noise_value :
         Parameter for controling the degree of noise at the level of phonemes.
         Parameter for controling the degree of noise at the level of phonemes.
         See the point 2 mentioned above.
         See the point 2 mentioned above.
     """
     """
 
 
     def __init__(self,
     def __init__(self,
-                    most_probable_phonemes: list,
+                    most_probable_phonemes,
                     phonemes_order_noise_value=0.3,
                     phonemes_order_noise_value=0.3,
                     speakers_noise_values=(0.5, 0.5),
                     speakers_noise_values=(0.5, 0.5),
                     phonemes_noise_value=0.5) :
                     phonemes_noise_value=0.5) :
@@ -82,7 +82,6 @@ class Noise :
             The sequence with noised phonemes.
             The sequence with noised phonemes.
         """
         """
         phonemes_to_noise = round(len(sequence) * self.phonemes_noise_value)
         phonemes_to_noise = round(len(sequence) * self.phonemes_noise_value)
-        assert phonemes_to_noise < len(sequence), "Number of phoneme to noise greather that sequence's length"
         indexes = choices(range(len(sequence)), k=phonemes_to_noise)
         indexes = choices(range(len(sequence)), k=phonemes_to_noise)
         # choose new phonemes only from the most probable phonemes.
         # choose new phonemes only from the most probable phonemes.
         phonemes = choices(self.most_probable_phonemes, k=phonemes_to_noise)
         phonemes = choices(self.most_probable_phonemes, k=phonemes_to_noise)

+ 2 - 1
code/one_utterance_per_line_to_json.py

@@ -33,7 +33,8 @@ def one_utterance_per_line_to_json(directory: str, out_dirname: str) -> None:
         for participant_age_utterance in open(f"{directory}/{language_filename}") :
         for participant_age_utterance in open(f"{directory}/{language_filename}") :
             family, participant, age, utterance = participant_age_utterance.strip().split(",")
             family, participant, age, utterance = participant_age_utterance.strip().split(",")
             participant = "Adult" if participant in ["Father", "Mother"] else participant
             participant = "Adult" if participant in ["Father", "Mother"] else participant
-            if utterance in by_family[family][age][participant] : continue
+            if utterance in by_family[family][age][participant] :
+                continue
             by_family[family][age][participant].append(utterance)
             by_family[family][age][participant].append(utterance)
         with open(f"{out_dirname}/{language}.json", "w") as out_filename :
         with open(f"{out_dirname}/{language}.json", "w") as out_filename :
             json.dump(
             json.dump(

+ 18 - 21
code/test_on_all_languages.py

@@ -77,7 +77,7 @@ def statistics_word(utterances: list, model: kenlm.Model) -> dict:
 
 
     return statistics
     return statistics
 
 
-def create_sparse_combinantions(values: Iterable) -> set:
+def create_sparse_combinantions(values: Iterable, variables=3) -> set:
     """
     """
     This function will create combinantions for noising.
     This function will create combinantions for noising.
     Each item in the returned set contains four values corresponding\
     Each item in the returned set contains four values corresponding\
@@ -90,8 +90,8 @@ def create_sparse_combinantions(values: Iterable) -> set:
     """
     """
     sparse_combinantions = []
     sparse_combinantions = []
     for value in values :
     for value in values :
-        for idx in range(len(values)) :
-            sparse_values = [0.0] * len(values)
+        for idx in range(variables) :
+            sparse_values = [0.0] * variables
             sparse_values[idx] = value
             sparse_values[idx] = value
             sparse_combinantions.append(tuple(sparse_values))
             sparse_combinantions.append(tuple(sparse_values))
     return set(sparse_combinantions)
     return set(sparse_combinantions)
@@ -101,15 +101,14 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
     This function will test the language models on CHILDES corpora
     This function will test the language models on CHILDES corpora
     """
     """
     columns = ["language", "typology", "family", "speaker",\
     columns = ["language", "typology", "family", "speaker",\
-                "age", "perplexity", "entropy", "mlu", "mlu_without_repetition",\
-                "phonemes_order_noise", "speakers_noise_adult",\
-                "speakers_noise_child", "phonemes_noise"]
+                "age", "perplexity", "entropy", "phonemes_order_noise",\
+                "speakers_noise", "phonemes_noise"]
     results = pd.DataFrame(columns=columns, index=None)
     results = pd.DataFrame(columns=columns, index=None)
     # all_combinations = (list(product((0.0, 0.25, 0.5, 0.75), repeat=4))
     # all_combinations = (list(product((0.0, 0.25, 0.5, 0.75), repeat=4))
     #                       if add_noise else [((0.0, 0.0, 0.0, 0.0))])
     #                       if add_noise else [((0.0, 0.0, 0.0, 0.0))])
-    sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
+    sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75, 1))
     # noise_values = np.linspace(0.0, 1.0, num=6)
     # noise_values = np.linspace(0.0, 1.0, num=6)
-    for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
+    for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
         for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
         for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
             lg_iso, _ = test_filename.split(".")
             lg_iso, _ = test_filename.split(".")
             model_lg = model_filename.split(".")[0]
             model_lg = model_filename.split(".")[0]
@@ -120,7 +119,7 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
             if add_noise :
             if add_noise :
                 noise = Noise(most_probable_phonemes,
                 noise = Noise(most_probable_phonemes,
                                 phonemes_order_noise_value=phonemes_order_noise,
                                 phonemes_order_noise_value=phonemes_order_noise,
-                                speakers_noise_values=(speakers_noise_child, speakers_noise_adult),
+                                speakers_noise_values=(speakers_noise, speakers_noise),
                                 phonemes_noise_value=phonemes_noise)
                                 phonemes_noise_value=phonemes_noise)
                 loaded_json = noise(loaded_json)
                 loaded_json = noise(loaded_json)
             model = kenlm.Model(f"{models_directory}/{model_filename}")
             model = kenlm.Model(f"{models_directory}/{model_filename}")
@@ -129,21 +128,19 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
                     if age == "None" : print(family, lg_iso, age); continue
                     if age == "None" : print(family, lg_iso, age); continue
                     for speaker in loaded_json[family][age] :
                     for speaker in loaded_json[family][age] :
                         if speaker not in ["Adult", "Target_Child"] : continue
                         if speaker not in ["Adult", "Target_Child"] : continue
-                        results_statistics = statistics_word(loaded_json[family][age][speaker], model)
+                        # results_statistics = statistics_word(loaded_json[family][age][speaker], model)
                         language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                         language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
+                        ppl = model.perplexity("\n".join(loaded_json[family][age][speaker]))
+                        entropy = log(ppl)
                         new_row =  {"language" : language,
                         new_row =  {"language" : language,
                                     "typology" : typology,
                                     "typology" : typology,
                                     "family" : family,
                                     "family" : family,
                                     "speaker" : speaker,
                                     "speaker" : speaker,
                                     "age" : float(age),
                                     "age" : float(age),
-                                    "perplexity" : results_statistics["ppl"],
-                                    "entropy" : results_statistics["entropy"],
-                                    "mlu_w" : results_statistics["mlu_w"],
-                                    "mlu_p" : results_statistics["mlu_p"],
-                                    "ttr_w" : results_statistics["ttr_w"],
+                                    "perplexity" : ppl,
+                                    "entropy" : entropy,
                                     "phonemes_order_noise" : phonemes_order_noise,
                                     "phonemes_order_noise" : phonemes_order_noise,
-                                    "speakers_noise_adult" : speakers_noise_adult,
-                                    "speakers_noise_child" : speakers_noise_child,
+                                    "speakers_noise" : speakers_noise,
                                     "phonemes_noise" : phonemes_noise}
                                     "phonemes_noise" : phonemes_noise}
                         results = results.append(new_row, ignore_index=True)
                         results = results.append(new_row, ignore_index=True)
     return results
     return results
@@ -151,11 +148,11 @@ if __name__ == "__main__":
     from argparse import ArgumentParser, BooleanOptionalAction
     from argparse import ArgumentParser, BooleanOptionalAction
 
 
     parser = ArgumentParser()
     parser = ArgumentParser()
-    parser.add_argument('--train_directory',
+    parser.add_argument('--train_files_directory',
         required=True,
         required=True,
         help="The directory containing the train files tokenized in phonemes."
         help="The directory containing the train files tokenized in phonemes."
         )
         )
-    parser.add_argument('--models_directory',
+    parser.add_argument('--model_files_directory',
         required=True,
         required=True,
         help="The directory containing the trained language models."
         help="The directory containing the trained language models."
         )
         )
@@ -172,8 +169,8 @@ if __name__ == "__main__":
     args = parser.parse_args()
     args = parser.parse_args()
     add_noise = args.add_noise
     add_noise = args.add_noise
     json_files_directory = args.json_files_directory
     json_files_directory = args.json_files_directory
-    phoneme_train_files = args.train_directory
-    models_directory = args.models_directory
+    phoneme_train_files = args.train_files_directory
+    models_directory = args.model_files_directory
 
 
     if not os.path.exists("results"):
     if not os.path.exists("results"):
         os.makedirs("results")
         os.makedirs("results")

+ 43 - 18
code/utterances_cleaner.py

@@ -1,4 +1,3 @@
-# pylint: disable=no-member
 """This module contains an implementation of a class that help /
 """This module contains an implementation of a class that help /
     to clean orthographic or IPA transcripts of utterances. /
     to clean orthographic or IPA transcripts of utterances. /
     Crucially, this class will clean utterances by removing or replacing /
     Crucially, this class will clean utterances by removing or replacing /
@@ -8,9 +7,11 @@
 import re
 import re
 import string
 import string
 
 
-
 class UtterancesCleaner :
 class UtterancesCleaner :
     """
     """
+    This class will clean utterances from CHILDES\
+    by deleting words, patterns and punctuation, or by\
+    replacing them with other things.
     """
     """
     def __init__(self, markers: dict) :
     def __init__(self, markers: dict) :
         self.delete_marker_pattern = '|'.join(markers["marker_to_delete"])
         self.delete_marker_pattern = '|'.join(markers["marker_to_delete"])
@@ -27,12 +28,12 @@ class UtterancesCleaner :
 
 
         Parameters
         Parameters
         ----------
         ----------
-        - utterance : list
-            list of words utterance
-        - pattern : list
-            regex pattern containing markers to delete from the utterance
+        - utterance : str
+            Utterance from which markers will be replaced
+        - pattern : str
+            Regex pattern containing markers to delete from the utterance
         - replacement :
         - replacement :
-            symbol that will replace markers
+            Symbol that will replace markers
         """
         """
         return " ".join(re.sub(pattern, replacement, word) for word in utterance.split(" "))
         return " ".join(re.sub(pattern, replacement, word) for word in utterance.split(" "))
 
 
@@ -43,8 +44,8 @@ class UtterancesCleaner :
 
 
         Parameters
         Parameters
         ----------
         ----------
-        - utterance : list
-            list of words utterance
+        - utterance : str
+            Utterance from which those words will be removed
         """
         """
         return " ".join(word for word in utterance.split(" ") \
         return " ".join(word for word in utterance.split(" ") \
             if not re.match(self.word_contains_delete_pattern, word))
             if not re.match(self.word_contains_delete_pattern, word))
@@ -57,14 +58,14 @@ class UtterancesCleaner :
         ----------
         ----------
         - utterance : str
         - utterance : str
             The utterance from which the punctuation will be removed.
             The utterance from which the punctuation will be removed.
-        
+
         Returns
         Returns
         -------
         -------
         str :
         str :
             The utterance without punctuations.
             The utterance without punctuations.
         """
         """
         return utterance.translate(str.maketrans('', '', string.punctuation))
         return utterance.translate(str.maketrans('', '', string.punctuation))
-    
+
     def remove_brackets(self, utterance: str) -> str :
     def remove_brackets(self, utterance: str) -> str :
         """
         """
         Remove brackets from a given utterance.
         Remove brackets from a given utterance.
@@ -73,7 +74,7 @@ class UtterancesCleaner :
         ----------
         ----------
         - utterance : str
         - utterance : str
             The utterance from which the brackets will be removed.
             The utterance from which the brackets will be removed.
-        
+
         Returns
         Returns
         -------
         -------
         str :
         str :
@@ -82,6 +83,15 @@ class UtterancesCleaner :
         return re.sub(r"[\(\[].*?[\)\]]", '', utterance)
         return re.sub(r"[\(\[].*?[\)\]]", '', utterance)
 
 
     def handle_repetitions(self, utterance: str) -> str:
     def handle_repetitions(self, utterance: str) -> str:
+        """
+        This function will repeat n times some units from\
+        a given utterance.
+
+        Parameters
+        ----------
+        utterance: str
+            Utterance from which some units will be repeated.
+        """
         while True:
         while True:
             matched = re.search(self.pattern_repetition, utterance)
             matched = re.search(self.pattern_repetition, utterance)
 
 
@@ -91,13 +101,27 @@ class UtterancesCleaner :
             all_match = matched.group(0)
             all_match = matched.group(0)
             separator = matched.group(1)
             separator = matched.group(1)
             word, repetitions = matched.group(2),matched.group(3)
             word, repetitions = matched.group(2),matched.group(3)
-            repeated_word = '{}{}'.format(separator, ' '.join([word] * int(repetitions)))
+            repeated_word = f"{separator}{' '.join([word] * int(repetitions))}"
 
 
             utterance = utterance.replace(all_match, repeated_word, 1)
             utterance = utterance.replace(all_match, repeated_word, 1)
 
 
         return utterance
         return utterance
-    
+
     def remove_multiple_spaces(self, utterance: str) -> str :
     def remove_multiple_spaces(self, utterance: str) -> str :
+        """
+        Remove multiple spaces from a given utterance.
+
+        Parameters
+        ----------
+        utterance: str
+            Utterance from which multiple successive spaces\
+            will be replaced.
+
+        Returns
+        -------
+        - str
+            Utterance without multiple successive spaces.
+        """
         return re.sub(' +', ' ', utterance)
         return re.sub(' +', ' ', utterance)
 
 
     def clean(self, utterance: str) -> str :
     def clean(self, utterance: str) -> str :
@@ -108,18 +132,19 @@ class UtterancesCleaner :
 
 
         Parameters
         Parameters
         ----------
         ----------
-        - utterances : list
-            list of utterances to clean
+        - utterances : str
+            Utterance to clean
         Returns
         Returns
         -------
         -------
-        - generator over cleaned utterances
+        - str
+            Cleaned utterance
         """
         """
         utterance = self.handle_repetitions(utterance)
         utterance = self.handle_repetitions(utterance)
         utterance = self.replace_marker(utterance, self.delete_marker_pattern, "")
         utterance = self.replace_marker(utterance, self.delete_marker_pattern, "")
         utterance = self.delete_words(utterance)
         utterance = self.delete_words(utterance)
         utterance = self.replace_marker(utterance, self.poncts_to_delete_pattern, "")
         utterance = self.replace_marker(utterance, self.poncts_to_delete_pattern, "")
         utterance = self.replace_marker(utterance, self.delete_comments_pattern, "")
         utterance = self.replace_marker(utterance, self.delete_comments_pattern, "")
-        utterance = self.replace_marker(utterance, self.replace_unk_pattern, "") # pour mot non retranscrit
+        utterance = self.replace_marker(utterance, self.replace_unk_pattern, "")
         utterance = self.remove_brackets(utterance)
         utterance = self.remove_brackets(utterance)
         utterance = self.remove_ponctuations(utterance)
         utterance = self.remove_ponctuations(utterance)
         utterance = self.remove_multiple_spaces(utterance)
         utterance = self.remove_multiple_spaces(utterance)

+ 0 - 1
commands_reproduction.txt

@@ -1 +0,0 @@
-.git/annex/objects/8v/Zm/MD5E-s459--11999fdb245d2931764986dd3e7ee155.txt/MD5E-s459--11999fdb245d2931764986dd3e7ee155.txt

+ 1 - 1
datasets/childes_json_corpora/da.json

@@ -1 +1 @@
-../../.git/annex/objects/4W/Xg/MD5E-s2940316--507efe8e52bf5ce75f8df711d87d1f38.json/MD5E-s2940316--507efe8e52bf5ce75f8df711d87d1f38.json
+../../.git/annex/objects/pQ/K0/MD5E-s2929654--c67f2b9d013cecd89ad1f90d3f1042d1.json/MD5E-s2929654--c67f2b9d013cecd89ad1f90d3f1042d1.json

+ 1 - 1
datasets/childes_json_corpora/de.json

@@ -1 +1 @@
-../../.git/annex/objects/m8/K7/MD5E-s45738282--c42b1e618dc3371fc04798b8aec56033.json/MD5E-s45738282--c42b1e618dc3371fc04798b8aec56033.json
+../../.git/annex/objects/V1/pk/MD5E-s45122753--28c70804c58919db510891f7cd21e9ad.json/MD5E-s45122753--28c70804c58919db510891f7cd21e9ad.json

+ 1 - 1
datasets/childes_json_corpora/en.json

@@ -1 +1 @@
-../../.git/annex/objects/mM/Fk/MD5E-s32246267--53fa0ec80e98ef57b52100fa3e52a686.json/MD5E-s32246267--53fa0ec80e98ef57b52100fa3e52a686.json
+../../.git/annex/objects/3m/jj/MD5E-s32243800--1e4701d6cc0a5ce2362254232cdd1818.json/MD5E-s32243800--1e4701d6cc0a5ce2362254232cdd1818.json

+ 1 - 1
datasets/childes_json_corpora/es.json

@@ -1 +1 @@
-../../.git/annex/objects/qz/0X/MD5E-s7528611--1534de247b52108a71b2c32d24a2a07a.json/MD5E-s7528611--1534de247b52108a71b2c32d24a2a07a.json
+../../.git/annex/objects/8g/pZ/MD5E-s7526224--8c4754f17b5bfbb314384bcc8da75abb.json/MD5E-s7526224--8c4754f17b5bfbb314384bcc8da75abb.json

+ 1 - 1
datasets/childes_json_corpora/et.json

@@ -1 +1 @@
-../../.git/annex/objects/1x/Gv/MD5E-s8512506--8f79ccb462b01e1ca3ef1ff5ae5461cd.json/MD5E-s8512506--8f79ccb462b01e1ca3ef1ff5ae5461cd.json
+../../.git/annex/objects/6G/qj/MD5E-s8510895--eda1192d7270394f8c8f803cf32e98d8.json/MD5E-s8510895--eda1192d7270394f8c8f803cf32e98d8.json

+ 1 - 1
datasets/childes_json_corpora/eu.json

@@ -1 +1 @@
-../../.git/annex/objects/gw/pQ/MD5E-s1472131--5b9acb68d334a8e8682beb740356dc91.json/MD5E-s1472131--5b9acb68d334a8e8682beb740356dc91.json
+../../.git/annex/objects/Mj/k6/MD5E-s1471684--5a1181aa8b8978b87a9dc0c892037c7a.json/MD5E-s1471684--5a1181aa8b8978b87a9dc0c892037c7a.json

+ 1 - 1
datasets/childes_json_corpora/fr.json

@@ -1 +1 @@
-../../.git/annex/objects/jq/Vx/MD5E-s3234823--ca293387898ab63ac7c7794b873f9e0c.json/MD5E-s3234823--ca293387898ab63ac7c7794b873f9e0c.json
+../../.git/annex/objects/WJ/Fq/MD5E-s3229799--9ca02feed25b6e39d947530451144a3b.json/MD5E-s3229799--9ca02feed25b6e39d947530451144a3b.json

+ 1 - 1
datasets/childes_json_corpora/ja.json

@@ -1 +1 @@
-../../.git/annex/objects/Vk/mV/MD5E-s7591871--e2e5bc66db54eaf1b073a6d115d86a04.json/MD5E-s7591871--e2e5bc66db54eaf1b073a6d115d86a04.json
+../../.git/annex/objects/mM/fW/MD5E-s7576050--1345451c2cb335107bca1b2b76f787a6.json/MD5E-s7576050--1345451c2cb335107bca1b2b76f787a6.json

+ 1 - 1
datasets/childes_json_corpora/pl.json

@@ -1 +1 @@
-../../.git/annex/objects/WM/qg/MD5E-s16258378--858cf5b3b3eaa45e0146ba94778f0fe5.json/MD5E-s16258378--858cf5b3b3eaa45e0146ba94778f0fe5.json
+../../.git/annex/objects/W8/Pv/MD5E-s16205319--65fa716e7e1bf98973d28b6a0825ab90.json/MD5E-s16205319--65fa716e7e1bf98973d28b6a0825ab90.json

+ 1 - 1
datasets/childes_json_corpora/pt.json

@@ -1 +1 @@
-../../.git/annex/objects/Qf/kG/MD5E-s6161273--53ac1b2e1337d602434858fb01393f5b.json/MD5E-s6161273--53ac1b2e1337d602434858fb01393f5b.json
+../../.git/annex/objects/F0/WM/MD5E-s6160640--c494b016644486e985bdf66ee5026f24.json/MD5E-s6160640--c494b016644486e985bdf66ee5026f24.json

+ 1 - 1
datasets/childes_json_corpora/sr.json

@@ -1 +1 @@
-../../.git/annex/objects/0w/3x/MD5E-s7914319--f4e61920a973cda4ec9629c33959a501.json/MD5E-s7914319--f4e61920a973cda4ec9629c33959a501.json
+../../.git/annex/objects/jJ/FF/MD5E-s7911866--42b4530f71f2f27b6b3a8bae229d7f8d.json/MD5E-s7911866--42b4530f71f2f27b6b3a8bae229d7f8d.json

+ 1 - 1
datasets/childes_json_corpora/tr.json

@@ -1 +1 @@
-../../.git/annex/objects/ZG/V9/MD5E-s913674--2990a8cf6fd26ef03c0c28154606bbb6.json/MD5E-s913674--2990a8cf6fd26ef03c0c28154606bbb6.json
+../../.git/annex/objects/7J/KZ/MD5E-s913366--6c2544ff0dad5f838aee28aa3cc31287.json/MD5E-s913366--6c2544ff0dad5f838aee28aa3cc31287.json

+ 1 - 0
datasets/childes_one_utterance_per_line_files/da.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/81/8Z/MD5E-s4522557--f2af9f79413a10c65273a1b3e8ebdae5/MD5E-s4522557--f2af9f79413a10c65273a1b3e8ebdae5

+ 1 - 0
datasets/childes_one_utterance_per_line_files/de.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/VP/k7/MD5E-s48271457--1571db7b4153ac79aa881130cbb180c4/MD5E-s48271457--1571db7b4153ac79aa881130cbb180c4

+ 1 - 0
datasets/childes_one_utterance_per_line_files/en.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/03/Fm/MD5E-s39751512--db561b3c2b236c543d148d4be6c68302/MD5E-s39751512--db561b3c2b236c543d148d4be6c68302

+ 1 - 0
datasets/childes_one_utterance_per_line_files/es.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/68/37/MD5E-s13920227--df0a021631fc74f8b7a21ad2947a0e8f/MD5E-s13920227--df0a021631fc74f8b7a21ad2947a0e8f

+ 1 - 0
datasets/childes_one_utterance_per_line_files/et.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/wz/mq/MD5E-s12262395--56255fe3e6502be7117803bed1891236/MD5E-s12262395--56255fe3e6502be7117803bed1891236

+ 1 - 0
datasets/childes_one_utterance_per_line_files/eu.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/Km/XJ/MD5E-s2347275--3ff726a380f86ea7829070770d3a5add/MD5E-s2347275--3ff726a380f86ea7829070770d3a5add

+ 1 - 0
datasets/childes_one_utterance_per_line_files/fr.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/1J/VJ/MD5E-s4950748--53eb94bc964b4944c4786e0bc116aa12/MD5E-s4950748--53eb94bc964b4944c4786e0bc116aa12

+ 1 - 0
datasets/childes_one_utterance_per_line_files/ja.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/g0/q2/MD5E-s11677950--362fdb2c641231bac1a59772d4347984/MD5E-s11677950--362fdb2c641231bac1a59772d4347984

+ 1 - 0
datasets/childes_one_utterance_per_line_files/not_downloaded_data.txt

@@ -0,0 +1 @@
+../../.git/annex/objects/93/1P/MD5E-s103--3d2a6963fc888f7069784dc560979713.txt/MD5E-s103--3d2a6963fc888f7069784dc560979713.txt

+ 1 - 0
datasets/childes_one_utterance_per_line_files/pl.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/XK/Zj/MD5E-s15486053--8e85693f81dcd0d4432983c9d47e568a/MD5E-s15486053--8e85693f81dcd0d4432983c9d47e568a

+ 1 - 0
datasets/childes_one_utterance_per_line_files/pt.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/z1/7V/MD5E-s8140234--9401d864c5b1263e16f740b85ac1b3d9/MD5E-s8140234--9401d864c5b1263e16f740b85ac1b3d9

+ 1 - 0
datasets/childes_one_utterance_per_line_files/sr.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/MW/81/MD5E-s13343895--deccb5e7ad1ac130de88bd7bbb107ece/MD5E-s13343895--deccb5e7ad1ac130de88bd7bbb107ece

+ 1 - 0
datasets/childes_one_utterance_per_line_files/tr.one_utterance_per_line

@@ -0,0 +1 @@
+../../.git/annex/objects/Fw/F5/MD5E-s1229238--b544181cb4b319bb4ef87b348c91a9cb/MD5E-s1229238--b544181cb4b319bb4ef87b348c91a9cb

+ 1 - 0
estimated/da.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/5x/74/MD5E-s28028840--71fbf9fb169884d736da26c047e16f4e.arpa/MD5E-s28028840--71fbf9fb169884d736da26c047e16f4e.arpa

+ 1 - 0
estimated/de.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Z2/0W/MD5E-s22540364--11e64685c900b25e47a7c2a137dd7a9b.arpa/MD5E-s22540364--11e64685c900b25e47a7c2a137dd7a9b.arpa

+ 1 - 0
estimated/en.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/KG/5q/MD5E-s31436879--847b2a7d2e5210d87f638963a8764808.arpa/MD5E-s31436879--847b2a7d2e5210d87f638963a8764808.arpa

+ 1 - 0
estimated/es.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Zq/pj/MD5E-s10061705--b466f7fc80c31c74891f85256d324c43.arpa/MD5E-s10061705--b466f7fc80c31c74891f85256d324c43.arpa

+ 1 - 0
estimated/et.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/w4/9Q/MD5E-s18873182--89176dfdd746dd62fe277cf760489709.arpa/MD5E-s18873182--89176dfdd746dd62fe277cf760489709.arpa

+ 1 - 0
estimated/eu.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/vZ/2G/MD5E-s12176188--ae20d403fb51fef0b7572521b95d47a9.arpa/MD5E-s12176188--ae20d403fb51fef0b7572521b95d47a9.arpa

+ 1 - 0
estimated/fr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/QG/ff/MD5E-s20901089--1873e4fa871af748a4028e962a941b74.arpa/MD5E-s20901089--1873e4fa871af748a4028e962a941b74.arpa

+ 1 - 0
estimated/ja.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/6W/kM/MD5E-s8026445--d320df753b865052827e96c0be67e418.arpa/MD5E-s8026445--d320df753b865052827e96c0be67e418.arpa

+ 1 - 0
estimated/pl.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/5j/46/MD5E-s23833364--0c4492ab80d3c7f37ff923288dc88d80.arpa/MD5E-s23833364--0c4492ab80d3c7f37ff923288dc88d80.arpa

+ 1 - 0
estimated/pt.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/ZF/pz/MD5E-s22346672--1a9f56836b07f9a0d981e329ce47e1c9.arpa/MD5E-s22346672--1a9f56836b07f9a0d981e329ce47e1c9.arpa

+ 1 - 0
estimated/sr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/6M/xg/MD5E-s20755431--b4f26a89a36c9c4a61bb39a00c83c116.arpa/MD5E-s20755431--b4f26a89a36c9c4a61bb39a00c83c116.arpa

+ 1 - 0
estimated/tr.one_sentence_per_line.arpa

@@ -0,0 +1 @@
+../.git/annex/objects/Gf/70/MD5E-s18935056--4fe9ce073a5c9cb9e601fa1424524c3a.arpa/MD5E-s18935056--4fe9ce073a5c9cb9e601fa1424524c3a.arpa

+ 1 - 0
final_results_analysis.Rmd

@@ -0,0 +1 @@
+.git/annex/objects/QF/K4/MD5E-s8054--5b841a6350a21641fbf42de9283b83a3.Rmd/MD5E-s8054--5b841a6350a21641fbf42de9283b83a3.Rmd

+ 1 - 0
results/results.csv

@@ -0,0 +1 @@
+../.git/annex/objects/V1/Q3/MD5E-s10147494--3d57c9e2bb1e22146849572799f84041.csv/MD5E-s10147494--3d57c9e2bb1e22146849572799f84041.csv