1 year ago · 202ad44f37
--- a/code/__pycache__/make_noiser.cpython-310.pyc
+++ b/code/__pycache__/make_noiser.cpython-310.pyc
--- a/code/download_childes_corpora.py
+++ b/code/download_childes_corpora.py
@@ -246,7 +246,9 @@ if __name__ == "__main__" :
 
				     parser.add_argument("--markers_json",
			
 
				                         help="Json markers that serve for cleaning.",
			
 
				                         required=True)
			
 
				-    parser.add_argument("--phonemize_child", action=BooleanOptionalAction)
			
 
				+    parser.add_argument("--phonemize_child",
			
 
				+        help="Whether phonemize child utterances or not.",
			
 
				+        action=BooleanOptionalAction)
			
 
				     args = parser.parse_args()
			
 
				     phonemize_child_or_not = args.phonemize_child
			
 
				     yaml_file = args.yaml_file
			
--- a/code/download_opensubtitles_corpora.py
+++ b/code/download_opensubtitles_corpora.py
@@ -8,18 +8,18 @@
 
				     and only extract the extract the necessary number of sentences only on these chunks.
			
 
				 """
			
 
				 import os
			
 
				-from random import shuffle
			
 
				 import re
			
 
				+from typing import Iterator
			
 
				+import string
			
 
				+import random
			
 
				 from io import BytesIO
			
 
				+from random import shuffle
			
 
				 import gzip
			
 
				 import yaml
			
 
				 import requests
			
 
				 from tqdm import tqdm
			
 
				-import string
			
 
				 from phonemizer.backend import EspeakBackend
			
 
				 from phonemizer.separator import Separator
			
 
				-from typing import Iterator
			
 
				-import random
			
 
				 random.seed(80)
			
 
				 
			
 
				 class DownloadOpenSubtitlesData :
			
@@ -40,11 +40,11 @@ class DownloadOpenSubtitlesData :
 
				         self.base_url  = f"https://opus.nlpl.eu/download.php?f=OpenSubtitles/v{version}/mono/OpenSubtitles.raw."
			
 
				         self.separator = Separator(phone='$', word='@')
			
 
				         self.total_sents = 0
			
 
				-    
			
 
				+
			
 
				     def _remove_ponctuations(self, sentence: str) -> str :
			
 
				         """
			
 
				         Method that removes ponctuations from a given sentence.
			
 
				-        
			
 
				+
			
 
				         Parameters
			
 
				         ----------
			
 
				         - sent : str
			
@@ -60,20 +60,22 @@ class DownloadOpenSubtitlesData :
 
				     def _remove_brackets(self, sentence: str) -> str:
			
 
				         """
			
 
				         Method that removes brackets from a given sentence.
			
 
				-        
			
 
				+
			
 
				         Parameters
			
 
				         ----------
			
 
				         - sentence : str
			
 
				             The sentence for which brackets need to be removed.
			
 
				-        
			
 
				+
			
 
				         Returns
			
 
				         -------
			
 
				         - str :
			
 
				             The sentence without brackets.
			
 
				         """
			
 
				+        # Non-greedy match: deletes everything from a "(" or "[" up to the
			
 
				+        # nearest ")" or "]", brackets included. The closing bracket is not
			
 
				+        # required to be of the same kind as the opening one.
			
 
				         return re.sub(r"[\(\[].*?[\)\]]", "", sentence)
			
 
				-        
			
 
				-    def get_sentences(self, language: str, max_sents_to_download: int, chunk: int=128) -> Iterator[tuple]:
			
 
				+
			
 
				+    def get_sentences(self, language: str,
			
 
				+                        max_sents_to_download: int,
			
 
				+                        chunk: int=128) -> Iterator[tuple]:
			
 
				         """
			
 
				         Function for getting sentences from opensubtitles for a given language\
			
 
				         and a number of sentences.
			
@@ -84,10 +86,10 @@ class DownloadOpenSubtitlesData :
 
				             The language for which to retrieve the sentences.
			
 
				         - max_sents_to_process : str
			
 
				             The number of sentences to retrieve.
			
 
				-        
			
 
				+
			
 
				         Returns
			
 
				         -------
			
 
				-        - Iterator : 
			
 
				+        - Iterator :
			
 
				             Iterator over sentences and progressbars
			
 
				         """
			
 
				         # stream in order to not load all on memory
			
@@ -102,28 +104,37 @@ class DownloadOpenSubtitlesData :
 
				                 chunk = next(chunks)
			
 
				                 try :
			
 
				                     for sent in gzip.open(BytesIO(chunk), "rt") :
			
 
				-                        if self.total_sents >= max_sents_to_download : 
			
 
				+                        if self.total_sents >= max_sents_to_download :
			
 
				                             break
			
 
				                         else :
			
 
				                             yield sent, progress_bar
			
 
				                 except : # if exception, this means the chunk size is too small for gzip
			
 
				-                    print(f"The chunk size is to small for {max_sents_to_download} sentences to download")
			
 
				+                    print(f"The chunk size is to small for {max_sents_to_download}\
			
 
				+                            sentences to download")
			
 
				                     break
			
 
				-    
			
 
				-    def __call__(self, loaded_yaml_file, train_sentences, dev_sentences, chunk, out_dirname) -> None:
			
 
				+
			
 
				+    def __call__(self,
			
 
				+                loaded_yaml_file,
			
 
				+                train_sentences,
			
 
				+                dev_sentences,
			
 
				+                chunk,
			
 
				+                out_dirname) -> None:
			
 
				         """
			
 
				         Collect the sentences for all languages.
			
 
				 
			
 
				         Paramaters
			
 
				         ----------
			
 
				         - loaded_yaml_file : dict
			
 
				-            This dictionary contains all informations relevant for this study, for each language. \
			
 
				-            This dictionary also contains informations about espeak ids for the languages, and this is relevant
			
 
				-            for phonemization.
			
 
				+            This dictionary contains all informations relevant\
			
 
				+            for this study, for each language. This dictionary also\
			
 
				+            contains informations about espeak ids for the languages,\
			
 
				+            and this is relevant for phonemization.
			
 
				         - train_sentences : int
			
 
				-            Number of sentences to download for train corpora. This number is the same for all languages.
			
 
				+            Number of sentences to download for train corpora.\
			
 
				+            This number is the same for all languages.
			
 
				         - dev_sentences : int
			
 
				-            Number of sentences to download for dev corpora. This number is the same for all languages.
			
 
				+            Number of sentences to download for dev corpora.\
			
 
				+            This number is the same for all languages.
			
 
				         - out_dirname : str
			
 
				             The folder where the outputs will be saved.
			
 
				         """
			
@@ -154,7 +165,7 @@ class DownloadOpenSubtitlesData :
 
				             shuffle(added_sents)
			
 
				             train = added_sents[:train_sentences]
			
 
				             dev = added_sents[train_sentences:max_sents_to_download]
			
 
				-            for sent_train in train : 
			
 
				+            for sent_train in train :
			
 
				                 output_file_train.write(sent_train + "\n")
			
 
				             for sent_dev in dev :
			
 
				                 output_file_dev.write(sent_dev + "\n")
			
@@ -165,22 +176,24 @@ if __name__ == "__main__" :
 
				     parser = ArgumentParser()
			
 
				 
			
 
				     parser.add_argument("--yaml_file",
			
 
				-                        help="YAML File containing for each language, all relevant information for downloading the data.",
			
 
				+                        help="YAML File containing for each language,\
			
 
				+                            all relevant information for downloading the data.",
			
 
				                         required=True)
			
 
				     parser.add_argument("--out_dirname",
			
 
				                         help="The directory where outputs will be stored.",
			
 
				                         required=True)
			
 
				     parser.add_argument("--chunk",
			
 
				-                        help="For the chunk size. This number should grow as much as you want to download many sentences.\
			
 
				+                        help="For the chunk size. This number should\
			
 
				+                            grow as much as you want to download many sentences.\
			
 
				                             256 is a good number when you want to get 1_000_000 or less sentences",
			
 
				                         default=1024,
			
 
				                         required=False)
			
 
				     parser.add_argument("--train_sentences",
			
 
				-                        help="Number of sent for the train corpora.",
			
 
				+                        help="Number of sent for the training corpora.",
			
 
				                         default=200_000,
			
 
				                         required=False)
			
 
				     parser.add_argument("--dev_sentences",
			
 
				-                        help="Number of sent for the dev copora.",
			
 
				+                        help="Number of sent for the dev or test copora.",
			
 
				                         default=10_000,
			
 
				                         required=False)
			
 
				     args = parser.parse_args()
			
@@ -196,5 +209,8 @@ if __name__ == "__main__" :
 
				         os.makedirs(f"{out_dirname}/tokenized_in_phonemes_dev")
			
 
				     languages_to_download_informations = yaml.safe_load(open(args.yaml_file))
			
 
				     downloader = DownloadOpenSubtitlesData()
			
 
				-    downloader(languages_to_download_informations, args.train_sentences, args.dev_sentences, chunk, out_dirname)
			
 
				-
			
 
				+    downloader(languages_to_download_informations,
			
 
				+                args.train_sentences,
			
 
				+                args.dev_sentences,
			
 
				+                chunk,
			
 
				+                out_dirname)
			
--- a/code/evaluate_language_models.py
+++ b/code/evaluate_language_models.py
@@ -0,0 +1,85 @@
 
				+"""This module implements a function that
			
 
				+evaluates the trained language models."""
			
 
				+import os
			
 
				+from math import log
			
 
				+import random
			
 
				+import pandas as pd
			
 
				+import kenlm
			
 
				+random.seed(1023)
			
 
				+
			
 
				+# Maps the language code found before the dot in each training file name
			
 
				+# to a (language name, typology label) pair.
			
 
				+LANGUAGES_TYPOLOGIES = {
			
 
				+    'da' : ("Danish", "fusional"),
			
 
				+    'de' : ("German", "fusional"),
			
 
				+    'en' : ("English", "fusional"),
			
 
				+    'es' : ("Spanish", "fusional"),
			
 
				+    'et' : ("Estonian", "agglutinative"),
			
 
				+    'eu' : ("Basque", "agglutinative"),
			
 
				+    'fr' : ("French", "fusional"),
			
 
				+    'ja' : ("Japanese", "agglutinative"),
			
 
				+    'pl' : ("Polish", "fusional"),
			
 
				+    'pt' : ("Portuguese", "fusional"),
			
 
				+    'sr' : ("Serbian", "fusional"),
			
 
				+    'tr' : ("Turkish", "agglutinative")}
			
 
				+
			
 
				+def evaluate(train_files_directory: str,
			
 
				+                dev_files_directory: str,
			
 
				+                models_directory: str) -> pd.DataFrame:
			
 
				+    """
			
 
				+    Compute the train and dev entropies of every language model.
			
 
				+
			
 
				+    The three directories are listed in sorted order and read in
			
 
				+    lockstep, so the n-th train file, dev file and model are assumed
			
 
				+    to belong to the same language.
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    - train_files_directory: str
			
 
				+        The path to the directory containing training files.
			
 
				+    - dev_files_directory: str
			
 
				+        The path to the directory containing testing/development files.
			
 
				+    - models_directory: str
			
 
				+        The path to the directory containing the trained
			
 
				+        language models.
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+    - pd.DataFrame :
			
 
				+        One row per language with the columns "language",
			
 
				+        "train_entropy" and "dev_entropy". Each entropy is the natural
			
 
				+        logarithm of the perplexity returned by the model.
			
 
				+    """
			
 
				+    def read_sentences(path: str) -> str:
			
 
				+        # Close each corpus file as soon as it has been read instead of
			
 
				+        # leaking one open handle per file.
			
 
				+        with open(path) as sentences:
			
 
				+            return "\n".join(sent.strip() for sent in sentences)
			
 
				+
			
 
				+    columns = ["language", "train_entropy", "dev_entropy"]
			
 
				+    triplets_files_model = zip(sorted(os.listdir(train_files_directory)),
			
 
				+                                sorted(os.listdir(dev_files_directory)),
			
 
				+                                sorted(os.listdir(models_directory)))
			
 
				+    rows = []
			
 
				+    for train_filename, dev_filename, model_filename in triplets_files_model :
			
 
				+        # The language code is the part of the file name before the dot.
			
 
				+        language, _ = train_filename.split(".")
			
 
				+        model = kenlm.Model(f"{models_directory}/{model_filename}")
			
 
				+        train_sents = read_sentences(f"{train_files_directory}/{train_filename}")
			
 
				+        dev_sents = read_sentences(f"{dev_files_directory}/{dev_filename}")
			
 
				+        rows.append({
			
 
				+            "language" : LANGUAGES_TYPOLOGIES[language][0],
			
 
				+            "train_entropy" : log(model.perplexity(train_sents)),
			
 
				+            "dev_entropy" : log(model.perplexity(dev_sents))
			
 
				+        })
			
 
				+    # DataFrame.append was removed in pandas 2.0, so the rows are collected
			
 
				+    # first and the frame is built once.
			
 
				+    return pd.DataFrame(rows, columns=columns)
			
 
				+if __name__ == "__main__":
			
 
				+    import argparse
			
 
				+
			
 
				+    # Command line: the three directories that evaluate() reads in lockstep.
			
 
				+    parser = argparse.ArgumentParser()
			
 
				+    parser.add_argument('--train_files_directory',
			
 
				+        required=True,
			
 
				+        help="The directory containing the OpenSubtitles training files"
			
 
				+        )
			
 
				+    parser.add_argument('--dev_files_directory',
			
 
				+        required=True,
			
 
				+        help="The directory containing the OpenSubtitles test files"
			
 
				+        )
			
 
				+    parser.add_argument('--models_directory',
			
 
				+        required=True,
			
 
				+        help="The directory containing the trained language models"
			
 
				+        )
			
 
				+
			
 
				+    args = parser.parse_args()
			
 
				+    # exist_ok avoids the check-then-create race of os.path.exists.
			
 
				+    os.makedirs("results", exist_ok=True)
			
 
				+    evaluation = evaluate(args.train_files_directory,
			
 
				+                            args.dev_files_directory,
			
 
				+                            args.models_directory)
			
 
				+    # The table is always written to the same file in the "results" folder.
			
 
				+    evaluation.to_csv("results/evaluation.csv")
			
--- a/code/make_noiser.py
+++ b/code/make_noiser.py
@@ -9,32 +9,38 @@ random.seed(80)
 
				 
			
 
				 class Noise :
			
 
				     """
			
 
				-    This class simulate noise in the data. Crucially, noise can be made on three points :\
			
 
				-    (1) The noise of phonemes order of a given sequence by making the order of the sequence more aribitrary,\
			
 
				-    (2) Replacement of some phonemes of a given sequence by arbitrary sampled phonemes from a vocabulary and\
			
 
				+    This class simulate noise in the data. Crucially,\
			
 
				+    noise can be made on three points :\
			
 
				+    (1) The noise of phonemes order of a given sequence\
			
 
				+    by making the order of the sequence more aribitrary,\
			
 
				+    (2) Replacement of some phonemes of a given sequence\
			
 
				+    by arbitrary sampled phonemes from a vocabulary and\
			
 
				     (3) By arbitrary interverting some sequences of two different speakers.
			
 
				 
			
 
				     Atributes
			
 
				     ---------
			
 
				     - phonemes_order_noise :
			
 
				-        Parameter for controling the degree of noise at the level of phonemes order. See the point 1 mentioned above.
			
 
				+        Parameter for controling the degree of noise at the level\
			
 
				+        of phonemes order. See the point 1 mentioned above.
			
 
				     - speakers_noise :
			
 
				-        Parameters for controling the degree of noise at the level of speakers. See the point 3 mentioned above.
			
 
				+        Parameters for controling the degree of noise at the level\
			
 
				+        of speakers. See the point 3 mentioned above.
			
 
				     - phonemes_noise :
			
 
				-        Parameter for controling the degree of noise at the level of phonemes. See the point 2 mentioned above.
			
 
				+        Parameter for controling the degree of noise at the level of phonemes.
			
 
				+        See the point 2 mentioned above.
			
 
				     """
			
 
				 
			
 
				     def __init__(self,
			
 
				                     most_probable_phonemes: list,
			
 
				-                    phonemes_order_noise=0.3,
			
 
				-                    speakers_noise=(0.5, 0.5),
			
 
				-                    phonemes_noise=0.5) :
			
 
				+                    phonemes_order_noise_value=0.3,
			
 
				+                    speakers_noise_values=(0.5, 0.5),
			
 
				+                    phonemes_noise_value=0.5) :
			
 
				         self.most_probable_phonemes = most_probable_phonemes
			
 
				-        self.phonemes_order_noise = phonemes_order_noise
			
 
				-        self.speakers_noise = speakers_noise
			
 
				-        self.phonemes_noise = phonemes_noise
			
 
				-    
			
 
				-    def _order_noise(self, sequence: List[str]) -> str :
			
 
				+        self.phonemes_order_noise_value = phonemes_order_noise_value
			
 
				+        self.speakers_noise_values = speakers_noise_values
			
 
				+        self.phonemes_noise_value = phonemes_noise_value
			
 
				+
			
 
				+    def order_noise(self, sequence: List[str]) -> str :
			
 
				         """
			
 
				         Making noise the order of the phonemes in a given sequence
			
 
				 
			
@@ -42,24 +48,24 @@ class Noise :
 
				         ----------
			
 
				         - sequence : list
			
 
				             The sequence for which the phonemes order must be noised.
			
 
				-        
			
 
				+
			
 
				         Returns
			
 
				         -------
			
 
				         - str :
			
 
				             The sequence with the order of phonemes noised.
			
 
				         """
			
 
				         # number of phonemes to noise in the sequence = len(sequence) / nb_phonemes_to_noise
			
 
				-        phonemes_to_noise = round(len(sequence) * self.phonemes_order_noise)
			
 
				+        phonemes_to_noise = round(len(sequence) * self.phonemes_order_noise_value)
			
 
				         # sample nb_phonemes_to_noise positions in the sequence
			
 
				         positions_sampled = list(sample(range(len(sequence)), k=phonemes_to_noise))
			
 
				         copied_positions = copy.deepcopy(positions_sampled)
			
 
				         shuffle(copied_positions)
			
 
				-        # change the positions of the sampled phonemes 
			
 
				+        # change the positions of the sampled phonemes
			
 
				         for original_position, new_position in zip(positions_sampled, copied_positions):
			
 
				             sequence[original_position] = sequence[new_position]
			
 
				         return " ".join(sequence)
			
 
				 
			
 
				-    def _phonemes_noise(self, sequence: List[str]) -> str :
			
 
				+    def phonemes_noise(self, sequence: List[str]) -> str :
			
 
				         """
			
 
				         Makinng noise the phonemes of the sequence by replacing\
			
 
				         some phonemes of the sequence by arbitrary sampled phonemes\
			
@@ -69,13 +75,14 @@ class Noise :
 
				         ----------
			
 
				         - sequence : list
			
 
				             The sequence for which the phonemes must be noised.
			
 
				-        
			
 
				+
			
 
				         Returns
			
 
				         -------
			
 
				         - str :
			
 
				             The sequence with noised phonemes.
			
 
				         """
			
 
				-        phonemes_to_noise = round(len(sequence) * self.phonemes_noise)
			
 
				+        phonemes_to_noise = round(len(sequence) * self.phonemes_noise_value)
			
 
				+        assert phonemes_to_noise < len(sequence), "Number of phoneme to noise greather that sequence's length"
			
 
				         indexes = choices(range(len(sequence)), k=phonemes_to_noise)
			
 
				         # choose new phonemes only from the most probable phonemes.
			
 
				         phonemes = choices(self.most_probable_phonemes, k=phonemes_to_noise)
			
@@ -83,8 +90,8 @@ class Noise :
 
				         for idx, phonemes in zip(indexes, phonemes) :
			
 
				             sequence[idx] = phonemes
			
 
				         return " ".join(sequence)
			
 
				-    
			
 
				-    def _speakers_noise(self, speakers_sequences: Dict[str, set]) -> Dict[str, set] :
			
 
				+
			
 
				+    def speakers_noise(self, speakers_sequences: Dict[str, set]) -> Dict[str, set] :
			
 
				         """
			
 
				         Making noise in the speaker's statements.
			
 
				 
			
@@ -92,14 +99,15 @@ class Noise :
 
				         ----------
			
 
				         - speakers_sequences : dict
			
 
				             Dictionary containing the utterances for each speaker.
			
 
				-        
			
 
				+
			
 
				         Returns
			
 
				         -------
			
 
				         - dict :
			
 
				-            The dictionary containing the few statements interchanged between the two speakers.
			
 
				+            The dictionary containing the few statements\
			
 
				+            interchanged between the two speakers.
			
 
				         """
			
 
				         first_speaker, second_speaker = "Target_Child", "Adult"
			
 
				-        noise_first_speaker, noise_second_speaker = self.speakers_noise
			
 
				+        noise_first_speaker, noise_second_speaker = self.speakers_noise_values
			
 
				         speakers_sequences[second_speaker] = set(speakers_sequences[second_speaker])
			
 
				         speakers_sequences[first_speaker] = set(speakers_sequences[first_speaker])
			
 
				         # sample some percentage of utterances from each speaker
			
@@ -115,9 +123,9 @@ class Noise :
 
				         # set to list
			
 
				         speakers_sequences[first_speaker] = list(speakers_sequences[first_speaker])
			
 
				         speakers_sequences[second_speaker] = list(speakers_sequences[second_speaker])
			
 
				-        
			
 
				+
			
 
				         return speakers_sequences
			
 
				-    
			
 
				+
			
 
				     def __call__(self, loaded_dataset: dict) -> dict:
			
 
				         """
			
 
				         Apply the three types of noise.
			
@@ -125,8 +133,9 @@ class Noise :
 
				         Parameters
			
 
				         ----------
			
 
				         loaded_dataset : dict
			
 
				-            The dictionary containing the utterances for each family, at each and for each speaker.
			
 
				-        
			
 
				+            The dictionary containing the utterances for each family,\
			
 
				+            at each and for each speaker.
			
 
				+
			
 
				         Returns
			
 
				         -------
			
 
				         dict :
			
@@ -137,15 +146,17 @@ class Noise :
 
				                 if "Adult" in loaded_dataset[family][age] :
			
 
				                     for idx, utterance in enumerate(loaded_dataset[family][age]["Adult"]) :
			
 
				                         splitted_utterance = utterance.split(" ")
			
 
				-                        loaded_dataset[family][age]["Adult"][idx] = self._order_noise(splitted_utterance)
			
 
				-                        loaded_dataset[family][age]["Adult"][idx] = self._phonemes_noise(splitted_utterance)
			
 
				+                        loaded_dataset[family][age]["Adult"][idx] = self.order_noise(splitted_utterance)
			
 
				+                        loaded_dataset[family][age]["Adult"][idx] = self.phonemes_noise(splitted_utterance)
			
 
				                 if "Target_Child" in loaded_dataset[family][age] :
			
 
				                     for idx, utterance in enumerate(loaded_dataset[family][age]["Target_Child"]) :
			
 
				                         splitted_utterance = utterance.split(" ")
			
 
				-                        loaded_dataset[family][age]["Target_Child"][idx] = self._order_noise(splitted_utterance)
			
 
				-                        loaded_dataset[family][age]["Target_Child"][idx] = self._phonemes_noise(splitted_utterance)
			
 
				-                if "Target_Child" in loaded_dataset[family][age] and "Adult" in loaded_dataset[family][age] and all(self.speakers_noise):
			
 
				-                    noised_speaker = self._speakers_noise(loaded_dataset[family][age])
			
 
				+                        loaded_dataset[family][age]["Target_Child"][idx] = self.order_noise(splitted_utterance)
			
 
				+                        loaded_dataset[family][age]["Target_Child"][idx] = self.phonemes_noise(splitted_utterance)
			
 
				+                if("Target_Child" in loaded_dataset[family][age]
			
 
				+                    and "Adult" in loaded_dataset[family][age]
			
 
				+                    and all(self.speakers_noise_values)):
			
 
				+                    noised_speaker = self.speakers_noise(loaded_dataset[family][age])
			
 
				                     loaded_dataset[family][age]["Target_Child"] = noised_speaker["Target_Child"]
			
 
				                     loaded_dataset[family][age]["Adult"] = noised_speaker["Adult"]
			
 
				-        return loaded_dataset
			
 
				+        return loaded_dataset
			
--- a/code/test_on_all_languages.py
+++ b/code/test_on_all_languages.py
@@ -1,20 +1,14 @@
 
				 import os
			
 
				-import sys
			
 
				+import random
			
 
				 import json
			
 
				-sys.path.append("./")
			
 
				-sys.path.append("../")
			
 
				-sys.path.append(".../")
			
 
				+from math import log
			
 
				+from typing import Iterable
			
 
				 from itertools import product
			
 
				 from tqdm import tqdm
			
 
				 import kenlm
			
 
				-from math import log
			
 
				-import numpy as np
			
 
				 from make_noiser import Noise
			
 
				 import pandas as pd
			
 
				-import sys
			
 
				 from get_most_probable_phonemes import get_most_probable_phonemes
			
 
				-import random
			
 
				-from collections import Counter
			
 
				 random.seed(1023)
			
 
				 
			
 
				 
			
@@ -26,29 +20,29 @@ LANGUAGES_TYPOLOGIES = {
 
				     'et' : ("Estonian", "agglutinative"),
			
 
				     'eu' : ("Basque", "agglutinative"),
			
 
				     'fr' : ("French", "fusional"),
			
 
				-    'ja' : ("Japanese", "agglutinative"), 
			
 
				-    'pl' : ("Polish", "fusional"), 
			
 
				-    'pt' : ("Portuguese", "fusional"), 
			
 
				-    'sr' : ("Serbian", "fusional"), 
			
 
				+    'ja' : ("Japanese", "agglutinative"),
			
 
				+    'pl' : ("Polish", "fusional"),
			
 
				+    'pt' : ("Portuguese", "fusional"),
			
 
				+    'sr' : ("Serbian", "fusional"),
			
 
				     'tr' : ("Turkish", "agglutinative")}
			
 
				 
			
 
				-def compute_word_frequencies(word_train_corpus, pct=0.95) :
			
 
				-    frequencies = Counter()
			
 
				-    for line in word_train_corpus :
			
 
				-        line = line.strip()
			
 
				-        if not line : continue
			
 
				-        # line = line.strip()
			
 
				-        frequencies.update(Counter(line.split(" ")))
			
 
				-    return dict(frequencies)
			
 
				-
			
 
				-
			
 
				-def statistics_word(utterances, word_frequencies, model) :
			
 
				+def statistics_word(utterances: list, model: kenlm.Model) -> dict:
			
 
				+    """
			
 
				+    This function will test a given language model\
			
 
				+    on a given list of utterances.\
			
 
				+    The function will also compute some statistics; MLU, TTR, etc
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    - model
			
 
				+        The estimated language model
			
 
				+    - utterances: list
			
 
				+        The utterances to test
			
 
				+    """
			
 
				     phoneme_utterances = []
			
 
				     unique_words = set()
			
 
				-    nb_unk = 0
			
 
				     mlu_w = 0.0
			
 
				     mlu_p = 0.0
			
 
				-    mean_word_frequencies = 0
			
 
				     nb_utterances = 0
			
 
				     nb_words = 0
			
 
				 
			
@@ -68,13 +62,6 @@ def statistics_word(utterances, word_frequencies, model) :
 
				         nb_words += len(utterance_words)
			
 
				         unique_words |= set(utterance_words)
			
 
				 
			
 
				-        for word in utterance_words :
			
 
				-            word = word.strip()
			
 
				-            if word in word_frequencies :
			
 
				-                mean_word_frequencies += word_frequencies[word]
			
 
				-            else : 
			
 
				-                nb_unk += 1
			
 
				-    
			
 
				     mlu_w /= nb_utterances
			
 
				     mlu_p /= nb_utterances
			
 
				     ttr_w = len(unique_words) / nb_words
			
@@ -87,60 +74,62 @@ def statistics_word(utterances, word_frequencies, model) :
 
				     statistics["mlu_w"] = mlu_w
			
 
				     statistics["mlu_p"] = mlu_p
			
 
				     statistics["ttr_w"] = ttr_w
			
 
				-    statistics["mean_word_frequencies"] = mean_word_frequencies
			
 
				-    statistics["nb_unk"] = nb_unk
			
 
				 
			
 
				     return statistics
			
 
				 
			
 
				-def create_sparse_combinantions(values) :
			
 
				+def create_sparse_combinantions(values: Iterable) -> set:
			
 
				+    """
			
 
				+    Create the sparse combinations of noise values used for noising.
			
 
				+
			
 
				+    For every value and every position, one tuple is built that holds
			
 
				+    that value at that position and 0.0 everywhere else. Each tuple has
			
 
				+    as many entries as there are values, so the four values passed by
			
 
				+    test() give tuples of four entries. Duplicates (e.g. the all-zero
			
 
				+    tuple produced by the value 0.0) collapse because a set is returned.
			
 
				+
			
 
				+    test() unpacks each tuple as (phonemes_noise, speakers_noise_child,
			
 
				+    speakers_noise_adult, phonemes_order_noise). For example,
			
 
				+    (0.0, 0.0, 0.0, 0.25) sets only the last of these to 0.25 and
			
 
				+    leaves the other three at 0.0.
			
 
				+    NOTE(review): values must support len(), which the Iterable
			
 
				+    annotation does not guarantee -- confirm the intended type.
			
 
				+    See the file make_noiser.py for more information.
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    - values: Iterable
			
 
				+        The noise values to place, one at a time, at each position.
			
 
				+
			
 
				+    Returns
			
 
				+    -------
			
 
				+    - set :
			
 
				+        The distinct tuples with at most one non-zero entry.
			
 
				+    """
			
 
				     sparse_combinantions = []
			
 
				     for value in values :
			
 
				-        for idx in range(len(values)) : 
			
 
				+        for idx in range(len(values)) :
			
 
				             sparse_values = [0.0] * len(values)
			
 
				             sparse_values[idx] = value
			
 
				             sparse_combinantions.append(tuple(sparse_values))
			
 
				     return set(sparse_combinantions)
			
 
				 
			
 
				-def test(json_files_directory, models_directory, phoneme_train_files, word_train_files, add_noise=False) :
			
 
				+def test(json_files_directory, models_directory, train_files, add_noise=True) :
			
 
				     """
			
 
				+    This function will test the language models on CHILDES corpora
			
 
				     """
			
 
				     columns = ["language", "typology", "family", "speaker",\
			
 
				                 "age", "perplexity", "entropy", "mlu", "mlu_without_repetition",\
			
 
				                 "phonemes_order_noise", "speakers_noise_adult",\
			
 
				                 "speakers_noise_child", "phonemes_noise"]
			
 
				     results = pd.DataFrame(columns=columns, index=None)
			
 
				-    all_combinations = list(product((0.0, 0.25, 0.5, 0.75), repeat=4)) if add_noise else [((0.0, 0.0, 0.0, 0.0))]
			
 
				-    # sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
			
 
				+    # all_combinations = (list(product((0.0, 0.25, 0.5, 0.75), repeat=4))
			
 
				+    #                       if add_noise else [((0.0, 0.0, 0.0, 0.0))])
			
 
				+    sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
			
 
				     # noise_values = np.linspace(0.0, 1.0, num=6)
			
 
				-    for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(all_combinations, total=len(all_combinations)) :
			
 
				+    for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
			
 
				         for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
			
 
				             lg_iso, _ = test_filename.split(".")
			
 
				             model_lg = model_filename.split(".")[0]
			
 
				-            if lg_iso != model_lg : continue
			
 
				-            print(lg_iso, model_lg)
			
 
				-            most_probable_phonemes = get_most_probable_phonemes(f"{phoneme_train_files}/{lg_iso}.one_sentence_per_line")
			
 
				-            word_frequencies = compute_word_frequencies(f"{word_train_files}/{lg_iso}.one_sentence_per_line")
			
 
				+            if lg_iso != model_lg :
			
 
				+                continue
			
 
				+            most_probable_phonemes = get_most_probable_phonemes(f"{train_files}/{lg_iso}.one_sentence_per_line")
			
 
				             loaded_json = json.load(open(f"{json_files_directory}/{test_filename}"))
			
 
				             if add_noise :
			
 
				                 noise = Noise(most_probable_phonemes,
			
 
				-                                phonemes_order_noise=phonemes_order_noise,
			
 
				-                                speakers_noise=(speakers_noise_child, speakers_noise_adult),
			
 
				-                                phonemes_noise=phonemes_noise)
			
 
				+                                phonemes_order_noise_value=phonemes_order_noise,
			
 
				+                                speakers_noise_values=(speakers_noise_child, speakers_noise_adult),
			
 
				+                                phonemes_noise_value=phonemes_noise)
			
 
				                 loaded_json = noise(loaded_json)
			
 
				             model = kenlm.Model(f"{models_directory}/{model_filename}")
			
 
				             for family in loaded_json :
			
 
				-                for age in loaded_json[family] : 
			
 
				+                for age in loaded_json[family] :
			
 
				                     if age == "None" : print(family, lg_iso, age); continue
			
 
				                     for speaker in loaded_json[family][age] :
			
 
				                         if speaker not in ["Adult", "Target_Child"] : continue
			
 
				-                        # test_utterances = "\n".join(loaded_json[family][age][speaker])
			
 
				-                        # utterances = [utterance.split(" ") for utterance in loaded_json[family][age][speaker]]
			
 
				-                        # mlu = np.mean([len(utterance) for utterance in utterances])
			
 
				-                        # mlu_without_repetition = np.mean([len(set(utterance)) for utterance in utterances])
			
 
				-                        # ppl = model.perplexity(test_utterances)
			
 
				-                        # entropy = log(ppl)
			
 
				-
			
 
				-                        results_statistics = statistics_word(loaded_json[family][age][speaker], word_frequencies, model)
			
 
				+                        results_statistics = statistics_word(loaded_json[family][age][speaker], model)
			
 
				                         language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
			
 
				                         new_row =  {"language" : language,
			
 
				                                     "typology" : typology,
			
@@ -152,8 +141,6 @@ def test(json_files_directory, models_directory, phoneme_train_files, word_train
 
				                                     "mlu_w" : results_statistics["mlu_w"],
			
 
				                                     "mlu_p" : results_statistics["mlu_p"],
			
 
				                                     "ttr_w" : results_statistics["ttr_w"],
			
 
				-                                    "mean_word_frequencies" : results_statistics["mean_word_frequencies"],
			
 
				-                                    "nb_unk" : results_statistics["nb_unk"],
			
 
				                                     "phonemes_order_noise" : phonemes_order_noise,
			
 
				                                     "speakers_noise_adult" : speakers_noise_adult,
			
 
				                                     "speakers_noise_child" : speakers_noise_child,
			
@@ -164,46 +151,33 @@ if __name__ == "__main__":
 
				     from argparse import ArgumentParser, BooleanOptionalAction
			
 
				 
			
 
				     parser = ArgumentParser()
			
 
				-    parser.add_argument('--phoneme_train_directory',
			
 
				-        required=True,
			
 
				-        help="Dataset containing the train files in phonemes (dot one_sentence_per_line) "
			
 
				-        )
			
 
				-    parser.add_argument('--word_train_directory',
			
 
				+    parser.add_argument('--train_directory',
			
 
				         required=True,
			
 
				-        help="Dataset containing the train files in words (dot one_sentence_per_line) "
			
 
				+        help="The directory containing the train files tokenized in phonemes."
			
 
				         )
			
 
				-    parser.add_argument('--models_directory', 
			
 
				+    parser.add_argument('--models_directory',
			
 
				         required=True,
			
 
				-        help="Folder containing the estimated parameters"
			
 
				+        help="The directory containing the trained language models."
			
 
				         )
			
 
				-    
			
 
				+
			
 
				     parser.add_argument('--json_files_directory',
			
 
				         required=True,
			
 
				-        help="Directory containing json files for test"
			
 
				+        help="The directory containing CHILDES utterances in json format for each language"
			
 
				         )
			
 
				-    
			
 
				-    parser.add_argument('--out_dirname',
			
 
				-        required=True,
			
 
				-        help="Out directory"
			
 
				-        )
			
 
				-    parser.add_argument('--out_filename',
			
 
				-            required=True,
			
 
				-            help="Out filename"
			
 
				-            )
			
 
				-    parser.add_argument("--add_noise", action=BooleanOptionalAction)
			
 
				+
			
 
				+    parser.add_argument("--add_noise",
			
 
				+        help="Whether noise the CHILDES utterances or not",
			
 
				+        action=BooleanOptionalAction)
			
 
				 
			
 
				     args = parser.parse_args()
			
 
				     add_noise = args.add_noise
			
 
				     json_files_directory = args.json_files_directory
			
 
				-    phoneme_train_files, word_train_files = args.phoneme_train_directory, args.word_train_directory
			
 
				+    phoneme_train_files = args.train_directory
			
 
				     models_directory = args.models_directory
			
 
				-    out_dirname = args.out_dirname
			
 
				-    out_filename = args.out_filename
			
 
				 
			
 
				     if not os.path.exists("results"):
			
 
				         os.makedirs("results")
			
 
				-    test(json_files_directory, 
			
 
				-        models_directory, 
			
 
				-        phoneme_train_files,
			
 
				-        word_train_files, 
			
 
				-        add_noise).to_csv(f"{out_dirname}/{out_filename}.csv")
			
 
				+    test(json_files_directory,
			
 
				+            models_directory,
			
 
				+            phoneme_train_files,
			
 
				+            add_noise=add_noise).to_csv("results/results.csv")
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/Xf/J3/MD5E-s15872283--589ddd6867dae20a40048aaed160f751/MD5E-s15872283--589ddd6867dae20a40048aaed160f751
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/PK/VG/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/k5/41/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/Vj/Zq/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/kK/m3/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/Z5/WJ/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/xQ/Mz/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/Fx/P8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/8k/jP/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/pq/Gx/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/VK/59/MD5E-s15029711--76eb34c274a6e5ab100575ce2fa60ab0/MD5E-s15029711--76eb34c274a6e5ab100575ce2fa60ab0
			
--- a/datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line
+++ b/datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line
@@ -1 +0,0 @@
 
				-../../../.git/annex/objects/FM/Vk/MD5E-s22131015--b159f0714684e9fc60945904418a1240/MD5E-s22131015--b159f0714684e9fc60945904418a1240
			
--- a/environment.yml
+++ b/environment.yml
@@ -0,0 +1 @@
 
				+.git/annex/objects/V8/56/MD5E-s5478--6ce1a1dfc33f3c2aee2a0c4f0f11aa02.yml/MD5E-s5478--6ce1a1dfc33f3c2aee2a0c4f0f11aa02.yml
			
--- a/estimated/da.one_sentence_per_line.arpa
+++ b/estimated/da.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/5x/74/MD5E-s28028840--71fbf9fb169884d736da26c047e16f4e.arpa/MD5E-s28028840--71fbf9fb169884d736da26c047e16f4e.arpa
			
--- a/estimated/de.one_sentence_per_line.arpa
+++ b/estimated/de.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/Z2/0W/MD5E-s22540364--11e64685c900b25e47a7c2a137dd7a9b.arpa/MD5E-s22540364--11e64685c900b25e47a7c2a137dd7a9b.arpa
			
--- a/estimated/en.one_sentence_per_line.arpa
+++ b/estimated/en.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/KG/5q/MD5E-s31436879--847b2a7d2e5210d87f638963a8764808.arpa/MD5E-s31436879--847b2a7d2e5210d87f638963a8764808.arpa
			
--- a/estimated/es.one_sentence_per_line.arpa
+++ b/estimated/es.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/Zq/pj/MD5E-s10061705--b466f7fc80c31c74891f85256d324c43.arpa/MD5E-s10061705--b466f7fc80c31c74891f85256d324c43.arpa
			
--- a/estimated/et.one_sentence_per_line.arpa
+++ b/estimated/et.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/w4/9Q/MD5E-s18873182--89176dfdd746dd62fe277cf760489709.arpa/MD5E-s18873182--89176dfdd746dd62fe277cf760489709.arpa
			
--- a/estimated/eu.one_sentence_per_line.arpa
+++ b/estimated/eu.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/vZ/2G/MD5E-s12176188--ae20d403fb51fef0b7572521b95d47a9.arpa/MD5E-s12176188--ae20d403fb51fef0b7572521b95d47a9.arpa
			
--- a/estimated/fr.one_sentence_per_line.arpa
+++ b/estimated/fr.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/QG/ff/MD5E-s20901089--1873e4fa871af748a4028e962a941b74.arpa/MD5E-s20901089--1873e4fa871af748a4028e962a941b74.arpa
			
--- a/estimated/ja.one_sentence_per_line.arpa
+++ b/estimated/ja.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/6W/kM/MD5E-s8026445--d320df753b865052827e96c0be67e418.arpa/MD5E-s8026445--d320df753b865052827e96c0be67e418.arpa
			
--- a/estimated/pl.one_sentence_per_line.arpa
+++ b/estimated/pl.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/5j/46/MD5E-s23833364--0c4492ab80d3c7f37ff923288dc88d80.arpa/MD5E-s23833364--0c4492ab80d3c7f37ff923288dc88d80.arpa
			
--- a/estimated/pt.one_sentence_per_line.arpa
+++ b/estimated/pt.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/ZF/pz/MD5E-s22346672--1a9f56836b07f9a0d981e329ce47e1c9.arpa/MD5E-s22346672--1a9f56836b07f9a0d981e329ce47e1c9.arpa
			
--- a/estimated/sr.one_sentence_per_line.arpa
+++ b/estimated/sr.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/6M/xg/MD5E-s20755431--b4f26a89a36c9c4a61bb39a00c83c116.arpa/MD5E-s20755431--b4f26a89a36c9c4a61bb39a00c83c116.arpa
			
--- a/estimated/tr.one_sentence_per_line.arpa
+++ b/estimated/tr.one_sentence_per_line.arpa
@@ -0,0 +1 @@
 
				+../.git/annex/objects/Gf/70/MD5E-s18935056--4fe9ce073a5c9cb9e601fa1424524c3a.arpa/MD5E-s18935056--4fe9ce073a5c9cb9e601fa1424524c3a.arpa
			
--- a/results/evaluation.csv
+++ b/results/evaluation.csv
@@ -0,0 +1 @@
 
				+../.git/annex/objects/5g/Gj/MD5E-s607--3a8fb15dbd039d29e12a0e126c73112d.csv/MD5E-s607--3a8fb15dbd039d29e12a0e126c73112d.csv
			
--- a/results/results_for_study2_datalad.csv
+++ b/results/results_for_study2_datalad.csv
@@ -1 +0,0 @@
 
				-../.git/annex/objects/gg/pJ/MD5E-s1271727--76f1f59ac68fe619ae34bd03ede011a8.csv/MD5E-s1271727--76f1f59ac68fe619ae34bd03ede011a8.csv
			
--- a/ter
+++ b/ter
@@ -1 +0,0 @@
 
				-.git/annex/objects/K5/g1/MD5E-s11843--f7a613986d426f7c5667fc59487dd559/MD5E-s11843--f7a613986d426f7c5667fc59487dd559
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/Xf/J3/MD5E-s15872283--589ddd6867dae20a40048aaed160f751/MD5E-s15872283--589ddd6867dae20a40048aaed160f751`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/PK/VG/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/k5/41/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/Vj/Zq/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/kK/m3/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/Z5/WJ/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/xQ/Mz/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/Fx/P8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/8k/jP/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd`
		`@@ -1 +0,0 @@`
		`-../../../.git/annex/objects/pq/Gx/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b`