Browse Source

add training and testing scripts

yaya-sy 1 year ago
parent
commit
f77b7928d2
57 changed files with 478 additions and 47 deletions
  1. BIN
      code/__pycache__/get_most_probable_phonemes.cpython-310.pyc
  2. BIN
      code/__pycache__/make_noiser.cpython-310.pyc
  3. BIN
      code/__pycache__/utterances_cleaner.cpython-310.pyc
  4. 5 0
      code/download_childes_corpora.py
  5. 2 2
      code/download_opensubtitles_corpora.py
  6. 16 0
      code/get_most_probable_phonemes.py
  7. 151 0
      code/make_noiser.py
  8. 209 0
      code/test_on_all_languages.py
  9. 36 0
      code/train_language_models_cp.sh
  10. 1 1
      datasets/childes_json_corpora/da.json
  11. 1 1
      datasets/childes_json_corpora/de.json
  12. 1 1
      datasets/childes_json_corpora/en.json
  13. 1 1
      datasets/childes_json_corpora/et.json
  14. 1 1
      datasets/childes_json_corpora/eu.json
  15. 1 1
      datasets/childes_json_corpora/ja.json
  16. 1 1
      datasets/childes_json_corpora/pl.json
  17. 1 1
      datasets/childes_json_corpora/pt.json
  18. 1 1
      datasets/childes_json_corpora/sr.json
  19. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/da.one_sentence_per_line
  20. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/de.one_sentence_per_line
  21. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/en.one_sentence_per_line
  22. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/es.one_sentence_per_line
  23. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/et.one_sentence_per_line
  24. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/eu.one_sentence_per_line
  25. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/fr.one_sentence_per_line
  26. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/ja.one_sentence_per_line
  27. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/pl.one_sentence_per_line
  28. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/pt.one_sentence_per_line
  29. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/sr.one_sentence_per_line
  30. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/tr.one_sentence_per_line
  31. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/da.one_sentence_per_line
  32. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/de.one_sentence_per_line
  33. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/en.one_sentence_per_line
  34. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/es.one_sentence_per_line
  35. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/et.one_sentence_per_line
  36. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/eu.one_sentence_per_line
  37. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/fr.one_sentence_per_line
  38. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/ja.one_sentence_per_line
  39. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/pl.one_sentence_per_line
  40. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/pt.one_sentence_per_line
  41. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/sr.one_sentence_per_line
  42. 1 1
      datasets/opensubtitles_corpora/tokenized_in_phonemes_train/tr.one_sentence_per_line
  43. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line
  44. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line
  45. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line
  46. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line
  47. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line
  48. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line
  49. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line
  50. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line
  51. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line
  52. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line
  53. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line
  54. 1 1
      datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line
  55. 12 0
      extra/languages_to_download_informations.yaml
  56. 1 0
      results/results_for_study2_datalad.csv
  57. 1 0
      ter

BIN
code/__pycache__/get_most_probable_phonemes.cpython-310.pyc


BIN
code/__pycache__/make_noiser.cpython-310.pyc


BIN
code/__pycache__/utterances_cleaner.cpython-310.pyc


+ 5 - 0
code/download_childes_corpora.py

@@ -53,6 +53,7 @@ class DownloadChildCorpora :
     def participants_data(self,
                             chat,
                             participants_to_consider: List[str],
+                            ort_tier,
                             phonemize_child: bool,
                             ) -> Generator:
         """
@@ -86,6 +87,8 @@ class DownloadChildCorpora :
             file_utterances = chat.utterances(by_files=True, participants=participant)
             if not(phonemize_child) and participant == "CHI" :
                 tiers = ["pho", "%pho", "xpho", "%xpho"]
+            elif ort_tier :
+                tiers = ["ort", "%ort", "xort", "%xort"]
             else :
                 tiers = [participant]
             for age, utterances in zip(ages, file_utterances) :
@@ -171,6 +174,7 @@ class DownloadChildCorpora :
         - phonemize_child: bool
 
         """
+        print(language, languages_to_download_informations[language]["ort_tier"])
         participants_to_consider = languages_to_download_informations[language]["participants"]
         downloading_file = open(f"{out_dirname}/{language}.one_utterance_per_line", 
                                 "w", encoding="UTF-8")
@@ -196,6 +200,7 @@ class DownloadChildCorpora :
                 chat_family = chat.filter(match=family)
                 for role, participant, age, utterances in self.participants_data(chat_family,
                                                                 participants_to_consider,
+                                                                ort_tier=languages_to_download_informations[language]["ort_tier"],
                                                                 phonemize_child=phonemize_child) :
                     for utterance in self.get_phonetic_utterances(utterances,
                                                                     participant,

+ 2 - 2
code/download_opensubtitles_corpora.py

@@ -133,7 +133,7 @@ class DownloadOpenSubtitlesData :
             output_file_words = open(f"{out_dirname}/tokenized_in_words/{language}.one_sentence_per_line", "w")
             output_file_dev = open(f"{out_dirname}/tokenized_in_phonemes_dev/{language}.one_sentence_per_line", "w")
             espeak_language_id = loaded_yaml_file[language]["espeak_language_id"]
-            backend = EspeakBackend(language=espeak_language_id)
+            backend = EspeakBackend(language=espeak_language_id, language_switch="remove-utterance")
             added_sents = set()
             for sent, progress_bar in self.get_sentences(language, max_sents_to_download, chunk=chunk) :
                 sent = self._remove_ponctuations(sent)
@@ -177,7 +177,7 @@ if __name__ == "__main__" :
                         required=False)
     parser.add_argument("--train_sentences",
                         help="Number of sent for the train corpora.",
-                        default=500_000,
+                        default=200_000,
                         required=False)
     parser.add_argument("--dev_sentences",
                         help="Number of sent for the dev copora.",

+ 16 - 0
code/get_most_probable_phonemes.py

@@ -0,0 +1,16 @@
+from collections import defaultdict
+def get_most_probable_phonemes(one_sentence_per_line_file, p=0.007) :
+    """
+    Compute the probabilities of phonemes and return the phonemes for
+    which probabilities > p.
+    """
+    counts = defaultdict(int)
+    for sentence in open(one_sentence_per_line_file) :
+        sentence = sentence.rstrip()
+        for word in sentence.split("@") :
+            for phoneme in word.split("$") :
+                counts[phoneme] += 1
+    total = sum(counts.values())
+    for phoneme in counts :
+        counts[phoneme] /= total
+    return [phoneme for phoneme, probability in counts.items() if probability >= p]

+ 151 - 0
code/make_noiser.py

@@ -0,0 +1,151 @@
+"""This module implements some methods to artificially noise the data.
+"""
+
+from typing import List, Dict
+from random import choices, sample, shuffle
+import copy
+import random
+random.seed(80)
+
+class Noise :
+    """
+    This class simulates noise in the data. Crucially, noise can be made on three points :\
+    (1) Noising the phonemes order of a given sequence by making the order of the sequence more arbitrary,\
+    (2) Replacement of some phonemes of a given sequence by arbitrarily sampled phonemes from a vocabulary and\
+    (3) By arbitrarily swapping some sequences of two different speakers.
+
+    Attributes
+    ---------
+    - phonemes_order_noise :
+        Parameter for controlling the degree of noise at the level of phonemes order. See the point 1 mentioned above.
+    - speakers_noise :
+        Parameters for controlling the degree of noise at the level of speakers. See the point 3 mentioned above.
+    - phonemes_noise :
+        Parameter for controlling the degree of noise at the level of phonemes. See the point 2 mentioned above.
+    """
+
+    def __init__(self,
+                    most_probable_phonemes: list,
+                    phonemes_order_noise=0.3,
+                    speakers_noise=(0.5, 0.5),
+                    phonemes_noise=0.5) :
+        self.most_probable_phonemes = most_probable_phonemes
+        self.phonemes_order_noise = phonemes_order_noise
+        self.speakers_noise = speakers_noise
+        self.phonemes_noise = phonemes_noise
+    
+    def _order_noise(self, sequence: List[str]) -> str :
+        """
+        Make noise in the order of the phonemes of a given sequence.
+
+        Parameters
+        ----------
+        - sequence : list
+            The sequence for which the phonemes order must be noised.
+        
+        Returns
+        -------
+        - str :
+            The sequence with the order of phonemes noised.
+        """
+        # number of phonemes to noise in the sequence = len(sequence) * phonemes_order_noise
+        phonemes_to_noise = round(len(sequence) * self.phonemes_order_noise)
+        # sample nb_phonemes_to_noise positions in the sequence
+        positions_sampled = list(sample(range(len(sequence)), k=phonemes_to_noise))
+        copied_positions = copy.deepcopy(positions_sampled)
+        shuffle(copied_positions)
+        # change the positions of the sampled phonemes 
+        for original_position, new_position in zip(positions_sampled, copied_positions):
+            sequence[original_position] = sequence[new_position]
+        return " ".join(sequence)
+
+    def _phonemes_noise(self, sequence: List[str]) -> str :
+        """
+        Make noise in the phonemes of the sequence by replacing\
+        some phonemes of the sequence by arbitrarily sampled phonemes\
+        from the vocabulary.
+
+        Parameters
+        ----------
+        - sequence : list
+            The sequence for which the phonemes must be noised.
+        
+        Returns
+        -------
+        - str :
+            The sequence with noised phonemes.
+        """
+        phonemes_to_noise = round(len(sequence) * self.phonemes_noise)
+        indexes = choices(range(len(sequence)), k=phonemes_to_noise)
+        # choose new phonemes only from the most probable phonemes.
+        phonemes = choices(self.most_probable_phonemes, k=phonemes_to_noise)
+        # and replace some indices of the sequence by those chosen phonemes
+        for idx, phonemes in zip(indexes, phonemes) :
+            sequence[idx] = phonemes
+        return " ".join(sequence)
+    
+    def _speakers_noise(self, speakers_sequences: Dict[str, set]) -> Dict[str, set] :
+        """
+        Making noise in the speaker's statements.
+
+        Parameters
+        ----------
+        - speakers_sequences : dict
+            Dictionary containing the utterances for each speaker.
+        
+        Returns
+        -------
+        - dict :
+            The dictionary containing the few statements interchanged between the two speakers.
+        """
+        first_speaker, second_speaker = "Target_Child", "Adult"
+        noise_first_speaker, noise_second_speaker = self.speakers_noise
+        speakers_sequences[second_speaker] = set(speakers_sequences[second_speaker])
+        speakers_sequences[first_speaker] = set(speakers_sequences[first_speaker])
+        # sample some percentage of utterances from each speaker
+        sequences_to_noise_second_speaker = round(len(speakers_sequences[second_speaker]) * noise_second_speaker)
+        sequences_to_noise_first_speaker = round(len(speakers_sequences[first_speaker]) * noise_first_speaker)
+        sequences_noise_second_speaker = sample(list(speakers_sequences[second_speaker]), k=sequences_to_noise_second_speaker)
+        sequences_noise_first_speaker = sample(list(speakers_sequences[first_speaker]), k=sequences_to_noise_first_speaker)
+        # noise by interchanging sampled utterances
+        speakers_sequences[second_speaker] -= set(sequences_noise_second_speaker)
+        speakers_sequences[first_speaker] -= set(sequences_noise_first_speaker)
+        speakers_sequences[second_speaker] |= set(sequences_noise_first_speaker)
+        speakers_sequences[first_speaker] |= set(sequences_noise_second_speaker)
+        # set to list
+        speakers_sequences[first_speaker] = list(speakers_sequences[first_speaker])
+        speakers_sequences[second_speaker] = list(speakers_sequences[second_speaker])
+        
+        return speakers_sequences
+    
+    def __call__(self, loaded_dataset: dict) -> dict:
+        """
+        Apply the three types of noise.
+
+        Parameters
+        ----------
+        loaded_dataset : dict
+            The dictionary containing the utterances for each family, at each age and for each speaker.
+        
+        Returns
+        -------
+        dict :
+            The noised data.
+        """
+        for family in loaded_dataset :
+            for age in loaded_dataset[family] :
+                if "Adult" in loaded_dataset[family][age] :
+                    for idx, utterance in enumerate(loaded_dataset[family][age]["Adult"]) :
+                        splitted_utterance = utterance.split(" ")
+                        loaded_dataset[family][age]["Adult"][idx] = self._order_noise(splitted_utterance)
+                        loaded_dataset[family][age]["Adult"][idx] = self._phonemes_noise(splitted_utterance)
+                if "Target_Child" in loaded_dataset[family][age] :
+                    for idx, utterance in enumerate(loaded_dataset[family][age]["Target_Child"]) :
+                        splitted_utterance = utterance.split(" ")
+                        loaded_dataset[family][age]["Target_Child"][idx] = self._order_noise(splitted_utterance)
+                        loaded_dataset[family][age]["Target_Child"][idx] = self._phonemes_noise(splitted_utterance)
+                if "Target_Child" in loaded_dataset[family][age] and "Adult" in loaded_dataset[family][age] and all(self.speakers_noise):
+                    noised_speaker = self._speakers_noise(loaded_dataset[family][age])
+                    loaded_dataset[family][age]["Target_Child"] = noised_speaker["Target_Child"]
+                    loaded_dataset[family][age]["Adult"] = noised_speaker["Adult"]
+        return loaded_dataset

+ 209 - 0
code/test_on_all_languages.py

@@ -0,0 +1,209 @@
+import os
+import sys
+import json
+sys.path.append("./")
+sys.path.append("../")
+sys.path.append(".../")
+from itertools import product
+from tqdm import tqdm
+import kenlm
+from math import log
+import numpy as np
+from make_noiser import Noise
+import pandas as pd
+import sys
+from get_most_probable_phonemes import get_most_probable_phonemes
+import random
+from collections import Counter
+random.seed(1023)
+
+
+LANGUAGES_TYPOLOGIES = {
+    'da' : ("Danish", "fusional"),
+    'de' : ("German", "fusional"),
+    'en' : ("English", "fusional"),
+    'es' : ("Spanish", "fusional"),
+    'et' : ("Estonian", "agglutinative"),
+    'eu' : ("Basque", "agglutinative"),
+    'fr' : ("French", "fusional"),
+    'ja' : ("Japanese", "agglutinative"), 
+    'pl' : ("Polish", "fusional"), 
+    'pt' : ("Portuguese", "fusional"), 
+    'sr' : ("Serbian", "fusional"), 
+    'tr' : ("Turkish", "agglutinative")}
+
+def compute_word_frequencies(word_train_corpus, pct=0.95) :
+    frequencies = Counter()
+    for line in word_train_corpus :
+        line = line.strip()
+        if not line : continue
+        # line = line.strip()
+        frequencies.update(Counter(line.split(" ")))
+    return dict(frequencies)
+
+
+def statistics_word(utterances, word_frequencies, model) :
+    phoneme_utterances = []
+    unique_words = set()
+    nb_unk = 0
+    mlu_w = 0.0
+    mlu_p = 0.0
+    mean_word_frequencies = 0
+    nb_utterances = 0
+    nb_words = 0
+
+    statistics = {}
+    for utterance in utterances :
+        utterance = utterance.strip()
+        if not utterance : continue
+        nb_utterances += 1
+
+        utterance_w = utterance.replace("@", " ").replace("$", "")
+        utterance_p = utterance.replace("@", " ").replace("$", " ")
+        phoneme_utterances.append(utterance_p)
+
+        utterance_words = utterance_w.split(" ")
+        mlu_w += len(utterance_words)
+        mlu_p += len(utterance_p.split(" "))
+        nb_words += len(utterance_words)
+        unique_words |= set(utterance_words)
+
+        for word in utterance_words :
+            word = word.strip()
+            if word in word_frequencies :
+                mean_word_frequencies += word_frequencies[word]
+            else : 
+                nb_unk += 1
+    
+    mlu_w /= nb_utterances
+    mlu_p /= nb_utterances
+    ttr_w = len(unique_words) / nb_words
+    
+    ppl = model.perplexity("\n".join(phoneme_utterances))
+    entropy = log(ppl)
+
+    statistics["ppl"] = ppl
+    statistics["entropy"] = entropy
+    statistics["mlu_w"] = mlu_w
+    statistics["mlu_p"] = mlu_p
+    statistics["ttr_w"] = ttr_w
+    statistics["mean_word_frequencies"] = mean_word_frequencies
+    statistics["nb_unk"] = nb_unk
+
+    return statistics
+
+def create_sparse_combinantions(values) :
+    sparse_combinantions = []
+    for value in values :
+        for idx in range(len(values)) : 
+            sparse_values = [0.0] * len(values)
+            sparse_values[idx] = value
+            sparse_combinantions.append(tuple(sparse_values))
+    return set(sparse_combinantions)
+
+def test(json_files_directory, models_directory, phoneme_train_files, word_train_files, add_noise=False) :
+    """
+    """
+    columns = ["language", "typology", "family", "speaker",\
+                "age", "perplexity", "entropy", "mlu", "mlu_without_repetition",\
+                "phonemes_order_noise", "speakers_noise_adult",\
+                "speakers_noise_child", "phonemes_noise"]
+    results = pd.DataFrame(columns=columns, index=None)
+    all_combinations = list(product((0.0, 0.25, 0.5, 0.75), repeat=4)) if add_noise else [((0.0, 0.0, 0.0, 0.0))]
+    # sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
+    # noise_values = np.linspace(0.0, 1.0, num=6)
+    for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(all_combinations, total=len(all_combinations)) :
+        for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
+            lg_iso, _ = test_filename.split(".")
+            model_lg = model_filename.split(".")[0]
+            if lg_iso != model_lg : continue
+            print(lg_iso, model_lg)
+            most_probable_phonemes = get_most_probable_phonemes(f"{phoneme_train_files}/{lg_iso}.one_sentence_per_line")
+            word_frequencies = compute_word_frequencies(f"{word_train_files}/{lg_iso}.one_sentence_per_line")
+            loaded_json = json.load(open(f"{json_files_directory}/{test_filename}"))
+            if add_noise :
+                noise = Noise(most_probable_phonemes,
+                                phonemes_order_noise=phonemes_order_noise,
+                                speakers_noise=(speakers_noise_child, speakers_noise_adult),
+                                phonemes_noise=phonemes_noise)
+                loaded_json = noise(loaded_json)
+            model = kenlm.Model(f"{models_directory}/{model_filename}")
+            for family in loaded_json :
+                for age in loaded_json[family] : 
+                    if age == "None" : print(family, lg_iso, age); continue
+                    for speaker in loaded_json[family][age] :
+                        if speaker not in ["Adult", "Target_Child"] : continue
+                        # test_utterances = "\n".join(loaded_json[family][age][speaker])
+                        # utterances = [utterance.split(" ") for utterance in loaded_json[family][age][speaker]]
+                        # mlu = np.mean([len(utterance) for utterance in utterances])
+                        # mlu_without_repetition = np.mean([len(set(utterance)) for utterance in utterances])
+                        # ppl = model.perplexity(test_utterances)
+                        # entropy = log(ppl)
+
+                        results_statistics = statistics_word(loaded_json[family][age][speaker], word_frequencies, model)
+                        language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
+                        new_row =  {"language" : language,
+                                    "typology" : typology,
+                                    "family" : family,
+                                    "speaker" : speaker,
+                                    "age" : float(age),
+                                    "perplexity" : results_statistics["ppl"],
+                                    "entropy" : results_statistics["entropy"],
+                                    "mlu_w" : results_statistics["mlu_w"],
+                                    "mlu_p" : results_statistics["mlu_p"],
+                                    "ttr_w" : results_statistics["ttr_w"],
+                                    "mean_word_frequencies" : results_statistics["mean_word_frequencies"],
+                                    "nb_unk" : results_statistics["nb_unk"],
+                                    "phonemes_order_noise" : phonemes_order_noise,
+                                    "speakers_noise_adult" : speakers_noise_adult,
+                                    "speakers_noise_child" : speakers_noise_child,
+                                    "phonemes_noise" : phonemes_noise}
+                        results = results.append(new_row, ignore_index=True)
+    return results
+if __name__ == "__main__":
+    from argparse import ArgumentParser, BooleanOptionalAction
+
+    parser = ArgumentParser()
+    parser.add_argument('--phoneme_train_directory',
+        required=True,
+        help="Dataset containing the train files in phonemes (dot one_sentence_per_line) "
+        )
+    parser.add_argument('--word_train_directory',
+        required=True,
+        help="Dataset containing the train files in words (dot one_sentence_per_line) "
+        )
+    parser.add_argument('--models_directory', 
+        required=True,
+        help="Folder containing the estimated parameters"
+        )
+    
+    parser.add_argument('--json_files_directory',
+        required=True,
+        help="Directory containing json files for test"
+        )
+    
+    parser.add_argument('--out_dirname',
+        required=True,
+        help="Out directory"
+        )
+    parser.add_argument('--out_filename',
+            required=True,
+            help="Out filename"
+            )
+    parser.add_argument("--add_noise", action=BooleanOptionalAction)
+
+    args = parser.parse_args()
+    add_noise = args.add_noise
+    json_files_directory = args.json_files_directory
+    phoneme_train_files, word_train_files = args.phoneme_train_directory, args.word_train_directory
+    models_directory = args.models_directory
+    out_dirname = args.out_dirname
+    out_filename = args.out_filename
+
+    if not os.path.exists("results"):
+        os.makedirs("results")
+    test(json_files_directory, 
+        models_directory, 
+        phoneme_train_files,
+        word_train_files, 
+        add_noise).to_csv(f"{out_dirname}/{out_filename}.csv")

+ 36 - 0
code/train_language_models_cp.sh

@@ -0,0 +1,36 @@
+#!/bin/bash
+
+programname=$0
+function usage {
+    echo "usage: $programname [-h] [-t trainfolder] [-o outfolder] [-k kenlm_path] [-n ngram_size]"
+    echo "  -h  display help"
+    echo "  -n  size of the ngrams for the language model"
+    echo "  -t  folder that contains the train files"
+    echo "  -o  out folder where the estimated parameters will be saved"
+    echo "  -k  path to kenlm folder"
+    exit 1
+}
+
+while getopts t:o:k:n: flag
+do
+    case "${flag}" in
+        n) ngram_size=${OPTARG};;
+        t) train_files=${OPTARG};;
+        o) out_dirname=${OPTARG};;
+        k) kenlm_folder=${OPTARG};;
+    esac
+done
+case $1 in
+    -h) usage; shift ;;
+esac
+shift
+echo "======================================================"
+echo "ngram size: $ngram_size";
+echo "trainfolder: $train_files";
+echo "outfolder: $out_dirname";
+echo "kenlm: $kenlm_folder";
+mkdir -p $out_dirname
+echo "================= STARTING ESTIMATION ================"
+for filename in $train_files/*.one_sentence_per_line; do # train_files*.one_sentence_per_line
+    $kenlm_folder/build/bin/lmplz --discount_fallback -o $ngram_size < $filename > $out_dirname/${filename##*/}.arpa
+done

+ 1 - 1
datasets/childes_json_corpora/da.json

@@ -1 +1 @@
-../../.git/annex/objects/wM/J4/MD5E-s2940316--a8934de46f3d6e2096f15f7f096e92b0.json/MD5E-s2940316--a8934de46f3d6e2096f15f7f096e92b0.json
+../../.git/annex/objects/4W/Xg/MD5E-s2940316--507efe8e52bf5ce75f8df711d87d1f38.json/MD5E-s2940316--507efe8e52bf5ce75f8df711d87d1f38.json

+ 1 - 1
datasets/childes_json_corpora/de.json

@@ -1 +1 @@
-../../.git/annex/objects/kf/Pm/MD5E-s45738282--8c6de93f7bdb6075368bfe148dcb8bd6.json/MD5E-s45738282--8c6de93f7bdb6075368bfe148dcb8bd6.json
+../../.git/annex/objects/m8/K7/MD5E-s45738282--c42b1e618dc3371fc04798b8aec56033.json/MD5E-s45738282--c42b1e618dc3371fc04798b8aec56033.json

+ 1 - 1
datasets/childes_json_corpora/en.json

@@ -1 +1 @@
-../../.git/annex/objects/Pj/f9/MD5E-s32246267--df1c90893070ae0af58f44aad29a0395.json/MD5E-s32246267--df1c90893070ae0af58f44aad29a0395.json
+../../.git/annex/objects/mM/Fk/MD5E-s32246267--53fa0ec80e98ef57b52100fa3e52a686.json/MD5E-s32246267--53fa0ec80e98ef57b52100fa3e52a686.json

+ 1 - 1
datasets/childes_json_corpora/et.json

@@ -1 +1 @@
-../../.git/annex/objects/Q1/jM/MD5E-s8512506--c58972370312e6afca4c11f2d584fd3c.json/MD5E-s8512506--c58972370312e6afca4c11f2d584fd3c.json
+../../.git/annex/objects/1x/Gv/MD5E-s8512506--8f79ccb462b01e1ca3ef1ff5ae5461cd.json/MD5E-s8512506--8f79ccb462b01e1ca3ef1ff5ae5461cd.json

+ 1 - 1
datasets/childes_json_corpora/eu.json

@@ -1 +1 @@
-../../.git/annex/objects/m3/51/MD5E-s1472131--3b9a4df49b35409b79e1e1c74b2fccad.json/MD5E-s1472131--3b9a4df49b35409b79e1e1c74b2fccad.json
+../../.git/annex/objects/gw/pQ/MD5E-s1472131--5b9acb68d334a8e8682beb740356dc91.json/MD5E-s1472131--5b9acb68d334a8e8682beb740356dc91.json

+ 1 - 1
datasets/childes_json_corpora/ja.json

@@ -1 +1 @@
-../../.git/annex/objects/JG/X1/MD5E-s345--bdf72b5e07c0bce649cab0b1a5f8bd69.json/MD5E-s345--bdf72b5e07c0bce649cab0b1a5f8bd69.json
+../../.git/annex/objects/Vk/mV/MD5E-s7591871--e2e5bc66db54eaf1b073a6d115d86a04.json/MD5E-s7591871--e2e5bc66db54eaf1b073a6d115d86a04.json

+ 1 - 1
datasets/childes_json_corpora/pl.json

@@ -1 +1 @@
-../../.git/annex/objects/6m/QZ/MD5E-s16258378--266d0b09899bd9c5628c0d784e1ae49b.json/MD5E-s16258378--266d0b09899bd9c5628c0d784e1ae49b.json
+../../.git/annex/objects/WM/qg/MD5E-s16258378--858cf5b3b3eaa45e0146ba94778f0fe5.json/MD5E-s16258378--858cf5b3b3eaa45e0146ba94778f0fe5.json

+ 1 - 1
datasets/childes_json_corpora/pt.json

@@ -1 +1 @@
-../../.git/annex/objects/3j/4M/MD5E-s6161273--9a3580baaa195d7f526e514e6202eaf4.json/MD5E-s6161273--9a3580baaa195d7f526e514e6202eaf4.json
+../../.git/annex/objects/Qf/kG/MD5E-s6161273--53ac1b2e1337d602434858fb01393f5b.json/MD5E-s6161273--53ac1b2e1337d602434858fb01393f5b.json

+ 1 - 1
datasets/childes_json_corpora/sr.json

@@ -1 +1 @@
-../../.git/annex/objects/7x/Vq/MD5E-s7914319--25b870815a1065c8438bb910b8318073.json/MD5E-s7914319--25b870815a1065c8438bb910b8318073.json
+../../.git/annex/objects/0w/3x/MD5E-s7914319--f4e61920a973cda4ec9629c33959a501.json/MD5E-s7914319--f4e61920a973cda4ec9629c33959a501.json

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/da.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/22/q6/MD5E-s620529--3e4179641f7a348d39505a9a1333c26e/MD5E-s620529--3e4179641f7a348d39505a9a1333c26e
+../../../.git/annex/objects/12/Qf/MD5E-s595067--26daef9a8e7037a20f64d3ad8d8209d3/MD5E-s595067--26daef9a8e7037a20f64d3ad8d8209d3

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/de.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/p6/Kp/MD5E-s689485--d4f53c518c1d1dd09307030cdbdb0151/MD5E-s689485--d4f53c518c1d1dd09307030cdbdb0151
+../../../.git/annex/objects/2g/jv/MD5E-s673727--420d9cd7580290e7c05e7e549ee858ec/MD5E-s673727--420d9cd7580290e7c05e7e549ee858ec

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/en.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/JZ/87/MD5E-s625387--a6f26de5279b172fd418d24b29a2b059/MD5E-s625387--a6f26de5279b172fd418d24b29a2b059
+../../../.git/annex/objects/8Z/2j/MD5E-s613363--0ad1391f05853d19d8ce90476241e71c/MD5E-s613363--0ad1391f05853d19d8ce90476241e71c

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/es.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/JV/2g/MD5E-s608100--dc406486e645c5aeeac92e84fd0eec85/MD5E-s608100--dc406486e645c5aeeac92e84fd0eec85
+../../../.git/annex/objects/5M/Xk/MD5E-s592515--da9b5dff8a63b0b51b41c9f21c6dd235/MD5E-s592515--da9b5dff8a63b0b51b41c9f21c6dd235

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/et.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/Qm/WG/MD5E-s604630--bee32118984c8a3cde226ec022de0797/MD5E-s604630--bee32118984c8a3cde226ec022de0797
+../../../.git/annex/objects/24/Wx/MD5E-s606313--bc07ac2955249b15e6d734dc8dd19e11/MD5E-s606313--bc07ac2955249b15e6d734dc8dd19e11

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/eu.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/Zp/8m/MD5E-s646644--f43bb88e1c39fc23a7f0490ce2d4260a/MD5E-s646644--f43bb88e1c39fc23a7f0490ce2d4260a
+../../../.git/annex/objects/fq/qk/MD5E-s635915--f8669d471d298cd961dd4d71c7029820/MD5E-s635915--f8669d471d298cd961dd4d71c7029820

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/fr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/JM/4Q/MD5E-s513721--bc168f268a7a7eb87c9621abf76c858a/MD5E-s513721--bc168f268a7a7eb87c9621abf76c858a
+../../../.git/annex/objects/W5/wz/MD5E-s491440--98c1c6200f0cf71bab1f19afc6ae786a/MD5E-s491440--98c1c6200f0cf71bab1f19afc6ae786a

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/ja.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/8x/zf/MD5E-s1625087--e594546108226554dac59cdd0ab2ca2e/MD5E-s1625087--e594546108226554dac59cdd0ab2ca2e
+../../../.git/annex/objects/ZG/Mv/MD5E-s411971--9401ebc08169e833758f22dd545948c7/MD5E-s411971--9401ebc08169e833758f22dd545948c7

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/pl.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/8m/k1/MD5E-s659395--ab04c68cbdf80baec54b754ec18bfbc9/MD5E-s659395--ab04c68cbdf80baec54b754ec18bfbc9
+../../../.git/annex/objects/j2/v7/MD5E-s656657--53d1b3e06925058cfcae62e7336936db/MD5E-s656657--53d1b3e06925058cfcae62e7336936db

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/pt.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/qV/6F/MD5E-s699709--b798071d8c259c4688d455f2d6f09ffc/MD5E-s699709--b798071d8c259c4688d455f2d6f09ffc
+../../../.git/annex/objects/XF/1f/MD5E-s694072--7b2360e5f4798bbe7fd1f1a6976b24cc/MD5E-s694072--7b2360e5f4798bbe7fd1f1a6976b24cc

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/sr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/56/7m/MD5E-s602585--c472364aa49ebd8287926311425fd9aa/MD5E-s602585--c472364aa49ebd8287926311425fd9aa
+../../../.git/annex/objects/k5/FG/MD5E-s599916--9d1ec3fce9ddb349d98f0f92f1dffe2a/MD5E-s599916--9d1ec3fce9ddb349d98f0f92f1dffe2a

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_dev/tr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/wG/6Z/MD5E-s667909--4b1dacd17b2eb82471d553414475cf54/MD5E-s667909--4b1dacd17b2eb82471d553414475cf54
+../../../.git/annex/objects/G9/9v/MD5E-s647034--a7d85ff63f867ba9b46680c9eef961ee/MD5E-s647034--a7d85ff63f867ba9b46680c9eef961ee

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/da.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/16/Fj/MD5E-s31107032--e698d686ce611047a1ce737a619056de/MD5E-s31107032--e698d686ce611047a1ce737a619056de
+../../../.git/annex/objects/PP/V3/MD5E-s12008967--476f02a58c035edf541c6590def4bc52/MD5E-s12008967--476f02a58c035edf541c6590def4bc52

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/de.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/K4/x9/MD5E-s34412970--bcea547d24542c2d907adb45c217bd52/MD5E-s34412970--bcea547d24542c2d907adb45c217bd52
+../../../.git/annex/objects/Xg/9g/MD5E-s13505000--de3cba8644030a72a5548df2c5c16020/MD5E-s13505000--de3cba8644030a72a5548df2c5c16020

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/en.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/29/58/MD5E-s31351782--9a33006b69492c7d114a9a7456869a98/MD5E-s31351782--9a33006b69492c7d114a9a7456869a98
+../../../.git/annex/objects/kQ/1j/MD5E-s12222344--ac4aadce59191ecd68e78881a778e89d/MD5E-s12222344--ac4aadce59191ecd68e78881a778e89d

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/es.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/9k/7w/MD5E-s30321219--2d8998016a695d15b67609150a988316/MD5E-s30321219--2d8998016a695d15b67609150a988316
+../../../.git/annex/objects/3M/6F/MD5E-s11853653--434f1f83e2876d6a32538dade4cc8ad6/MD5E-s11853653--434f1f83e2876d6a32538dade4cc8ad6

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/et.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/xW/J8/MD5E-s30304598--8fed9d9b82dd421f28b51acd464823a6/MD5E-s30304598--8fed9d9b82dd421f28b51acd464823a6
+../../../.git/annex/objects/m4/Zk/MD5E-s12016578--141ac10bd93fe71a564e0b7fc2dc6992/MD5E-s12016578--141ac10bd93fe71a564e0b7fc2dc6992

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/eu.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/Xp/8G/MD5E-s32429278--e6e85a8b62e2214622487ff408ef4e22/MD5E-s32429278--e6e85a8b62e2214622487ff408ef4e22
+../../../.git/annex/objects/fx/w5/MD5E-s12770799--0cfaaf9eb1132b8019cfec861172b2f8/MD5E-s12770799--0cfaaf9eb1132b8019cfec861172b2f8

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/fr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/1q/JJ/MD5E-s25810878--d85276e4c8eff319b9211f274d008ca0/MD5E-s25810878--d85276e4c8eff319b9211f274d008ca0
+../../../.git/annex/objects/PZ/jM/MD5E-s9854135--1b38bf46f6af95339f19aede45d525f2/MD5E-s9854135--1b38bf46f6af95339f19aede45d525f2

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/ja.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/F5/K1/MD5E-s81417420--a774ba1fd6507464c964f081839bf39a/MD5E-s81417420--a774ba1fd6507464c964f081839bf39a
+../../../.git/annex/objects/P8/F1/MD5E-s8244649--eee20b1c9e63baaa249a232269b92689/MD5E-s8244649--eee20b1c9e63baaa249a232269b92689

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/pl.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/g0/41/MD5E-s33219564--169b868d2c3648b0393b6e0c3a044b17/MD5E-s33219564--169b868d2c3648b0393b6e0c3a044b17
+../../../.git/annex/objects/vP/Pm/MD5E-s12963284--fbc0fd3c0cde55268df3c4d6a3bb65a5/MD5E-s12963284--fbc0fd3c0cde55268df3c4d6a3bb65a5

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/pt.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/2f/4m/MD5E-s35272955--ba7dcb3f711316f6af9bf357d95d9e98/MD5E-s35272955--ba7dcb3f711316f6af9bf357d95d9e98
+../../../.git/annex/objects/wp/jV/MD5E-s13808136--74940aca41dd54ffc3b04d0c45179642/MD5E-s13808136--74940aca41dd54ffc3b04d0c45179642

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/sr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/x1/4p/MD5E-s30061053--5620fe89e6d26e074bb379bbd95e36fa/MD5E-s30061053--5620fe89e6d26e074bb379bbd95e36fa
+../../../.git/annex/objects/95/8M/MD5E-s12099820--03a98f2b4b89b71bb3d9f0a567925e11/MD5E-s12099820--03a98f2b4b89b71bb3d9f0a567925e11

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_phonemes_train/tr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/WW/Mm/MD5E-s33685177--d95e01d72ee0dd6f0790fca4dbca48de/MD5E-s33685177--d95e01d72ee0dd6f0790fca4dbca48de
+../../../.git/annex/objects/Z7/kQ/MD5E-s13012737--6ccb2eef54d0e81e131c17bbeb73ade6/MD5E-s13012737--6ccb2eef54d0e81e131c17bbeb73ade6

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/da.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/P8/qq/MD5E-s40842094--5e414ba9258d198e4c16a418d21b67de/MD5E-s40842094--5e414ba9258d198e4c16a418d21b67de
+../../../.git/annex/objects/Xf/J3/MD5E-s15872283--589ddd6867dae20a40048aaed160f751/MD5E-s15872283--589ddd6867dae20a40048aaed160f751

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/de.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/q1/g4/MD5E-s39490897--cad7d85081ffde5ef56405d24c68c682/MD5E-s39490897--cad7d85081ffde5ef56405d24c68c682
+../../../.git/annex/objects/PK/VG/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779/MD5E-s16478486--16c8639a6e1bc848b9e7c835138bc779

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/en.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/Fq/pZ/MD5E-s57888856--a82069bcea0a269d92cbbbbe672a4d88/MD5E-s57888856--a82069bcea0a269d92cbbbbe672a4d88
+../../../.git/annex/objects/k5/41/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be/MD5E-s24346046--828c32d227d0c2d934ac85954c0549be

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/es.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/f5/14/MD5E-s41007348--440c56d371eb18aca5ce834982c43aad/MD5E-s41007348--440c56d371eb18aca5ce834982c43aad
+../../../.git/annex/objects/Vj/Zq/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf/MD5E-s15998814--588f8969805d7e19a2fb8555c805c7bf

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/et.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/Zq/1W/MD5E-s41552139--97c3f7451c3512dd06e3ae273490ba44/MD5E-s41552139--97c3f7451c3512dd06e3ae273490ba44
+../../../.git/annex/objects/kK/m3/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc/MD5E-s16250386--1fa013016100de4c1369aaaf5b51c3bc

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/eu.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/f0/p4/MD5E-s26601340--e0f57b293612bd2424307139fe8a189b/MD5E-s26601340--e0f57b293612bd2424307139fe8a189b
+../../../.git/annex/objects/Z5/WJ/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466/MD5E-s10355435--f2c85a6f418cd351e643387f77f80466

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/fr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/Wq/xq/MD5E-s38046127--184b33f856068727f9405bf33a97371f/MD5E-s38046127--184b33f856068727f9405bf33a97371f
+../../../.git/annex/objects/xQ/Mz/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef/MD5E-s13149110--66bc34c1a1005f80f734ff7c9e0330ef

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/ja.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/Qk/wk/MD5E-s76292553--606db16d9750d6dd927c3c96ccfb3f3e/MD5E-s76292553--606db16d9750d6dd927c3c96ccfb3f3e
+../../../.git/annex/objects/Fx/P8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8/MD5E-s14790606--0468f28b5726a09d980c07fb217974e8

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/pl.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/0Q/4V/MD5E-s81519425--54c4f3d055c65aaf52d17eeedd2f5104/MD5E-s81519425--54c4f3d055c65aaf52d17eeedd2f5104
+../../../.git/annex/objects/8k/jP/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd/MD5E-s27403501--65e8c8d45fe6075ab088da985dde5bfd

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/pt.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/3V/m7/MD5E-s50411350--1d515b5fb8f071aa67d32a6d16d68f1b/MD5E-s50411350--1d515b5fb8f071aa67d32a6d16d68f1b
+../../../.git/annex/objects/pq/Gx/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b/MD5E-s19442758--174c64a1b49c9d94615dfcfb5c52508b

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/sr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/6M/8P/MD5E-s40816898--31575a0edbb081e58f31f0c367bb7f64/MD5E-s40816898--31575a0edbb081e58f31f0c367bb7f64
+../../../.git/annex/objects/VK/59/MD5E-s15029711--76eb34c274a6e5ab100575ce2fa60ab0/MD5E-s15029711--76eb34c274a6e5ab100575ce2fa60ab0

+ 1 - 1
datasets/opensubtitles_corpora/tokenized_in_words/tr.one_sentence_per_line

@@ -1 +1 @@
-../../../.git/annex/objects/53/j5/MD5E-s64456933--7ba9d1f86839b0e5da308d2809d8f4eb/MD5E-s64456933--7ba9d1f86839b0e5da308d2809d8f4eb
+../../../.git/annex/objects/FM/Vk/MD5E-s22131015--b159f0714684e9fc60945904418a1240/MD5E-s22131015--b159f0714684e9fc60945904418a1240

+ 12 - 0
extra/languages_to_download_informations.yaml

@@ -5,6 +5,7 @@ da:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://childes.talkbank.org/data/Scandinavian/Danish/Plunkett.zip
 de:
@@ -14,6 +15,7 @@ de:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://childes.talkbank.org/data/German/Caroline.zip
   - https://childes.talkbank.org/data/German/Password/Leo.zip
@@ -26,6 +28,7 @@ en:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://phonbank.talkbank.org/data/Eng-NA/Providence.zip
   - https://childes.talkbank.org/data/Eng-NA/Warren.zip
@@ -37,6 +40,7 @@ es:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://childes.talkbank.org/data/Spanish/OreaPine.zip
   - https://childes.talkbank.org/data/Spanish/Aguirre.zip
@@ -52,6 +56,7 @@ et:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://childes.talkbank.org/data/Other/Estonian/Argus.zip
   - https://childes.talkbank.org/data/Other/Estonian/Beek.zip
@@ -68,6 +73,7 @@ eu:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://childes.talkbank.org/data/Other/Basque/Soto.zip
 fr:
@@ -77,6 +83,7 @@ fr:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://phonbank.talkbank.org/data/French/Paris.zip
   - https://phonbank.talkbank.org/data/French/Hunkeler.zip
@@ -91,6 +98,7 @@ ja:
   - Target_Child
   - Mother
   - Father
+  ort_tier: True
   urls:
   - https://childes.talkbank.org/data/Japanese/Hamasaki.zip
   - https://childes.talkbank.org/data/Japanese/Miyata.zip
@@ -103,6 +111,7 @@ pl:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://phonbank.talkbank.org/data/Slavic/Polish/WeistJarosz.zip
   - https://childes.talkbank.org/data/Slavic/Polish/Szuman.zip
@@ -113,6 +122,7 @@ pt:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://phonbank.talkbank.org/data/Romance/Portuguese/CCF.zip
   - https://childes.talkbank.org/data/Romance/Portuguese/Florianopolis.zip
@@ -124,6 +134,7 @@ sr:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://childes.talkbank.org/data/Slavic/Serbian/SCECL.zip
 tr:
@@ -133,6 +144,7 @@ tr:
   - Target_Child
   - Mother
   - Father
+  ort_tier: False
   urls:
   - https://childes.talkbank.org/data/Other/Turkish/Aksu.zip
   - https://childes.talkbank.org/data/Other/Turkish/Altinkamis.zip

+ 1 - 0
results/results_for_study2_datalad.csv

@@ -0,0 +1 @@
+../.git/annex/objects/gg/pJ/MD5E-s1271727--76f1f59ac68fe619ae34bd03ede011a8.csv/MD5E-s1271727--76f1f59ac68fe619ae34bd03ede011a8.csv

+ 1 - 0
ter

@@ -0,0 +1 @@
+.git/annex/objects/K5/g1/MD5E-s11843--f7a613986d426f7c5667fc59487dd559/MD5E-s11843--f7a613986d426f7c5667fc59487dd559