yaya-sy 1 year ago
parent
commit
f7c6ced59f
3 changed files with 6 additions and 8 deletions
  1. 3 1
      README.md
  2. 2 7
      code/test_on_all_languages.py
  3. 1 0
      results/plot_results.png

+ 3 - 1
README.md

@@ -9,7 +9,9 @@ and activate it :
 
 ```conda activate measuring_cld```
 
-We provide all the data already pre-processed and phonemized. But if you want to re-download the raw data and to re-pre-processed it entierely, then you will need to install phonemizer (https://github.com/bootphon/phonemizer) with the espeak backend.
+You will also need to install KenLM (https://github.com/kpu/kenlm).
+
+We provide all the data already pre-processed and phonemized. However, if you want to re-download the raw data and pre-process it again entirely, you will need to install phonemizer (https://github.com/bootphon/phonemizer) with the espeak backend.
 
 ## Folder structure
 

+ 2 - 7
code/test_on_all_languages.py

@@ -81,10 +81,9 @@ def create_sparse_combinantions(values: Iterable, variables=3) -> set:
     """
     This function will create combinations for noising.
     Each item in the returned set contains three values corresponding\
-    to (1) phoneme noise, (2) noise of from adult to child utterances,\
-    (3) noise of from child to adult utterances and (4) noise of
+    to (1) phoneme noise, (2) speaker noise and (3) noise of the order of the phonemes.
     These combinations are sparse because we only noise one value at a time.
-    For example, an item can be (0.0, 0.0, 0.0, 0.25), which means that we only
+    For example, an item can be (0.0, 0.0, 0.25), which means that we only
     noise the order of 25 percent of the phonemes, and nothing else is affected.
     See the file make_noiser.py for more information.
     """
@@ -104,10 +103,7 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
                 "age", "perplexity", "entropy", "phonemes_order_noise",\
                 "speakers_noise", "phonemes_noise"]
     results = pd.DataFrame(columns=columns, index=None)
-    # all_combinations = (list(product((0.0, 0.25, 0.5, 0.75), repeat=4))
-    #                       if add_noise else [((0.0, 0.0, 0.0, 0.0))])
     sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75, 1))
-    # noise_values = np.linspace(0.0, 1.0, num=6)
     for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
         for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
             lg_iso, _ = test_filename.split(".")
@@ -128,7 +124,6 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
                     if age == "None" : print(family, lg_iso, age); continue
                     for speaker in loaded_json[family][age] :
                         if speaker not in ["Adult", "Target_Child"] : continue
-                        # results_statistics = statistics_word(loaded_json[family][age][speaker], model)
                         language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                         ppl = model.perplexity("\n".join(loaded_json[family][age][speaker]))
                         entropy = log(ppl)

+ 1 - 0
results/plot_results.png

@@ -0,0 +1 @@
+../.git/annex/objects/Vq/1V/MD5E-s1158490--07e64ccccdf09682d892e990390940c3.png/MD5E-s1158490--07e64ccccdf09682d892e990390940c3.png