yaya-sy 1 year ago
parent
commit
f7c6ced59f
3 changed files with 6 additions and 8 deletions
  1. 3 1
      README.md
  2. 2 7
      code/test_on_all_languages.py
  3. 1 0
      results/plot_results.png

+ 3 - 1
README.md

@@ -9,7 +9,9 @@ and activate it :
 
 
 ```conda activate measuring_cld```
 ```conda activate measuring_cld```
 
 
-We provide all the data already pre-processed and phonemized. But if you want to re-download the raw data and to re-pre-processed it entierely, then you will need to install phonemizer (https://github.com/bootphon/phonemizer) with the espeak backend.
+You will also need to install KenLM (https://github.com/kpu/kenlm).
+
+We provide all the data already pre-processed and phonemized. But if you want to re-download the raw data and re-pre-process them entirely, then you will need to install phonemizer (https://github.com/bootphon/phonemizer) with the espeak backend.
 
 
 ## Folder structure
 ## Folder structure
 
 

+ 2 - 7
code/test_on_all_languages.py

@@ -81,10 +81,9 @@ def create_sparse_combinantions(values: Iterable, variables=3) -> set:
     """
     """
     This function will create combinantions for noising.
     This function will create combinantions for noising.
     Each item in the returned set contains three values corresponding\
     Each item in the returned set contains three values corresponding\
-    to (1) phoneme noise, (2) noise of from adult to child utterances,\
-    (3) noise of from child to adult utterances and (4) noise of
+    to (1) phoneme noise, (2) speaker noise and (3) noise of the order of the phonemes.
     These combinantions are sparse because we only noise one value at time.
     These combinantions are sparse because we only noise one value at time.
-    For example, an item can be (0.0, 0.0, 0.0, 0.25), which means that we only
+    For example, an item can be (0.0, 0.0, 0.25), which means that we only
     noise 25 percent of the phonemes, and nothing else is affected.
     noise 25 percent of the phonemes, and nothing else is affected.
     See the file make_noiser.py for more information.
     See the file make_noiser.py for more information.
     """
     """
@@ -104,10 +103,7 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
                 "age", "perplexity", "entropy", "phonemes_order_noise",\
                 "age", "perplexity", "entropy", "phonemes_order_noise",\
                 "speakers_noise", "phonemes_noise"]
                 "speakers_noise", "phonemes_noise"]
     results = pd.DataFrame(columns=columns, index=None)
     results = pd.DataFrame(columns=columns, index=None)
-    # all_combinations = (list(product((0.0, 0.25, 0.5, 0.75), repeat=4))
-    #                       if add_noise else [((0.0, 0.0, 0.0, 0.0))])
     sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75, 1))
     sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75, 1))
-    # noise_values = np.linspace(0.0, 1.0, num=6)
     for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
     for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
         for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
         for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
             lg_iso, _ = test_filename.split(".")
             lg_iso, _ = test_filename.split(".")
@@ -128,7 +124,6 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
                     if age == "None" : print(family, lg_iso, age); continue
                     if age == "None" : print(family, lg_iso, age); continue
                     for speaker in loaded_json[family][age] :
                     for speaker in loaded_json[family][age] :
                         if speaker not in ["Adult", "Target_Child"] : continue
                         if speaker not in ["Adult", "Target_Child"] : continue
-                        # results_statistics = statistics_word(loaded_json[family][age][speaker], model)
                         language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                         language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                         ppl = model.perplexity("\n".join(loaded_json[family][age][speaker]))
                         ppl = model.perplexity("\n".join(loaded_json[family][age][speaker]))
                         entropy = log(ppl)
                         entropy = log(ppl)

+ 1 - 0
results/plot_results.png

@@ -0,0 +1 @@
+../.git/annex/objects/Vq/1V/MD5E-s1158490--07e64ccccdf09682d892e990390940c3.png/MD5E-s1158490--07e64ccccdf09682d892e990390940c3.png