|
@@ -77,7 +77,7 @@ def statistics_word(utterances: list, model: kenlm.Model) -> dict:
|
|
|
|
|
|
return statistics
|
|
|
|
|
|
-def create_sparse_combinantions(values: Iterable) -> set:
|
|
|
+def create_sparse_combinantions(values: Iterable, variables=3) -> set:
|
|
|
"""
|
|
|
This function will create combinantions for noising.
|
|
|
Each item in the returned set contains four values corresponding\
|
|
@@ -90,8 +90,8 @@ def create_sparse_combinantions(values: Iterable) -> set:
|
|
|
"""
|
|
|
sparse_combinantions = []
|
|
|
for value in values :
|
|
|
- for idx in range(len(values)) :
|
|
|
- sparse_values = [0.0] * len(values)
|
|
|
+ for idx in range(variables) :
|
|
|
+ sparse_values = [0.0] * variables
|
|
|
sparse_values[idx] = value
|
|
|
sparse_combinantions.append(tuple(sparse_values))
|
|
|
return set(sparse_combinantions)
|
|
@@ -101,15 +101,14 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
|
|
|
This function will test the language models on CHILDES corpora
|
|
|
"""
|
|
|
columns = ["language", "typology", "family", "speaker",\
|
|
|
- "age", "perplexity", "entropy", "mlu", "mlu_without_repetition",\
|
|
|
- "phonemes_order_noise", "speakers_noise_adult",\
|
|
|
- "speakers_noise_child", "phonemes_noise"]
|
|
|
+ "age", "perplexity", "entropy", "phonemes_order_noise",\
|
|
|
+ "speakers_noise", "phonemes_noise"]
|
|
|
results = pd.DataFrame(columns=columns, index=None)
|
|
|
# all_combinations = (list(product((0.0, 0.25, 0.5, 0.75), repeat=4))
|
|
|
# if add_noise else [((0.0, 0.0, 0.0, 0.0))])
|
|
|
- sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75))
|
|
|
+ sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75, 1.0))
|
|
|
# noise_values = np.linspace(0.0, 1.0, num=6)
|
|
|
- for phonemes_noise, speakers_noise_child, speakers_noise_adult, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
|
|
|
+ for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(sparse_combinantions, total=len(sparse_combinantions)) :
|
|
|
for test_filename, model_filename in product(os.listdir(json_files_directory), os.listdir(models_directory)) :
|
|
|
lg_iso, _ = test_filename.split(".")
|
|
|
model_lg = model_filename.split(".")[0]
|
|
@@ -120,7 +119,7 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
|
|
|
if add_noise :
|
|
|
noise = Noise(most_probable_phonemes,
|
|
|
phonemes_order_noise_value=phonemes_order_noise,
|
|
|
- speakers_noise_values=(speakers_noise_child, speakers_noise_adult),
|
|
|
+ speakers_noise_values=(speakers_noise, speakers_noise),
|
|
|
phonemes_noise_value=phonemes_noise)
|
|
|
loaded_json = noise(loaded_json)
|
|
|
model = kenlm.Model(f"{models_directory}/{model_filename}")
|
|
@@ -129,21 +128,19 @@ def test(json_files_directory, models_directory, train_files, add_noise=True) :
|
|
|
if age == "None" : print(family, lg_iso, age); continue
|
|
|
for speaker in loaded_json[family][age] :
|
|
|
if speaker not in ["Adult", "Target_Child"] : continue
|
|
|
- results_statistics = statistics_word(loaded_json[family][age][speaker], model)
|
|
|
+ # results_statistics = statistics_word(loaded_json[family][age][speaker], model)
|
|
|
language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
|
|
|
+ ppl = model.perplexity("\n".join(loaded_json[family][age][speaker]))
|
|
|
+ entropy = log(ppl)
|
|
|
new_row = {"language" : language,
|
|
|
"typology" : typology,
|
|
|
"family" : family,
|
|
|
"speaker" : speaker,
|
|
|
"age" : float(age),
|
|
|
- "perplexity" : results_statistics["ppl"],
|
|
|
- "entropy" : results_statistics["entropy"],
|
|
|
- "mlu_w" : results_statistics["mlu_w"],
|
|
|
- "mlu_p" : results_statistics["mlu_p"],
|
|
|
- "ttr_w" : results_statistics["ttr_w"],
|
|
|
+ "perplexity" : ppl,
|
|
|
+ "entropy" : entropy,
|
|
|
"phonemes_order_noise" : phonemes_order_noise,
|
|
|
- "speakers_noise_adult" : speakers_noise_adult,
|
|
|
- "speakers_noise_child" : speakers_noise_child,
|
|
|
+ "speakers_noise" : speakers_noise,
|
|
|
"phonemes_noise" : phonemes_noise}
|
|
|
results = results.append(new_row, ignore_index=True)
|
|
|
return results
|
|
@@ -151,11 +148,11 @@ if __name__ == "__main__":
|
|
|
from argparse import ArgumentParser, BooleanOptionalAction
|
|
|
|
|
|
parser = ArgumentParser()
|
|
|
- parser.add_argument('--train_directory',
|
|
|
+ parser.add_argument('--train_files_directory',
|
|
|
required=True,
|
|
|
help="The directory containing the train files tokenized in phonemes."
|
|
|
)
|
|
|
- parser.add_argument('--models_directory',
|
|
|
+ parser.add_argument('--model_files_directory',
|
|
|
required=True,
|
|
|
help="The directory containing the trained language models."
|
|
|
)
|
|
@@ -172,8 +169,8 @@ if __name__ == "__main__":
|
|
|
args = parser.parse_args()
|
|
|
add_noise = args.add_noise
|
|
|
json_files_directory = args.json_files_directory
|
|
|
- phoneme_train_files = args.train_directory
|
|
|
- models_directory = args.models_directory
|
|
|
+ phoneme_train_files = args.train_files_directory
|
|
|
+ models_directory = args.model_files_directory
|
|
|
|
|
|
if not os.path.exists("results"):
|
|
|
os.makedirs("results")
|