import os
import random
import json
from math import log
from typing import Iterable
from itertools import product

import kenlm
import pandas as pd
from tqdm import tqdm

from make_noiser import Noise
from get_most_probable_phonemes import get_most_probable_phonemes

random.seed(1023)

# ISO 639-1 code -> (language name, morphological typology).
LANGUAGES_TYPOLOGIES = {
    'da': ("Danish", "fusional"),
    'de': ("German", "fusional"),
    'en': ("English", "fusional"),
    'es': ("Spanish", "fusional"),
    'et': ("Estonian", "agglutinative"),
    'eu': ("Basque", "agglutinative"),
    'fr': ("French", "fusional"),
    'ja': ("Japanese", "agglutinative"),
    'pl': ("Polish", "fusional"),
    'pt': ("Portuguese", "fusional"),
    'sr': ("Serbian", "fusional"),
    'tr': ("Turkish", "agglutinative"),
}


def statistics_word(utterances: list, model: kenlm.Model) -> dict:
    """Score utterances with a language model and compute corpus statistics.

    Utterances are expected to use "@" as the word separator and "$" as the
    phoneme separator (as produced upstream) — TODO confirm against the
    tokenization pipeline.

    Parameters
    ----------
    utterances : list
        The utterances to score.
    model : kenlm.Model
        The estimated language model.

    Returns
    -------
    dict
        Keys: "ppl" (perplexity), "entropy" (natural log of the perplexity),
        "mlu_w" (mean utterance length in words), "mlu_p" (mean utterance
        length in phonemes) and "ttr_w" (word type/token ratio).
        If no non-empty utterance is given, perplexity and entropy are NaN
        and the remaining statistics are 0.0 (instead of raising
        ZeroDivisionError).
    """
    phoneme_utterances = []
    unique_words = set()
    mlu_w = 0.0
    mlu_p = 0.0
    nb_utterances = 0
    nb_words = 0

    for utterance in utterances:
        utterance = utterance.strip()
        if not utterance:
            continue
        nb_utterances += 1
        # "@" separates words, "$" separates phonemes within a word.
        utterance_w = utterance.replace("@", " ").replace("$", "")
        utterance_p = utterance.replace("@", " ").replace("$", " ")
        phoneme_utterances.append(utterance_p)

        utterance_words = utterance_w.split(" ")
        mlu_w += len(utterance_words)
        mlu_p += len(utterance_p.split(" "))
        nb_words += len(utterance_words)
        unique_words |= set(utterance_words)

    # Guard against empty input: the original code divided unconditionally.
    if nb_utterances == 0 or nb_words == 0:
        return {"ppl": float("nan"), "entropy": float("nan"),
                "mlu_w": 0.0, "mlu_p": 0.0, "ttr_w": 0.0}

    mlu_w /= nb_utterances
    mlu_p /= nb_utterances
    ttr_w = len(unique_words) / nb_words

    ppl = model.perplexity("\n".join(phoneme_utterances))
    entropy = log(ppl)

    return {
        "ppl": ppl,
        "entropy": entropy,
        "mlu_w": mlu_w,
        "mlu_p": mlu_p,
        "ttr_w": ttr_w,
    }


def create_sparse_combinantions(values: Iterable, variables=3) -> set:
    """Create sparse noise combinations: one noise dimension active at a time.

    Each item in the returned set contains ``variables`` values corresponding
    to (1) phoneme noise, (2) speaker noise and (3) noise of the order of the
    phonemes. The combinations are "sparse" because only one value is noised
    at a time. For example, (0.0, 0.0, 0.25) means that only 25 percent of
    the phonemes are noised and nothing else is affected.
    See make_noiser.py for more information.

    Parameters
    ----------
    values : Iterable
        The noise levels to distribute over the dimensions.
    variables : int
        The number of noise dimensions (default 3).

    Returns
    -------
    set
        Set of ``variables``-tuples, each with at most one non-zero entry.
    """
    sparse_combinantions = []
    for value in values:
        for idx in range(variables):
            sparse_values = [0.0] * variables
            sparse_values[idx] = value
            sparse_combinantions.append(tuple(sparse_values))
    return set(sparse_combinantions)


def test(json_files_directory, models_directory, train_files, add_noise=True):
    """Test the language models on CHILDES corpora.

    For every sparse noise combination and every (test file, model) pair that
    shares the same ISO language code, score each family/age/speaker section
    of the CHILDES json with the matching KenLM model.

    Parameters
    ----------
    json_files_directory : str
        Directory with CHILDES utterances in json format, one file per
        language, named ``<iso>.<ext>``.
    models_directory : str
        Directory with trained KenLM models, named ``<iso>.<ext>``.
    train_files : str
        Directory with the phonemized train files
        (``<iso>.one_sentence_per_line``).
    add_noise : bool
        Whether to noise the CHILDES utterances before scoring.

    Returns
    -------
    pd.DataFrame
        One row per (noise combination, language, family, age, speaker).
    """
    columns = ["language", "typology", "family", "speaker",
               "age", "perplexity", "entropy", "phonemes_order_noise",
               "speakers_noise", "phonemes_noise"]
    # Accumulate rows in a list: DataFrame.append was removed in pandas 2.0
    # and was O(n^2) anyway (it copied the whole frame on every call).
    rows = []
    # The per-language phoneme statistics do not depend on the noise level,
    # so compute them once per language instead of once per combination.
    phonemes_cache = {}

    sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75, 1))
    for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(
            sparse_combinantions, total=len(sparse_combinantions)):
        for test_filename, model_filename in product(
                os.listdir(json_files_directory), os.listdir(models_directory)):
            # Match test corpus and model by their ISO language prefix.
            lg_iso = test_filename.split(".")[0]
            model_lg = model_filename.split(".")[0]
            if lg_iso != model_lg:
                continue

            if lg_iso not in phonemes_cache:
                phonemes_cache[lg_iso] = get_most_probable_phonemes(
                    f"{train_files}/{lg_iso}.one_sentence_per_line")
            most_probable_phonemes = phonemes_cache[lg_iso]

            with open(f"{json_files_directory}/{test_filename}") as json_file:
                loaded_json = json.load(json_file)

            if add_noise:
                noise = Noise(most_probable_phonemes,
                              phonemes_order_noise_value=phonemes_order_noise,
                              speakers_noise_values=(speakers_noise, speakers_noise),
                              phonemes_noise_value=phonemes_noise)
                loaded_json = noise(loaded_json)

            model = kenlm.Model(f"{models_directory}/{model_filename}")
            language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
            for family in loaded_json:
                for age in loaded_json[family]:
                    if age == "None":
                        print(family, lg_iso, age)
                        continue
                    for speaker in loaded_json[family][age]:
                        if speaker not in ["Adult", "Target_Child"]:
                            continue
                        ppl = model.perplexity(
                            "\n".join(loaded_json[family][age][speaker]))
                        entropy = log(ppl)
                        rows.append({
                            "language": language,
                            "typology": typology,
                            "family": family,
                            "speaker": speaker,
                            "age": float(age),
                            "perplexity": ppl,
                            "entropy": entropy,
                            "phonemes_order_noise": phonemes_order_noise,
                            "speakers_noise": speakers_noise,
                            "phonemes_noise": phonemes_noise,
                        })
    return pd.DataFrame(rows, columns=columns)


if __name__ == "__main__":
    from argparse import ArgumentParser, BooleanOptionalAction

    parser = ArgumentParser()
    parser.add_argument('--train_files_directory',
                        required=True,
                        help="The directory containing the train files tokenized in phonemes.")
    parser.add_argument('--model_files_directory',
                        required=True,
                        help="The directory containing the trained language models.")
    parser.add_argument('--json_files_directory',
                        required=True,
                        help="The directory containing CHILDES utterances in json format for each language")
    parser.add_argument("--add_noise",
                        help="Whether noise the CHILDES utterances or not",
                        action=BooleanOptionalAction)

    args = parser.parse_args()
    add_noise = args.add_noise
    json_files_directory = args.json_files_directory
    phoneme_train_files = args.train_files_directory
    models_directory = args.model_files_directory

    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs("results", exist_ok=True)

    test(json_files_directory,
         models_directory,
         phoneme_train_files,
         add_noise=add_noise).to_csv("results/results.csv")