- import os
- import random
- import json
- from math import log
- from typing import Iterable
- from itertools import product
- from tqdm import tqdm
- import kenlm
- from make_noiser import Noise
- import pandas as pd
- from get_most_probable_phonemes import get_most_probable_phonemes
- random.seed(1023)
# ISO 639-1 code -> (language name, morphological typology).
LANGUAGES_TYPOLOGIES = {
    "da": ("Danish", "fusional"),
    "de": ("German", "fusional"),
    "en": ("English", "fusional"),
    "es": ("Spanish", "fusional"),
    "et": ("Estonian", "agglutinative"),
    "eu": ("Basque", "agglutinative"),
    "fr": ("French", "fusional"),
    "ja": ("Japanese", "agglutinative"),
    "pl": ("Polish", "fusional"),
    "pt": ("Portuguese", "fusional"),
    "sr": ("Serbian", "fusional"),
    "tr": ("Turkish", "agglutinative"),
}
def statistics_word(utterances: list, model) -> dict:
    """
    Test a given language model on a list of utterances and compute
    summary statistics (MLU, TTR, perplexity, entropy).

    The utterances are expected to use "@" as a word separator and "$"
    as a marker that is dropped for the word view but kept as a
    separator for the phoneme view (assumption inferred from the
    replacements below -- confirm against the tokenizer).

    Parameters
    ----------
    - utterances: list
        The utterances to test.
    - model
        The estimated language model (a kenlm.Model); only its
        ``perplexity`` method is used.

    Returns
    -------
    dict with keys:
        - "ppl": perplexity of the model on the phonemized utterances
        - "entropy": natural log of the perplexity
        - "mlu_w": mean length of utterance, in words
        - "mlu_p": mean length of utterance, in phoneme tokens
        - "ttr_w": type/token ratio over words

    Raises
    ------
    ValueError
        If no non-empty utterance is given (the original code crashed
        with ZeroDivisionError in that case).
    """
    phoneme_utterances = []
    unique_words = set()
    mlu_w = 0.0
    mlu_p = 0.0
    nb_utterances = 0
    nb_words = 0

    for utterance in utterances:
        utterance = utterance.strip()
        if not utterance:
            continue
        nb_utterances += 1
        # Word view: "@" becomes a space, "$" disappears.
        utterance_w = utterance.replace("@", " ").replace("$", "")
        # Phoneme view: both "@" and "$" become spaces.
        utterance_p = utterance.replace("@", " ").replace("$", " ")
        phoneme_utterances.append(utterance_p)

        utterance_words = utterance_w.split(" ")
        mlu_w += len(utterance_words)
        mlu_p += len(utterance_p.split(" "))
        nb_words += len(utterance_words)
        unique_words |= set(utterance_words)

    if nb_utterances == 0:
        # Guard the divisions below: previously this raised an opaque
        # ZeroDivisionError when every utterance was empty or blank.
        raise ValueError("statistics_word: no non-empty utterances to evaluate")

    mlu_w /= nb_utterances
    mlu_p /= nb_utterances
    ttr_w = len(unique_words) / nb_words

    ppl = model.perplexity("\n".join(phoneme_utterances))
    return {
        "ppl": ppl,
        "entropy": log(ppl),
        "mlu_w": mlu_w,
        "mlu_p": mlu_p,
        "ttr_w": ttr_w,
    }
def create_sparse_combinantions(values: Iterable, variables: int = 3) -> set:
    """
    Create sparse combinations of noise levels.

    Each item in the returned set is a tuple of ``variables`` values
    (three by default, corresponding to (1) phoneme noise, (2) speaker
    noise and (3) noise of the order of the phonemes).  The
    combinations are sparse because at most one position is non-zero at
    a time.  For example, the item (0.0, 0.0, 0.25) means that we only
    noise 25 percent of the phonemes, and nothing else is affected.
    See the file make_noiser.py for more information.

    Parameters
    ----------
    - values: Iterable
        The noise levels to distribute over the positions.
    - variables: int
        The number of noise dimensions per tuple.

    Returns
    -------
    set of tuples.  Note that the value 0.0 collapses to the single
    all-zero tuple regardless of which position it is placed in.
    """
    return {
        tuple(value if pos == idx else 0.0 for pos in range(variables))
        for value in values
        for idx in range(variables)
    }
def test(json_files_directory, models_directory, train_files, add_noise=True):
    """
    Test the language models on CHILDES corpora.

    Parameters
    ----------
    - json_files_directory
        Directory containing CHILDES utterances in JSON format, one
        file per language, named ``<iso>.<extension>``.
    - models_directory
        Directory containing the trained language models, named
        ``<iso>.<extension>``.
    - train_files
        Directory containing the train files tokenized in phonemes,
        used to estimate the most probable phonemes for noising.
    - add_noise: bool
        Whether to noise the CHILDES utterances before scoring.

    Returns
    -------
    pd.DataFrame with one row per (noise combination, family, age,
    speaker) scored by the matching language model.
    """
    columns = ["language", "typology", "family", "speaker",
               "age", "perplexity", "entropy", "phonemes_order_noise",
               "speakers_noise", "phonemes_noise"]
    # Collect plain dicts and build the DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway.
    rows = []
    sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75, 1))
    for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(
            sparse_combinantions, total=len(sparse_combinantions)):
        for test_filename, model_filename in product(os.listdir(json_files_directory),
                                                     os.listdir(models_directory)):
            # Pair each test corpus with the model of the same language.
            lg_iso, _ = test_filename.split(".")
            model_lg = model_filename.split(".")[0]
            if lg_iso != model_lg:
                continue
            most_probable_phonemes = get_most_probable_phonemes(
                f"{train_files}/{lg_iso}.one_sentence_per_line")
            # Close the handle deterministically; the original
            # json.load(open(...)) leaked it.
            with open(f"{json_files_directory}/{test_filename}") as json_file:
                loaded_json = json.load(json_file)
            if add_noise:
                noise = Noise(most_probable_phonemes,
                              phonemes_order_noise_value=phonemes_order_noise,
                              speakers_noise_values=(speakers_noise, speakers_noise),
                              phonemes_noise_value=phonemes_noise)
                loaded_json = noise(loaded_json)
            model = kenlm.Model(f"{models_directory}/{model_filename}")
            for family in loaded_json:
                for age in loaded_json[family]:
                    if age == "None":
                        # Unknown age: report and skip.
                        print(family, lg_iso, age)
                        continue
                    for speaker in loaded_json[family][age]:
                        if speaker not in ["Adult", "Target_Child"]:
                            continue
                        language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                        ppl = model.perplexity("\n".join(loaded_json[family][age][speaker]))
                        rows.append({"language": language,
                                     "typology": typology,
                                     "family": family,
                                     "speaker": speaker,
                                     "age": float(age),
                                     "perplexity": ppl,
                                     "entropy": log(ppl),
                                     "phonemes_order_noise": phonemes_order_noise,
                                     "speakers_noise": speakers_noise,
                                     "phonemes_noise": phonemes_noise})
    return pd.DataFrame(rows, columns=columns)
if __name__ == "__main__":
    from argparse import ArgumentParser, BooleanOptionalAction

    parser = ArgumentParser()
    parser.add_argument('--train_files_directory',
                        required=True,
                        help="The directory containing the train files tokenized in phonemes."
                        )
    parser.add_argument('--model_files_directory',
                        required=True,
                        help="The directory containing the trained language models."
                        )
    parser.add_argument('--json_files_directory',
                        required=True,
                        help="The directory containing CHILDES utterances in json format for each language"
                        )
    parser.add_argument("--add_noise",
                        help="Whether noise the CHILDES utterances or not",
                        action=BooleanOptionalAction)
    args = parser.parse_args()

    # exist_ok avoids the racy check-then-create of the original
    # (os.path.exists followed by os.makedirs).
    os.makedirs("results", exist_ok=True)

    test(args.json_files_directory,
         args.model_files_directory,
         args.train_files_directory,
         add_noise=args.add_noise).to_csv("results/results.csv")