- import os
- import random
- import json
- from math import log
- from typing import Iterable
- from itertools import product
- from tqdm import tqdm
- import kenlm
- from make_noiser import Noise
- import pandas as pd
- from get_most_probable_phonemes import get_most_probable_phonemes
- random.seed(1023)
# ISO 639-1 code -> (language name, morphological typology).
LANGUAGES_TYPOLOGIES = {
    "da": ("Danish", "fusional"),
    "de": ("German", "fusional"),
    "en": ("English", "fusional"),
    "es": ("Spanish", "fusional"),
    "et": ("Estonian", "agglutinative"),
    "eu": ("Basque", "agglutinative"),
    "fr": ("French", "fusional"),
    "ja": ("Japanese", "agglutinative"),
    "pl": ("Polish", "fusional"),
    "pt": ("Portuguese", "fusional"),
    "sr": ("Serbian", "fusional"),
    "tr": ("Turkish", "agglutinative"),
}
def statistics_word(utterances: list, model) -> dict:
    """
    Test a given language model on a list of utterances and compute
    summary statistics (MLU, TTR, perplexity, entropy).

    The utterances are expected to use "@" as a word separator and "$"
    as a marker that is dropped for the word view but kept as a
    separator for the phoneme view (assumption inferred from the
    replacements below -- confirm against the tokenizer).

    Parameters
    ----------
    - utterances: list
        The utterances to test.
    - model
        The estimated language model (a kenlm.Model); only its
        ``perplexity`` method is used.

    Returns
    -------
    dict with keys:
        - "ppl": perplexity of the model on the phonemized utterances
        - "entropy": natural log of the perplexity
        - "mlu_w": mean length of utterance, in words
        - "mlu_p": mean length of utterance, in phoneme tokens
        - "ttr_w": type/token ratio over words

    Raises
    ------
    ValueError
        If no non-empty utterance is given (the original code crashed
        with ZeroDivisionError in that case).
    """
    phoneme_utterances = []
    unique_words = set()
    mlu_w = 0.0
    mlu_p = 0.0
    nb_utterances = 0
    nb_words = 0

    for utterance in utterances:
        utterance = utterance.strip()
        if not utterance:
            continue
        nb_utterances += 1
        # Word view: "@" becomes a space, "$" disappears.
        utterance_w = utterance.replace("@", " ").replace("$", "")
        # Phoneme view: both "@" and "$" become spaces.
        utterance_p = utterance.replace("@", " ").replace("$", " ")
        phoneme_utterances.append(utterance_p)

        utterance_words = utterance_w.split(" ")
        mlu_w += len(utterance_words)
        mlu_p += len(utterance_p.split(" "))
        nb_words += len(utterance_words)
        unique_words |= set(utterance_words)

    if nb_utterances == 0:
        # Guard the divisions below: previously this raised an opaque
        # ZeroDivisionError when every utterance was empty or blank.
        raise ValueError("statistics_word: no non-empty utterances to evaluate")

    mlu_w /= nb_utterances
    mlu_p /= nb_utterances
    ttr_w = len(unique_words) / nb_words

    ppl = model.perplexity("\n".join(phoneme_utterances))
    return {
        "ppl": ppl,
        "entropy": log(ppl),
        "mlu_w": mlu_w,
        "mlu_p": mlu_p,
        "ttr_w": ttr_w,
    }
def create_sparse_combinantions(values: Iterable, variables: int = 3) -> set:
    """
    Create sparse combinations of noise levels.

    Each item in the returned set is a tuple of ``variables`` values
    (three by default, corresponding to (1) phoneme noise, (2) speaker
    noise and (3) noise of the order of the phonemes).  The
    combinations are sparse because at most one position is non-zero at
    a time.  For example, the item (0.0, 0.0, 0.25) means that we only
    noise 25 percent of the phonemes, and nothing else is affected.
    See the file make_noiser.py for more information.

    Parameters
    ----------
    - values: Iterable
        The noise levels to distribute over the positions.
    - variables: int
        The number of noise dimensions per tuple.

    Returns
    -------
    set of tuples.  Note that the value 0.0 collapses to the single
    all-zero tuple regardless of which position it is placed in.
    """
    return {
        tuple(value if pos == idx else 0.0 for pos in range(variables))
        for value in values
        for idx in range(variables)
    }
def test(json_files_directory, models_directory, train_files, add_noise=True):
    """
    Test the language models on CHILDES corpora.

    Parameters
    ----------
    - json_files_directory
        Directory containing CHILDES utterances in JSON format, one
        file per language, named ``<iso>.<extension>``.
    - models_directory
        Directory containing the trained language models, named
        ``<iso>.<extension>``.
    - train_files
        Directory containing the train files tokenized in phonemes,
        used to estimate the most probable phonemes for noising.
    - add_noise: bool
        Whether to noise the CHILDES utterances before scoring.

    Returns
    -------
    pd.DataFrame with one row per (noise combination, family, age,
    speaker) scored by the matching language model.
    """
    columns = ["language", "typology", "family", "speaker",
               "age", "perplexity", "entropy", "phonemes_order_noise",
               "speakers_noise", "phonemes_noise"]
    # Collect plain dicts and build the DataFrame once at the end:
    # DataFrame.append was removed in pandas 2.0 and was O(n^2) anyway.
    rows = []
    sparse_combinantions = create_sparse_combinantions((0.0, 0.25, 0.5, 0.75, 1))
    for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(
            sparse_combinantions, total=len(sparse_combinantions)):
        for test_filename, model_filename in product(os.listdir(json_files_directory),
                                                     os.listdir(models_directory)):
            # Pair each test corpus with the model of the same language.
            lg_iso, _ = test_filename.split(".")
            model_lg = model_filename.split(".")[0]
            if lg_iso != model_lg:
                continue
            most_probable_phonemes = get_most_probable_phonemes(
                f"{train_files}/{lg_iso}.one_sentence_per_line")
            # Close the handle deterministically; the original
            # json.load(open(...)) leaked it.
            with open(f"{json_files_directory}/{test_filename}") as json_file:
                loaded_json = json.load(json_file)
            if add_noise:
                noise = Noise(most_probable_phonemes,
                              phonemes_order_noise_value=phonemes_order_noise,
                              speakers_noise_values=(speakers_noise, speakers_noise),
                              phonemes_noise_value=phonemes_noise)
                loaded_json = noise(loaded_json)
            model = kenlm.Model(f"{models_directory}/{model_filename}")
            for family in loaded_json:
                for age in loaded_json[family]:
                    if age == "None":
                        # Unknown age: report and skip.
                        print(family, lg_iso, age)
                        continue
                    for speaker in loaded_json[family][age]:
                        if speaker not in ["Adult", "Target_Child"]:
                            continue
                        language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                        ppl = model.perplexity("\n".join(loaded_json[family][age][speaker]))
                        rows.append({"language": language,
                                     "typology": typology,
                                     "family": family,
                                     "speaker": speaker,
                                     "age": float(age),
                                     "perplexity": ppl,
                                     "entropy": log(ppl),
                                     "phonemes_order_noise": phonemes_order_noise,
                                     "speakers_noise": speakers_noise,
                                     "phonemes_noise": phonemes_noise})
    return pd.DataFrame(rows, columns=columns)
if __name__ == "__main__":
    from argparse import ArgumentParser, BooleanOptionalAction

    parser = ArgumentParser()
    parser.add_argument('--train_files_directory',
                        required=True,
                        help="The directory containing the train files tokenized in phonemes."
                        )
    parser.add_argument('--model_files_directory',
                        required=True,
                        help="The directory containing the trained language models."
                        )
    parser.add_argument('--json_files_directory',
                        required=True,
                        help="The directory containing CHILDES utterances in json format for each language"
                        )
    parser.add_argument("--add_noise",
                        help="Whether noise the CHILDES utterances or not",
                        action=BooleanOptionalAction)
    args = parser.parse_args()

    # exist_ok avoids the racy check-then-create of the original
    # (os.path.exists followed by os.makedirs).
    os.makedirs("results", exist_ok=True)

    test(args.json_files_directory,
         args.model_files_directory,
         args.train_files_directory,
         add_noise=args.add_noise).to_csv("results/results.csv")