"""Evaluate trained KenLM language models on training and development files."""

import os
from math import log
import random

import pandas as pd
import kenlm

random.seed(1023)

# Maps the language code that prefixes each corpus filename to a
# (language name, morphological typology) pair.
LANGUAGES_TYPOLOGIES = {
    'da': ("Danish", "fusional"),
    'de': ("German", "fusional"),
    'en': ("English", "fusional"),
    'es': ("Spanish", "fusional"),
    'et': ("Estonian", "agglutinative"),
    'eu': ("Basque", "agglutinative"),
    'fr': ("French", "fusional"),
    'ja': ("Japanese", "agglutinative"),
    'pl': ("Polish", "fusional"),
    'pt': ("Portuguese", "fusional"),
    'sr': ("Serbian", "fusional"),
    'tr': ("Turkish", "agglutinative"),
}


def _corpus_entropy(model: "kenlm.Model", corpus_path: str) -> float:
    """Return the natural log of the perplexity ``model`` gives to a file.

    Every line of the file is stripped, the lines are re-joined with
    newlines, and the resulting string is scored with a single
    ``model.perplexity`` call.
    """
    # Decode explicitly as UTF-8 rather than with the platform default so
    # that non-ASCII corpora (Japanese, Serbian, ...) load identically on
    # every machine. Assumes the corpora are UTF-8 encoded.
    # The ``with`` block also guarantees the file handle is closed.
    with open(corpus_path, encoding="utf-8") as corpus:
        text = "\n".join(line.strip() for line in corpus)
    # NOTE(review): the whole file is scored as one string; confirm that
    # corpus-level (rather than per-sentence) scoring is what is intended.
    return log(model.perplexity(text))


def evaluate(train_files_directory: str,
             dev_files_directory: str,
             models_directory: str) -> pd.DataFrame:
    """Compute train and dev entropies of the language model of each language.

    The three directories are listed and sorted, and the i-th training file,
    the i-th development file and the i-th model are evaluated together.
    The language code is the part of the training filename that precedes
    the first dot.

    Parameters
    ----------
    - train_files_directory: str
        The path to the directory containing training files.

    - dev_files_directory: str
        The path to the directory containing testing/development files.

    - models_directory: str
        The path to the directory containing the trained language models.

    Returns
    -------
    pd.DataFrame
        One row per language with the columns ``language``,
        ``train_entropy`` and ``dev_entropy``. Each entropy is the natural
        logarithm of the perplexity reported by the model.

    Raises
    ------
    ValueError
        If the three directories do not contain the same number of entries,
        since files could then no longer be paired reliably.
    KeyError
        If a language code is not a key of ``LANGUAGES_TYPOLOGIES``.
    """
    train_filenames = sorted(os.listdir(train_files_directory))
    dev_filenames = sorted(os.listdir(dev_files_directory))
    model_filenames = sorted(os.listdir(models_directory))

    # zip() would silently truncate and pair files of different languages
    # if the listings differed in length, so fail loudly instead.
    if not len(train_filenames) == len(dev_filenames) == len(model_filenames):
        raise ValueError(
            "Expected the same number of training files, development files "
            f"and models, got {len(train_filenames)}, {len(dev_filenames)} "
            f"and {len(model_filenames)}."
        )

    columns = ["language", "train_entropy", "dev_entropy"]
    rows = []
    for train_filename, dev_filename, model_filename in zip(
            train_filenames, dev_filenames, model_filenames):
        # Keep only the text before the first dot, so names with several
        # dots (e.g. "en.train.txt") are accepted as well as "en.txt".
        language = train_filename.partition(".")[0]
        model = kenlm.Model(os.path.join(models_directory, model_filename))
        rows.append({
            "language": LANGUAGES_TYPOLOGIES[language][0],
            "train_entropy": _corpus_entropy(
                model, os.path.join(train_files_directory, train_filename)),
            "dev_entropy": _corpus_entropy(
                model, os.path.join(dev_files_directory, dev_filename)),
        })

    # Build the frame once: DataFrame.append was removed in pandas 2.0 and
    # copied the whole frame on every call.
    return pd.DataFrame(rows, columns=columns)


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--train_files_directory',
                        required=True,
                        help="The directory containing the OpenSubtitles training files"
                        )
    parser.add_argument('--dev_files_directory',
                        required=True,
                        help="The directory containing the OpenSubtitles test files"
                        )
    parser.add_argument('--models_directory',
                        required=True,
                        help="The directory containing the trained language models"
                        )
    args = parser.parse_args()
    train_files = args.train_files_directory
    dev_files = args.dev_files_directory
    models_directory = args.models_directory

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs("results", exist_ok=True)
    evaluate(train_files, dev_files, models_directory).to_csv("results/evaluation.csv")