12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485 |
"""This module implements a function that
evaluates the trained language models."""
- import os
- from math import log
- import random
- import pandas as pd
- import kenlm
# Fix the seed of the global random generator for reproducibility.
random.seed(1023)

# Maps a language code to a pair: (language name, morphological typology).
LANGUAGES_TYPOLOGIES = {
    "da": ("Danish", "fusional"),
    "de": ("German", "fusional"),
    "en": ("English", "fusional"),
    "es": ("Spanish", "fusional"),
    "et": ("Estonian", "agglutinative"),
    "eu": ("Basque", "agglutinative"),
    "fr": ("French", "fusional"),
    "ja": ("Japanese", "agglutinative"),
    "pl": ("Polish", "fusional"),
    "pt": ("Portuguese", "fusional"),
    "sr": ("Serbian", "fusional"),
    "tr": ("Turkish", "agglutinative"),
}
def _file_entropy(model, directory: str, filename: str) -> float:
    """Return the natural log of the perplexity `model` gives to a file's text."""
    # The context manager closes the file handle; explicit UTF-8 keeps the
    # decoding independent of the platform's locale default.
    with open(os.path.join(directory, filename), encoding="utf-8") as handle:
        # Strip each line and re-join with newlines, so the model scores the
        # whole file as a single string.
        text = "\n".join(line.strip() for line in handle)
    return log(model.perplexity(text))


def evaluate(train_files_directory: str,
             dev_files_directory: str,
             models_directory: str) -> pd.DataFrame:
    """
    Compute the entropies of the train and dev files for all languages.

    The three directories are listed, sorted by filename and walked in
    lockstep, so the i-th training file, i-th dev file and i-th model must
    belong to the same language. If the directories hold different numbers
    of entries, the extra entries are ignored (``zip`` stops at the shortest).

    Parameters
    ----------
    - train_files_directory: str
        The path to the directory containing training files. The part of
        each filename before the first "." is used as the language code.
    - dev_files_directory: str
        The path to the directory containing testing/development files.
    - models_directory: str
        The path to the directory containing the trained language models.

    Returns
    -------
    pd.DataFrame
        One row per language with the columns "language", "train_entropy"
        and "dev_entropy".

    Raises
    ------
    KeyError
        If a language code is not a key of LANGUAGES_TYPOLOGIES.
    """
    triplets_files_model = zip(sorted(os.listdir(train_files_directory)),
                               sorted(os.listdir(dev_files_directory)),
                               sorted(os.listdir(models_directory)))
    columns = ["language", "train_entropy", "dev_entropy"]

    # Rows are collected in a list and turned into a DataFrame once:
    # DataFrame.append was removed in pandas 2.0 and copied the whole frame
    # on every call.
    rows = []
    for train_filename, dev_filename, model_filename in triplets_files_model:
        # Take everything before the first dot, so names with several
        # extensions (e.g. "en.train.txt") do not break the unpacking.
        language = train_filename.split(".", 1)[0]
        model = kenlm.Model(os.path.join(models_directory, model_filename))
        train_entropy = _file_entropy(model, train_files_directory, train_filename)
        dev_entropy = _file_entropy(model, dev_files_directory, dev_filename)
        rows.append({
            "language": LANGUAGES_TYPOLOGIES[language][0],
            "train_entropy": train_entropy,
            "dev_entropy": dev_entropy,
        })
    return pd.DataFrame(rows, columns=columns)
# Command-line entry point: evaluate every model and write the table to
# results/evaluation.csv (relative to the current working directory).
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--train_files_directory',
                        required=True,
                        help="The directory containing the OpenSubtitles training files"
                        )
    parser.add_argument('--dev_files_directory',
                        required=True,
                        help="The directory containing the OpenSubtitles test files"
                        )
    parser.add_argument('--models_directory',
                        required=True,
                        help="The directory containing the trained language models"
                        )
    args = parser.parse_args()
    train_files = args.train_files_directory
    dev_files = args.dev_files_directory
    models_directory = args.models_directory

    # exist_ok=True replaces the separate existence check, which could race
    # with another process creating the directory in between.
    os.makedirs("results", exist_ok=True)
    evaluate(train_files,
             dev_files,
             models_directory).to_csv("results/evaluation.csv")
|