evaluate_language_models.py

  1. """This module implements a function that\
  2. evaluate the trained language moedels"""
import os
import random
from math import log

import pandas as pd

import kenlm

random.seed(1023)

LANGUAGES_TYPOLOGIES = {
    'da': ("Danish", "fusional"),
    'de': ("German", "fusional"),
    'en': ("English", "fusional"),
    'es': ("Spanish", "fusional"),
    'et': ("Estonian", "agglutinative"),
    'eu': ("Basque", "agglutinative"),
    'fr': ("French", "fusional"),
    'ja': ("Japanese", "agglutinative"),
    'pl': ("Polish", "fusional"),
    'pt': ("Portuguese", "fusional"),
    'sr': ("Serbian", "fusional"),
    'tr': ("Turkish", "agglutinative"),
}
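
# evaluate() pairs the files in the three directories by sorted order and takes
# the language code from the part of the training filename before the dot, so
# each directory is assumed to hold one file per language whose stem is an ISO
# code from the table above (e.g. "en.train", "en.dev", "en.arpa"; these
# extensions are illustrative, the code only requires a single dot in the
# training filename).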

def evaluate(train_files_directory: str,
             dev_files_directory: str,
             models_directory: str) -> pd.DataFrame:
    """
    Compute the entropies of the training and development files
    for all languages.

    Parameters
    ----------
    - train_files_directory: str
        The path to the directory containing the training files.
    - dev_files_directory: str
        The path to the directory containing the testing/development files.
    - models_directory: str
        The path to the directory containing the trained language models.

    Returns
    -------
    - pd.DataFrame
        One row per language with its training and development entropies.
    """
    triplets_files_model = zip(sorted(os.listdir(train_files_directory)),
                               sorted(os.listdir(dev_files_directory)),
                               sorted(os.listdir(models_directory)))
    columns = ["language", "train_entropy", "dev_entropy"]
    rows = []
    for train_filename, dev_filename, model_filename in triplets_files_model:
        # The language code is the filename stem, e.g. "en" from "en.train".
        language, _ = train_filename.split(".")
        model = kenlm.Model(f"{models_directory}/{model_filename}")
        with open(f"{train_files_directory}/{train_filename}") as train_file:
            train_sents = "\n".join(sent.strip() for sent in train_file)
        # Entropy is taken as the natural log of the model's perplexity.
        train_entropy = log(model.perplexity(train_sents))
        with open(f"{dev_files_directory}/{dev_filename}") as dev_file:
            dev_sents = "\n".join(sent.strip() for sent in dev_file)
        dev_entropy = log(model.perplexity(dev_sents))
        rows.append({
            "language": LANGUAGES_TYPOLOGIES[language][0],
            "train_entropy": train_entropy,
            "dev_entropy": dev_entropy,
        })
    return pd.DataFrame(rows, columns=columns)
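
# For illustration, a call such as
#     evaluate("data/train", "data/dev", "models")
# (directory names here are hypothetical) returns a DataFrame with one row
# per language:
#     language  train_entropy  dev_entropy
#     Danish    <float>        <float>
#     German    <float>        <float>
#     ...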

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--train_files_directory',
                        required=True,
                        help="The directory containing the OpenSubtitles training files")
    parser.add_argument('--dev_files_directory',
                        required=True,
                        help="The directory containing the OpenSubtitles test files")
    parser.add_argument('--models_directory',
                        required=True,
                        help="The directory containing the trained language models")
    args = parser.parse_args()

    train_files = args.train_files_directory
    dev_files = args.dev_files_directory
    models_directory = args.models_directory

    if not os.path.exists("results"):
        os.makedirs("results")
    evaluate(train_files,
             dev_files,
             models_directory).to_csv("results/evaluation.csv")
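
# Example invocation (the directory paths are illustrative; point them at
# wherever the OpenSubtitles splits and KenLM models actually live):
#     python evaluate_language_models.py \
#         --train_files_directory data/train \
#         --dev_files_directory data/dev \
#         --models_directory models
# The per-language entropies are written to results/evaluation.csv.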