# test_on_all_languages.py

import os
import random
import json
from math import log
from typing import Iterable
from itertools import product

from tqdm import tqdm
import kenlm
import pandas as pd

from make_noiser import Noise
from get_most_probable_phonemes import get_most_probable_phonemes

random.seed(1023)
LANGUAGES_TYPOLOGIES = {
    'da': ("Danish", "fusional"),
    'de': ("German", "fusional"),
    'en': ("English", "fusional"),
    'es': ("Spanish", "fusional"),
    'et': ("Estonian", "agglutinative"),
    'eu': ("Basque", "agglutinative"),
    'fr': ("French", "fusional"),
    'ja': ("Japanese", "agglutinative"),
    'pl': ("Polish", "fusional"),
    'pt': ("Portuguese", "fusional"),
    'sr': ("Serbian", "fusional"),
    'tr': ("Turkish", "agglutinative"),
}
def statistics_word(utterances: list, model: kenlm.Model) -> dict:
    """
    Test a given language model on a given list of utterances.

    The function also computes some statistics: MLU, TTR, etc.

    Parameters
    ----------
    - utterances: list
        The utterances to test.
    - model
        The estimated language model.
    """
    phoneme_utterances = []
    unique_words = set()
    mlu_w = 0.0
    mlu_p = 0.0
    nb_utterances = 0
    nb_words = 0

    statistics = {}
    for utterance in utterances:
        utterance = utterance.strip()
        if not utterance:
            continue
        nb_utterances += 1

        # Words are separated by "@" and phonemes within a word by "$":
        # dropping "$" yields the word string, while replacing both
        # separators with spaces yields the phoneme string.
        utterance_w = utterance.replace("@", " ").replace("$", "")
        utterance_p = utterance.replace("@", " ").replace("$", " ")
        phoneme_utterances.append(utterance_p)

        utterance_words = utterance_w.split(" ")
        mlu_w += len(utterance_words)
        mlu_p += len(utterance_p.split(" "))
        nb_words += len(utterance_words)
        unique_words |= set(utterance_words)

    mlu_w /= nb_utterances
    mlu_p /= nb_utterances
    ttr_w = len(unique_words) / nb_words

    ppl = model.perplexity("\n".join(phoneme_utterances))
    entropy = log(ppl)

    statistics["ppl"] = ppl
    statistics["entropy"] = entropy
    statistics["mlu_w"] = mlu_w
    statistics["mlu_p"] = mlu_p
    statistics["ttr_w"] = ttr_w
    return statistics
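
# Sketch of standalone usage, assuming a trained KenLM model at
# "models/en.arpa" (hypothetical path) and utterances encoded with "$"
# between phonemes and "@" between words:
#
#   model = kenlm.Model("models/en.arpa")
#   stats = statistics_word(["h$e$l$l$o@w$o$r$l$d"], model)
#   print(stats["ppl"], stats["mlu_w"], stats["ttr_w"])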
def create_sparse_combinations(values: Iterable, variables=3) -> set:
    """
    Create the combinations of noise parameters.

    Each item in the returned set contains three values corresponding to
    (1) phoneme noise, (2) speaker noise and (3) noise of the order of the
    phonemes. These combinations are sparse because we only noise one value
    at a time. For example, an item can be (0.25, 0.0, 0.0), which means
    that we only noise 25 percent of the phonemes, and nothing else is
    affected. See the file make_noiser.py for more information.
    """
    sparse_combinations = []
    for value in values:
        for idx in range(variables):
            sparse_values = [0.0] * variables
            sparse_values[idx] = value
            sparse_combinations.append(tuple(sparse_values))
    return set(sparse_combinations)
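
# Worked example (illustrative values): with values=(0.0, 0.5) and the
# default variables=3, create_sparse_combinations returns
#   {(0.0, 0.0, 0.0), (0.5, 0.0, 0.0), (0.0, 0.5, 0.0), (0.0, 0.0, 0.5)}
# i.e. at most one noise parameter is non-zero in any combination.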
def test(json_files_directory, models_directory, train_files, add_noise=True):
    """
    Test the language models on the CHILDES corpora.
    """
    columns = ["language", "typology", "family", "speaker",
               "age", "perplexity", "entropy", "phonemes_order_noise",
               "speakers_noise", "phonemes_noise"]
    rows = []
    sparse_combinations = create_sparse_combinations((0.0, 0.25, 0.5, 0.75, 1.0))
    for phonemes_noise, speakers_noise, phonemes_order_noise in tqdm(
            sparse_combinations, total=len(sparse_combinations)):
        for test_filename, model_filename in product(os.listdir(json_files_directory),
                                                     os.listdir(models_directory)):
            # Test files and model files are matched by the language ISO
            # code before the first "." of the filename.
            lg_iso = test_filename.split(".")[0]
            model_lg = model_filename.split(".")[0]
            if lg_iso != model_lg:
                continue
            most_probable_phonemes = get_most_probable_phonemes(
                f"{train_files}/{lg_iso}.one_sentence_per_line")
            with open(f"{json_files_directory}/{test_filename}") as json_file:
                loaded_json = json.load(json_file)
            if add_noise:
                noise = Noise(most_probable_phonemes,
                              phonemes_order_noise_value=phonemes_order_noise,
                              speakers_noise_values=(speakers_noise, speakers_noise),
                              phonemes_noise_value=phonemes_noise)
                loaded_json = noise(loaded_json)
            model = kenlm.Model(f"{models_directory}/{model_filename}")
            for family in loaded_json:
                for age in loaded_json[family]:
                    if age == "None":
                        print(family, lg_iso, age)
                        continue
                    for speaker in loaded_json[family][age]:
                        if speaker not in ["Adult", "Target_Child"]:
                            continue
                        language, typology = LANGUAGES_TYPOLOGIES[lg_iso]
                        ppl = model.perplexity("\n".join(loaded_json[family][age][speaker]))
                        entropy = log(ppl)
                        rows.append({"language": language,
                                     "typology": typology,
                                     "family": family,
                                     "speaker": speaker,
                                     "age": float(age),
                                     "perplexity": ppl,
                                     "entropy": entropy,
                                     "phonemes_order_noise": phonemes_order_noise,
                                     "speakers_noise": speakers_noise,
                                     "phonemes_noise": phonemes_noise})
    # DataFrame.append was removed in pandas 2.0, so the rows are collected
    # in a list and the frame is built once at the end.
    return pd.DataFrame(rows, columns=columns)
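
# Assumed directory layout, inferred from the filename matching above (the
# ".json" and ".arpa" extensions are assumptions; only the prefix before
# the first "." has to equal the ISO code), e.g. for French:
#   <json_files_directory>/fr.json                      CHILDES utterances
#   <models_directory>/fr.arpa                          trained KenLM model
#   <train_files_directory>/fr.one_sentence_per_line    phonemized train file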
if __name__ == "__main__":
    from argparse import ArgumentParser, BooleanOptionalAction

    parser = ArgumentParser()
    parser.add_argument('--train_files_directory',
                        required=True,
                        help="The directory containing the train files tokenized in phonemes.")
    parser.add_argument('--model_files_directory',
                        required=True,
                        help="The directory containing the trained language models.")
    parser.add_argument('--json_files_directory',
                        required=True,
                        help="The directory containing the CHILDES utterances in JSON format for each language.")
    parser.add_argument("--add_noise",
                        help="Whether to noise the CHILDES utterances or not.",
                        action=BooleanOptionalAction)
    args = parser.parse_args()

    add_noise = args.add_noise
    json_files_directory = args.json_files_directory
    phoneme_train_files = args.train_files_directory
    models_directory = args.model_files_directory

    os.makedirs("results", exist_ok=True)
    test(json_files_directory,
         models_directory,
         phoneme_train_files,
         add_noise=add_noise).to_csv("results/results.csv")
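
# Example invocation (paths are placeholders):
#   python test_on_all_languages.py \
#       --train_files_directory data/train \
#       --model_files_directory models \
#       --json_files_directory data/json \
#       --add_noise
# The results are written to results/results.csv.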