one_utterance_per_line_to_json.py 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
"""Convert ``.one_utterance_per_line`` files to JSON.

This module implements a single function that transforms
``.one_utterance_per_line`` files to JSON. Such files contain one
utterance per line, together with the role of the speaker who produced
the utterance, the speaker's family, and the age of the child at the
time the utterance was produced.
"""
  7. import os
  8. import json
  9. from collections import defaultdict
  10. from tqdm import tqdm
  11. import random
# Seed the global `random` generator so any random draws are reproducible.
# NOTE(review): nothing in the visible code uses `random` -- confirm that
# the import and this seed are still needed.
random.seed(1023)
  13. def one_utterance_per_line_to_json(directory: str, out_dirname: str) -> None:
  14. """
  15. This function will transform dot one_utterance_per_line\
  16. files to json files.
  17. Parameters
  18. ----------
  19. - directory : str
  20. The path to the directory where the one_utterance_per_line\
  21. files are stored.
  22. - out_dirname : str
  23. The path where the output json files will be stored.
  24. """
  25. for language_filename in tqdm(os.listdir(directory)) :
  26. # Dot all files contain raw childes data. One line contains the family, \
  27. # the participant who speaks, the age, and the produced utterance.
  28. if not language_filename.endswith(".one_utterance_per_line") : continue
  29. language, _ = language_filename.split(".")
  30. by_family = defaultdict(lambda : defaultdict(lambda : defaultdict(list)))
  31. for participant_age_utterance in open(f"{directory}/{language_filename}") :
  32. family, participant, age, utterance = participant_age_utterance.strip().split(",")
  33. participant = "Adult" if participant in ["Father", "Mother"] else participant
  34. if utterance in by_family[family][age][participant] :
  35. continue
  36. by_family[family][age][participant].append(utterance)
  37. with open(f"{out_dirname}/{language}.json", "w") as out_filename :
  38. json.dump(
  39. by_family,
  40. out_filename)
  41. if __name__ == "__main__" :
  42. from argparse import ArgumentParser
  43. parser = ArgumentParser()
  44. parser.add_argument("--files_directory",
  45. help="Directory containing one_utterance_per_line files.",
  46. required=True)
  47. parser.add_argument("--out_dirname",
  48. help="The directory where outputs will be stored.",
  49. required=True)
  50. args = parser.parse_args()
  51. files_directory = args.files_directory
  52. out_dirname = args.out_dirname
  53. if not os.path.exists(out_dirname):
  54. os.makedirs(out_dirname)
  55. one_utterance_per_line_to_json(files_directory, out_dirname)