|
@@ -0,0 +1,59 @@
|
|
|
"""Convert ``.one_utterance_per_line`` files to JSON.

This module implements a single function that transforms
``.one_utterance_per_line`` files to JSON. A ``.one_utterance_per_line``
file contains one utterance per line, together with the role of the
speaker who produced the utterance, the speaker's family, and the age of
the child at the time the utterance was produced.
"""
|
|
|
+import os
|
|
|
+import json
|
|
|
+from collections import defaultdict
|
|
|
+from tqdm import tqdm
|
|
|
+import random
|
|
|
+random.seed(1023)
|
|
|
+
|
|
|
def one_utterance_per_line_to_json(directory: str, out_dirname: str) -> None:
    """
    Convert every ``.one_utterance_per_line`` file of a directory to JSON.

    Each non-blank input line is split into four comma-separated fields:
    family, participant, age and utterance. For every input file named
    ``<language>.one_utterance_per_line`` a file ``<language>.json`` is
    written to ``out_dirname``. It holds a nested mapping
    ``family -> age -> participant -> [utterances]``. The participants
    "Father" and "Mother" are both stored under "Adult", and an utterance
    repeated for the same family, age and participant is kept only once
    (first occurrence, original order).

    Parameters
    ----------
    - directory : str
        The path to the directory where the one_utterance_per_line
        files are stored. Other files in the directory are ignored.
    - out_dirname : str
        The path where the output json files will be stored. The
        directory must already exist.

    Raises
    ------
    - ValueError
        If a non-blank line holds fewer than four comma-separated fields.
    """
    suffix = ".one_utterance_per_line"
    for language_filename in tqdm(os.listdir(directory)):
        if not language_filename.endswith(suffix):
            continue
        # Strip only the known suffix, so a name with additional dots
        # (e.g. "en.us.one_utterance_per_line") does not break unpacking.
        language = language_filename[: -len(suffix)]

        by_family = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        # Set of already stored (family, age, participant, utterance) keys:
        # constant-time duplicate detection instead of scanning the list.
        seen = set()

        input_path = os.path.join(directory, language_filename)
        # The context manager guarantees the input file is closed.
        with open(input_path, encoding="utf-8") as lines:
            for line in lines:
                line = line.strip()
                if not line:
                    # A blank (e.g. trailing) line carries no record.
                    continue
                # maxsplit=3 keeps commas that belong to the utterance itself.
                family, participant, age, utterance = line.split(",", 3)
                if participant in ("Father", "Mother"):
                    participant = "Adult"
                key = (family, age, participant, utterance)
                if key in seen:
                    continue
                seen.add(key)
                by_family[family][age][participant].append(utterance)

        output_path = os.path.join(out_dirname, f"{language}.json")
        with open(output_path, "w", encoding="utf-8") as out_file:
            json.dump(by_family, out_file)
|
|
|
+
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the input and output directories from
    # the arguments, make sure the output directory exists, then convert.
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument(
        "--files_directory",
        help="Directory containing one_utterance_per_line files.",
        required=True,
    )
    parser.add_argument(
        "--out_dirname",
        help="The directory where outputs will be stored.",
        required=True,
    )
    args = parser.parse_args()
    files_directory = args.files_directory
    out_dirname = args.out_dirname

    # exist_ok=True replaces the separate existence check, which could race
    # with another process creating the directory in between.
    os.makedirs(out_dirname, exist_ok=True)
    one_utterance_per_line_to_json(files_directory, out_dirname)
|