1 year ago · cfd557e007
--- a/code/one_utterance_per_line_to_json.py
+++ b/code/one_utterance_per_line_to_json.py
@@ -0,0 +1,59 @@
 
"""Convert ``.one_utterance_per_line`` files to JSON.

This module implements a single function. A ``.one_utterance_per_line``
file contains one utterance per line, together with the family the
speaker belongs to, the role of the speaker who produced the utterance,
and the age of the child at the time the utterance was produced.
"""
			
 
				+import os
			
 
				+import json
			
 
				+from collections import defaultdict
			
 
				+from tqdm import tqdm
			
 
				+import random 
			
 
# Seed the global random number generator with a fixed value at import time.
# NOTE(review): nothing else in the visible part of this file uses `random`;
# presumably kept for reproducibility conventions shared with other scripts —
# confirm before removing.
random.seed(1023)
			
 
				+    
			
 
				+def one_utterance_per_line_to_json(directory: str, out_dirname: str) -> None:
			
 
				+    """
			
 
				+    This function will transform dot one_utterance_per_line\
			
 
				+    files to json files.
			
 
				+
			
 
				+    Parameters
			
 
				+    ----------
			
 
				+    - directory : str
			
 
				+        The path to the directory where the one_utterance_per_line\
			
 
				+        files are stored.
			
 
				+    - out_dirname : str
			
 
				+        The path where the output json files will be stored.
			
 
				+    """
			
 
				+    for language_filename in tqdm(os.listdir(directory)) :
			
 
				+        # Dot all files contain raw childes data. One line contains the family, \
			
 
				+        # the participant who speaks, the age, and the produced utterance.
			
 
				+        if not language_filename.endswith(".one_utterance_per_line") : continue
			
 
				+        language, _ = language_filename.split(".")
			
 
				+        by_family = defaultdict(lambda : defaultdict(lambda : defaultdict(list)))
			
 
				+        for participant_age_utterance in open(f"{directory}/{language_filename}") :
			
 
				+            family, participant, age, utterance = participant_age_utterance.strip().split(",")
			
 
				+            participant = "Adult" if participant in ["Father", "Mother"] else participant
			
 
				+            if utterance in by_family[family][age][participant] : continue
			
 
				+            by_family[family][age][participant].append(utterance)
			
 
				+        with open(f"{out_dirname}/{language}.json", "w") as out_filename :
			
 
				+            json.dump(
			
 
				+                by_family,
			
 
				+                out_filename)
			
 
				+
			
 
				+
			
 
				+if __name__ == "__main__" :
			
 
				+    from argparse import ArgumentParser
			
 
				+    parser = ArgumentParser()
			
 
				+
			
 
				+    parser.add_argument("--files_directory",
			
 
				+                        help="Directory containing one_utterance_per_line files.",
			
 
				+                        required=True)
			
 
				+    parser.add_argument("--out_dirname",
			
 
				+                        help="The directory where outputs will be stored.",
			
 
				+                        required=True)
			
 
				+    args = parser.parse_args()
			
 
				+    files_directory = args.files_directory
			
 
				+    out_dirname = args.out_dirname
			
 
				+    if not os.path.exists(out_dirname):
			
 
				+        os.makedirs(out_dirname)
			
 
				+    one_utterance_per_line_to_json(files_directory, out_dirname)