Browse Source

add script that converts one_utterance_per_line files to json files

yaya-sy 1 year ago
parent
commit
cfd557e007
1 changed files with 59 additions and 0 deletions
  1. 59 0
      code/one_utterance_per_line_to_json.py

+ 59 - 0
code/one_utterance_per_line_to_json.py

@@ -0,0 +1,59 @@
+"""Convert ``.one_utterance_per_line`` files to JSON.
+
+This module implements a single function. A ``.one_utterance_per_line``
+file contains one utterance per line, together with the family the
+utterance comes from, the role of the speaker who produced it and the
+age of the child at the time the utterance was produced.
+"""
+import os
+import json
+from collections import defaultdict
+from tqdm import tqdm
+import random 
+random.seed(1023)
+    
def one_utterance_per_line_to_json(directory: str, out_dirname: str) -> None:
    """
    Convert every ``.one_utterance_per_line`` file of a directory to JSON.

    Each non-blank line of an input file must hold four comma-separated
    fields, in this order: family, participant, age, utterance. Only the
    first three commas are treated as separators, so the utterance itself
    may contain commas.

    For each input file ``<language>.one_utterance_per_line`` a file
    ``<language>.json`` is written to ``out_dirname`` with the nested layout
    ``{family: {age: {participant: [utterances]}}}``. The participants
    "Father" and "Mother" are merged under "Adult", and an utterance is kept
    only once per (family, age, participant), in order of first appearance.

    Parameters
    ----------
    - directory : str
        The path to the directory where the one_utterance_per_line
        files are stored. Files with another extension are ignored.
    - out_dirname : str
        The path to an existing directory where the output json files
        will be stored (this function does not create it).

    Raises
    ------
    ValueError
        If a non-blank line has fewer than four comma-separated fields.
    """
    suffix = ".one_utterance_per_line"
    for language_filename in tqdm(os.listdir(directory)):
        if not language_filename.endswith(suffix):
            continue
        # Strip only the known suffix so that file names containing
        # additional dots still yield a valid language name.
        language = language_filename[:-len(suffix)]
        by_family = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        # Set-based duplicate detection: constant-time per line instead of
        # scanning the list of utterances already stored.
        seen = set()
        with open(os.path.join(directory, language_filename)) as in_file:
            for line in in_file:
                line = line.strip()
                if not line:
                    # Blank lines carry no record; skip them.
                    continue
                # maxsplit=3 keeps commas that belong to the utterance.
                family, participant, age, utterance = line.split(",", 3)
                if participant in ("Father", "Mother"):
                    participant = "Adult"
                key = (family, age, participant, utterance)
                if key in seen:
                    continue
                seen.add(key)
                by_family[family][age][participant].append(utterance)
        out_path = os.path.join(out_dirname, f"{language}.json")
        with open(out_path, "w") as out_file:
            json.dump(by_family, out_file)
+
+
if __name__ == "__main__":
    # Command-line entry point: convert all one_utterance_per_line files
    # found in --files_directory and write the json files to --out_dirname.
    from argparse import ArgumentParser

    parser = ArgumentParser()
    parser.add_argument("--files_directory",
                        help="Directory containing one_utterance_per_line files.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    args = parser.parse_args()

    # exist_ok=True creates the output directory when missing without the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(args.out_dirname, exist_ok=True)
    one_utterance_per_line_to_json(args.files_directory, args.out_dirname)