123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960 |
"""Convert ``.one_utterance_per_line`` files to JSON.

This module implements a single function that transforms
``.one_utterance_per_line`` files into JSON files. Such a file contains
one utterance per line, together with the family the utterance belongs
to, the role of the speaker who produced it, and the age of the child at
the time the utterance was produced.
"""
- import os
- import json
- from collections import defaultdict
- from tqdm import tqdm
- import random
- random.seed(1023)
-
def one_utterance_per_line_to_json(directory: str, out_dirname: str) -> None:
    """
    Convert every ``.one_utterance_per_line`` file of a directory to JSON.

    Each non-blank input line is expected to have the form
    ``family,participant,age,utterance``. For every input file named
    ``<language>.one_utterance_per_line``, a file ``<language>.json`` is
    written to ``out_dirname``. It holds a nested mapping
    ``family -> age -> participant -> [utterances]`` in which the
    participants "Father" and "Mother" are merged under "Adult" and each
    utterance is stored only once per (family, age, participant), in
    order of first appearance.

    Parameters
    ----------
    - directory : str
        The path to the directory where the one_utterance_per_line
        files are stored. Files without that extension are ignored.
    - out_dirname : str
        The path where the output json files will be stored. The
        directory is not created by this function.

    Raises
    ------
    ValueError
        If a non-blank line has fewer than four comma-separated fields.
    """
    suffix = ".one_utterance_per_line"
    for language_filename in tqdm(os.listdir(directory)):
        if not language_filename.endswith(suffix):
            continue
        # Strip only the known suffix, so that a language name that itself
        # contains a dot does not break the conversion.
        language = language_filename[:-len(suffix)]
        in_path = os.path.join(directory, language_filename)

        by_family = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        # Utterances already stored for each (family, age, participant).
        # The set gives a constant-time duplicate check while the lists in
        # ``by_family`` keep the order of first appearance.
        seen = defaultdict(set)

        with open(in_path, encoding="utf-8") as in_file:
            for line_number, raw_line in enumerate(in_file, start=1):
                line = raw_line.strip()
                if not line:
                    continue
                # Only the first three commas separate fields; any further
                # comma belongs to the utterance itself.
                fields = line.split(",", 3)
                if len(fields) != 4:
                    raise ValueError(
                        f"{in_path}, line {line_number}: expected 4 "
                        f"comma-separated fields "
                        f"(family,participant,age,utterance), "
                        f"got {len(fields)}"
                    )
                family, participant, age, utterance = fields
                if participant in ("Father", "Mother"):
                    participant = "Adult"
                key = (family, age, participant)
                if utterance in seen[key]:
                    continue
                seen[key].add(utterance)
                by_family[family][age][participant].append(utterance)

        out_path = os.path.join(out_dirname, f"{language}.json")
        with open(out_path, "w", encoding="utf-8") as out_file:
            json.dump(by_family, out_file)
if __name__ == "__main__":
    from argparse import ArgumentParser

    # Command-line entry point: convert every one_utterance_per_line file
    # found in --files_directory and write the JSON results to --out_dirname.
    parser = ArgumentParser()
    parser.add_argument(
        "--files_directory",
        help="Directory containing one_utterance_per_line files.",
        required=True,
    )
    parser.add_argument(
        "--out_dirname",
        help="The directory where outputs will be stored.",
        required=True,
    )
    args = parser.parse_args()

    # exist_ok=True creates the directory only when it is missing, without
    # the race of a separate existence check followed by a creation.
    os.makedirs(args.out_dirname, exist_ok=True)
    one_utterance_per_line_to_json(args.files_directory, args.out_dirname)
|