LAAC-LSCP
/
Unsupervised_Metrics_CLD_Sy


			
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960
"""Convert ``.one_utterance_per_line`` files to json.

This module implements a single function that transforms
``.one_utterance_per_line`` files to json. A ``.one_utterance_per_line``
file contains one utterance per line, together with the role of the
speaker who produced the utterance, the speaker's family, and the age of
the child at the time the utterance was produced.
"""
import os
import json
from collections import defaultdict
from tqdm import tqdm
import random 
random.seed(1023)
    
def one_utterance_per_line_to_json(directory: str, out_dirname: str) -> None:
    """
    Transform every ``.one_utterance_per_line`` file of a directory\
    into a json file.

    Each non-blank input line must have the form
    ``family,participant,age,utterance``. The utterance is the last field
    and may itself contain commas. The participants ``Father`` and
    ``Mother`` are both stored under the key ``Adult``. Within one
    (family, age, participant) group, an utterance is kept only the first
    time it occurs; the order of first occurrence is preserved.

    For an input file ``<language>.one_utterance_per_line`` the output is
    ``<out_dirname>/<language>.json`` with the nesting
    ``{family: {age: {participant: [utterance, ...]}}}``.

    Parameters
    ----------
    - directory : str
        The path to the directory where the one_utterance_per_line\
        files are stored. Files with another extension are ignored.
    - out_dirname : str
        The path where the output json files will be stored. The
        directory must already exist.

    Raises
    ------
    - ValueError
        If a non-blank line has fewer than four comma-separated fields.
    """
    suffix = ".one_utterance_per_line"
    for language_filename in tqdm(os.listdir(directory)):
        if not language_filename.endswith(suffix):
            continue
        # Remove only the suffix, so that file names containing additional
        # dots (e.g. "en.us.one_utterance_per_line") are still handled.
        language = language_filename[:-len(suffix)]
        by_family = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))
        # Utterances already stored, keyed by their full group. A set keeps
        # the duplicate check constant-time instead of scanning the list.
        seen = set()
        in_path = os.path.join(directory, language_filename)
        # The context manager guarantees the input file is closed.
        with open(in_path, encoding="utf-8") as in_file:
            for line_number, raw_line in enumerate(in_file, start=1):
                line = raw_line.strip()
                if not line:
                    # Blank lines (e.g. a trailing newline) carry no data.
                    continue
                # Split on the first three commas only: everything after
                # them belongs to the utterance, commas included.
                fields = line.split(",", 3)
                if len(fields) != 4:
                    raise ValueError(
                        f"{in_path}:{line_number}: expected 4 comma-separated "
                        f"fields, got {len(fields)}")
                family, participant, age, utterance = fields
                if participant in ("Father", "Mother"):
                    participant = "Adult"
                key = (family, age, participant, utterance)
                if key in seen:
                    continue
                seen.add(key)
                by_family[family][age][participant].append(utterance)
        out_path = os.path.join(out_dirname, f"{language}.json")
        with open(out_path, "w", encoding="utf-8") as out_file:
            json.dump(by_family, out_file)


if __name__ == "__main__":
    # Command-line entry point: convert every one_utterance_per_line file
    # found in --files_directory and write the json files to --out_dirname.
    from argparse import ArgumentParser
    parser = ArgumentParser()

    parser.add_argument("--files_directory",
                        help="Directory containing one_utterance_per_line files.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    args = parser.parse_args()
    files_directory = args.files_directory
    out_dirname = args.out_dirname
    # exist_ok=True creates the output directory only when it is missing,
    # without the race of checking for existence first and creating after.
    os.makedirs(out_dirname, exist_ok=True)
    one_utterance_per_line_to_json(files_directory, out_dirname)