"""Build ``input/siblings.csv``: one row per (corpus, child) with a sibling count.

Steps performed by this script:

1. Load the recordings metadata of each corpus listed in ``corpora`` through
   ``ChildProject``.
2. Match every recording to an ``aclew_id`` using the per-corpus dictionary
   CSV listed in ``dic`` (joined on the ``.its`` file name without extension).
3. Join the result with ``input/aclew_md.csv`` on ``aclew_id``.
4. For each (corpus, child_id) keep the maximum ``number_older_sibs`` as
   ``n_siblings`` and write the table to ``input/siblings.csv``.
"""
from os.path import join as opj, basename

import pandas as pd
from ChildProject.projects import ChildProject

corpora = [
    "input/bergelson",
    "input/warlaumont",
    "input/winnipeg",
    "input/lucid",
]

# Per-corpus dictionary CSV, given relative to "<corpus>/metadata".
dic = {
    "input/bergelson": "confidential/original/bergelson_dict.csv",
    "input/lucid": "confidential/original/lucid_dict.csv",
    "input/warlaumont": "original/warlaumont_dict_matched.csv",
    "input/winnipeg": "confidential/original/winnipeg_dict_matched.csv",
}

# NOTE(review): this mapping is not used anywhere in this script; it is kept
# in case another part of the project relies on it — confirm before removing.
correspondance = {
    "BER": "input/bergelson",
    "ROW": "input/lucid",
    "SOD": "input/winnipeg",
    "WAR": "input/warlaumont",
}

projects = [ChildProject(corpus) for corpus in corpora]
for project in projects:
    project.read()

# Stack the recordings of all corpora, tagging each row with its corpus path.
recordings = pd.concat(
    [
        project.recordings.assign(corpus=corpus)
        for project, corpus in zip(projects, corpora)
    ]
)

# regex=False: replace the literal ".its" text. With the regex default of
# pandas < 2.0 the "." would match any character and strip unrelated
# "<char>its" sequences from the file names, breaking the join below.
recordings["its_filename"] = recordings["its_filename"].str.replace(
    ".its", "", regex=False
)

aclew_id = pd.concat(
    [
        pd.read_csv(opj(corpus, "metadata", dic[corpus])).assign(corpus=corpus)
        for corpus in corpora
    ]
)
aclew_id["its"] = aclew_id["its"].str.replace(".its", "", regex=False)

aclew_md = pd.read_csv("input/aclew_md.csv")

# Inner joins: recordings without a dictionary entry, or without ACLEW
# metadata, are dropped.
recordings = recordings[
    ["corpus", "child_id", "recording_filename", "its_filename"]
].merge(
    aclew_id[["corpus", "its", "aclew_id"]],
    how="inner",
    left_on=["corpus", "its_filename"],
    right_on=["corpus", "its"],
)
recordings = recordings.merge(aclew_md, how="inner", on="aclew_id")

# One row per child: the largest number of older siblings seen across that
# child's recordings.
children = recordings.groupby(["corpus", "child_id"]).agg(
    n_siblings=("number_older_sibs", "max")
)
children = children.reset_index()

# Keep only the corpus directory name (e.g. "input/bergelson" -> "bergelson").
children["corpus"] = children.corpus.map(basename)

# The default integer index is written as an unnamed first column.
children.to_csv("input/siblings.csv")