1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
# Build input/siblings.csv: for every (corpus, child_id) pair, the maximum
# `number_older_sibs` value found among that child's recordings which can be
# matched to a row of input/aclew_md.csv through their `aclew_id`.
import pandas as pd
from ChildProject.projects import ChildProject
from os.path import join as opj, basename

# Dataset roots, relative to the working directory.
corpora = [
    "input/bergelson",
    "input/warlaumont",
    "input/winnipeg",
    "input/lucid",
]

# Per-corpus CSV (path relative to "<corpus>/metadata") that is read below
# for its `its` and `aclew_id` columns.
dic = {
    "input/bergelson": "confidential/original/bergelson_dict.csv",
    "input/lucid": "confidential/original/lucid_dict.csv",
    "input/warlaumont": "original/warlaumont_dict_matched.csv",
    "input/winnipeg": "confidential/original/winnipeg_dict_matched.csv",
}

# NOTE(review): this mapping is not referenced anywhere in this script; it is
# kept (original spelling included) in case other code relies on the name.
correspondance = {
    "BER": "input/bergelson",
    "ROW": "input/lucid",
    "SOD": "input/winnipeg",
    "WAR": "input/warlaumont",
}

# Matches only a trailing ".its" extension. The pattern and `regex=True` are
# spelled out because the default of `regex` in `Series.str.replace` differs
# between pandas versions, and the bare pattern ".its" would either treat "."
# as a wildcard or strip the substring from the middle of a name.
_ITS_EXTENSION = r"\.its$"

projects = [ChildProject(corpus) for corpus in corpora]
for project in projects:
    project.read()

# One table of recordings for all corpora, each row tagged with its corpus.
recordings = pd.concat([
    project.recordings.assign(corpus=corpus)
    for project, corpus in zip(projects, corpora)
])
recordings["its_filename"] = recordings["its_filename"].str.replace(
    _ITS_EXTENSION, "", regex=True
)

# One table of per-corpus dictionary rows, tagged the same way.
aclew_id = pd.concat([
    pd.read_csv(opj(corpus, "metadata", dic[corpus])).assign(corpus=corpus)
    for corpus in corpora
])
aclew_id["its"] = aclew_id["its"].str.replace(_ITS_EXTENSION, "", regex=True)

aclew_md = pd.read_csv("input/aclew_md.csv")

# Attach an aclew_id to each recording. The join is scoped by corpus and uses
# the extension-less file name; the inner join drops recordings that have no
# dictionary entry.
recordings = recordings[
    ["corpus", "child_id", "recording_filename", "its_filename"]
].merge(
    aclew_id[["corpus", "its", "aclew_id"]],
    how="inner",
    left_on=["corpus", "its_filename"],
    right_on=["corpus", "its"],
)

# Bring in the columns of aclew_md; again only matched rows are kept.
recordings = recordings.merge(aclew_md, how="inner", on="aclew_id")

# A child may have several recordings: keep the largest reported value.
# `number_older_sibs` is presumably supplied by input/aclew_md.csv - confirm.
children = (
    recordings
    .groupby(["corpus", "child_id"])
    .agg(n_siblings=("number_older_sibs", "max"))
    .reset_index()
)

# Replace the corpus path by its last component ("input/lucid" -> "lucid").
children["corpus"] = children["corpus"].map(basename)

# NOTE(review): the default integer index is written as an unnamed first
# column; left as-is so the layout of the output file does not change.
children.to_csv("input/siblings.csv")
|