siblings.py 1.7 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162
  1. import pandas as pd
  2. from ChildProject.projects import ChildProject
  3. from os.path import join as opj, basename
  4. corpora = [
  5. "input/bergelson",
  6. "input/warlaumont",
  7. "input/winnipeg",
  8. "input/lucid"
  9. ]
  10. dic = {
  11. "input/bergelson": "confidential/original/bergelson_dict.csv",
  12. "input/lucid": "confidential/original/lucid_dict.csv",
  13. "input/warlaumont": "original/warlaumont_dict_matched.csv",
  14. "input/winnipeg": "confidential/original/winnipeg_dict_matched.csv"
  15. }
  16. correspondance = {
  17. "BER": "input/bergelson",
  18. "ROW": "input/lucid",
  19. "SOD": "input/winnipeg",
  20. "WAR": "input/warlaumont"
  21. }
  22. projects = [
  23. ChildProject(corpus) for corpus in corpora
  24. ]
  25. for project in projects:
  26. project.read()
  27. recordings = pd.concat([
  28. projects[i].recordings.assign(corpus=corpus)
  29. for i, corpus in enumerate(corpora)
  30. ])
  31. recordings["its_filename"] = recordings["its_filename"].str.replace(".its", "")
  32. aclew_id = pd.concat([
  33. pd.read_csv(opj(corpus, "metadata", dic[corpus])).assign(corpus=corpus)
  34. for corpus in corpora
  35. ])
  36. aclew_id["its"] = aclew_id["its"].str.replace(".its", "")
  37. aclew_md = pd.read_csv("input/aclew_md.csv")
  38. recordings = recordings[["corpus", "child_id", "recording_filename", "its_filename"]].merge(
  39. aclew_id[["corpus", "its", "aclew_id"]],
  40. how="inner",
  41. left_on=["corpus", "its_filename"],
  42. right_on=["corpus", "its"]
  43. )
  44. recordings = recordings.merge(aclew_md, how="inner", left_on="aclew_id", right_on="aclew_id")
  45. children = recordings.groupby(["corpus", "child_id"]).agg(n_siblings=("number_older_sibs", "max"))
  46. children = children.reset_index()
  47. children["corpus"] = children.corpus.map(basename)
  48. children.to_csv("input/siblings.csv")