corpus_bias_rates.py

#!/usr/bin/env python3

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.metrics import segments_to_annotation
from ChildProject.pipelines.metrics import AclewMetrics

import argparse
import datalad.api
from os.path import join as opj
from os.path import basename, exists
import multiprocessing as mp

import numpy as np
import pandas as pd
import pickle

from pyannote.core import Annotation, Segment, Timeline
import stan

parser = argparse.ArgumentParser(
    description="main model described throughout the notes."
)
# parser.add_argument("--group", default="child", choices=["corpus", "child"])
parser.add_argument("--apply-bias-from", type=str, default="")
parser.add_argument("--chains", default=4, type=int)
parser.add_argument("--samples", default=2000, type=int)
parser.add_argument("--validation", default=0, type=float)
parser.add_argument("--simulated-children", default=40, type=int)
parser.add_argument("--output", default="corpus_bias")
args = parser.parse_args()
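
# Remove from `self` (a pyannote Timeline) everything that overlaps `removed`:
# the timeline is cropped to the gaps of `removed` within its own extent.
# Because cropping against the complement inverts the semantics, the
# "loose"/"strict" modes are swapped before cropping.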
def extrude(self, removed, mode: str = "intersection"):
    if isinstance(removed, Segment):
        removed = Timeline([removed])

    truncating_support = removed.gaps(support=self.extent())
    # loose for truncate means strict for crop and vice-versa
    if mode == "loose":
        mode = "strict"
    elif mode == "strict":
        mode = "loose"

    return self.crop(truncating_support, mode=mode)
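
# Per-child confusion counts between the VTC and the human annotator.
# Each returned row contains:
#   vtc_{i}_{j} -- VTC detections labelled speaker i that match a speaker-j
#                  vocalization of the human annotator (diagonal: detections
#                  explained by the same speaker)
#   truth_{i}   -- number of speaker-i vocalizations in the human annotation
#   duration    -- duration of the annotated intersection, in seconds
# Children for which a count would exceed the corresponding truth count
# (which the binomial model below cannot accommodate) are dropped.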
def compute_counts(parameters):
    corpus = parameters["corpus"]
    annotator = parameters["annotator"]
    speakers = ["CHI", "OCH", "FEM", "MAL"]

    project = ChildProject(parameters["path"])
    am = AnnotationManager(project)
    am.read()

    intersection = AnnotationManager.intersection(am.annotations, ["vtc", annotator])
    intersection["path"] = intersection.apply(
        lambda r: opj(
            project.path, "annotations", r["set"], "converted", r["annotation_filename"]
        ),
        axis=1,
    )
    datalad.api.get(list(intersection["path"].unique()))

    intersection = intersection.merge(
        project.recordings[["recording_filename", "child_id"]], how="left"
    )
    intersection["child"] = corpus + "_" + intersection["child_id"].astype(str)
    intersection["duration"] = (
        intersection["range_offset"] - intersection["range_onset"]
    )
    print(corpus, annotator, (intersection["duration"] / 1000 / 2).sum() / 3600)

    data = []
    for child, ann in intersection.groupby("child"):
        # print(corpus, child)
        segments = am.get_collapsed_segments(ann)
        if "speaker_type" not in segments.columns:
            continue

        segments = segments[segments["speaker_type"].isin(speakers)]

        vtc = {
            speaker: segments_to_annotation(
                segments[
                    (segments["set"] == "vtc") & (segments["speaker_type"] == speaker)
                ],
                "speaker_type",
            ).get_timeline()
            for speaker in speakers
        }

        truth = {
            speaker: segments_to_annotation(
                segments[
                    (segments["set"] == annotator)
                    & (segments["speaker_type"] == speaker)
                ],
                "speaker_type",
            ).get_timeline()
            for speaker in speakers
        }

        for speaker_A in speakers:
            vtc[f"{speaker_A}_vocs_explained"] = vtc[speaker_A].crop(
                truth[speaker_A], mode="loose"
            )
            vtc[f"{speaker_A}_vocs_fp"] = extrude(
                vtc[speaker_A], vtc[f"{speaker_A}_vocs_explained"]
            )
            vtc[f"{speaker_A}_vocs_fn"] = extrude(
                truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode="loose")
            )

            for speaker_B in speakers:
                vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = vtc[
                    f"{speaker_A}_vocs_fp"
                ].crop(truth[speaker_B], mode="loose")

                for speaker_C in speakers:
                    if speaker_C != speaker_B and speaker_C != speaker_A:
                        vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = extrude(
                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"],
                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"].crop(
                                truth[speaker_C], mode="loose"
                            ),
                        )

        d = {}
        keep_child = True
        for i, speaker_A in enumerate(speakers):
            for j, speaker_B in enumerate(speakers):
                if i != j:
                    z = len(vtc[f"{speaker_A}_vocs_fp_{speaker_B}"])
                else:
                    z = min(
                        len(vtc[f"{speaker_A}_vocs_explained"]), len(truth[speaker_A])
                    )
                d[f"vtc_{i}_{j}"] = z

                if z > len(truth[speaker_B]):
                    keep_child = False

            d[f"truth_{i}"] = len(truth[speaker_A])

        d["child"] = child
        d["duration"] = ann["duration"].sum() / 2 / 1000

        if keep_child:
            data.append(d)

    return pd.DataFrame(data).assign(
        corpus=corpus,
    )
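
# Per-child speech metrics from the ACLEW metrics pipeline, restricted to the
# 09:00-18:00 window:
#   speech_rate_{i}  -- vocalization count for speaker i over the observed duration
#   voc_duration_{i} -- average vocalization duration for speaker i, in seconds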
def rates(parameters):
    corpus = parameters["corpus"]
    annotator = parameters["annotator"]
    speakers = ["CHI", "OCH", "FEM", "MAL"]

    project = ChildProject(parameters["path"])
    am = AnnotationManager(project)
    am.read()

    pipeline = AclewMetrics(
        project,
        vtc=annotator,
        alice=None,
        vcm=None,
        from_time="09:00:00",
        to_time="18:00:00",
        by="child_id",
    )
    metrics = pipeline.extract()
    metrics = pd.DataFrame(metrics).assign(corpus=corpus, annotator=annotator)

    metrics["duration"] = metrics[f"duration_{annotator}"] / 1000 / 3600
    metrics = metrics[metrics["duration"] > 0.01]

    # metrics.dropna(subset={f"voc_{speaker.lower()}_ph" for speaker in speakers}&set(metrics.columns), inplace=True)
    for i, speaker in enumerate(speakers):
        # if f"voc_{speaker.lower()}_ph" not in metrics.columns:
        #     metrics[f"speech_rate_{i}"] = pd.NA
        # else:
        metrics[f"speech_rate_{i}"] = (
            (metrics[f"voc_{speaker.lower()}_ph"] * metrics["duration"])
            .fillna(0)
            .astype(int)
        )
        metrics[f"voc_duration_{i}"] = (
            metrics[f"avg_voc_dur_{speaker.lower()}"] / 1000
        ).fillna(0)

    return metrics
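
# Stan model: a binomial confusion model with child-level confusion rates drawn
# from Beta(alphas, betas) and corpus-level biases added on the logit scale,
# plus gamma/Poisson child-level speech rates and gamma vocalization durations,
# which the generated quantities block uses to simulate synthetic children.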
stan_code = """
data {
  int<lower=1> n_clips;   // number of clips
  int<lower=1> n_groups;  // number of groups
  int<lower=1> n_corpora;
  int<lower=1> n_classes; // number of classes
  int group[n_clips];
  int corpus[n_clips];
  int vtc[n_clips,n_classes,n_classes];
  int truth[n_clips,n_classes];

  int<lower=1> n_validation;
  int<lower=1> n_sim;
  int<lower=0> selected_corpus;

  int<lower=1> n_rates;
  int<lower=0> speech_rates[n_rates,n_classes];
  real<lower=0> rates_alphas[n_classes];
  real<lower=0> rates_betas[n_classes];
  int group_corpus[n_rates];
  real<lower=0> duration[n_rates];
  real<lower=0> voc_duration[n_rates,n_classes];
}
parameters {
  matrix<lower=0,upper=1>[n_classes,n_classes] mus;
  matrix<lower=1>[n_classes,n_classes] etas;
  matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];

  matrix[n_classes,n_classes] corpus_bias[n_corpora];
  matrix<lower=0>[n_classes,n_classes] corpus_sigma;

  // speech rates
  matrix<lower=1>[n_classes,n_corpora] speech_rate_alpha;
  matrix<lower=0>[n_classes,n_corpora] speech_rate_mu;
  matrix<lower=0>[n_classes,n_rates] speech_rate;

  // voc duration
  matrix<lower=1>[n_classes,n_corpora] voc_dur_alpha;
  matrix<lower=0>[n_classes,n_corpora] voc_dur_mu;
  matrix<lower=0>[n_classes,n_rates] voc_dur_child_mean;
  //matrix<lower=0>[n_classes,n_rates] voc_dur_child_alpha;
}
transformed parameters {
  matrix<lower=0>[n_classes,n_classes] alphas;
  matrix<lower=0>[n_classes,n_classes] betas;

  alphas = mus * etas;
  betas = (1-mus) * etas;
}
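// Observation model: for each clip k, the number of speaker-j vocalizations
// that the VTC attributes to speaker i is binomial in the true count truth[k,j],
// with probability inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i]).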
model {
  for (k in n_validation:n_clips) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        vtc[k,i,j] ~ binomial(
          truth[k,j],
          inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i])
        );
      }
    }
  }

  for (i in 1:n_classes) {
    for (j in 1:n_classes) {
      mus[i,j] ~ beta(1,1);
      etas[i,j] ~ pareto(1,1.5);
    }
  }

  for (c in 1:n_groups) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
      }
    }
  }

  for (i in 1:n_classes) {
    for (j in 1:n_classes) {
      for (c in 1:n_corpora) {
        corpus_bias[c,j,i] ~ normal(0, corpus_sigma[j,i]);
      }
      corpus_sigma[j,i] ~ normal(0, 1);
    }
  }

  // speech rates
  for (i in 1:n_classes) {
    speech_rate_alpha[i,:] ~ normal(1, 1);
    speech_rate_mu[i,:] ~ exponential(2);

    voc_dur_alpha[i,:] ~ normal(1, 1);
    voc_dur_mu[i,:] ~ exponential(1);
    //voc_dur_child_alpha[i,:] ~ exponential(1);
  }

  for (g in 1:n_rates) {
    for (i in 1:n_classes) {
      speech_rate[i,g] ~ gamma(
        speech_rate_alpha[i,group_corpus[g]],
        (speech_rate_alpha[i,group_corpus[g]]/speech_rate_mu[i,group_corpus[g]])/1000
      );
      speech_rates[g,i] ~ poisson(speech_rate[i,g]*duration[g]);

      if (speech_rates[g,i] > 0) {
        //voc_duration[g,i] ~ gamma(speech_rates[g,i]*voc_dur_child_alpha[i,g], speech_rates[g,i]*voc_dur_child_alpha[i,g]/voc_dur_child_mean[i,g]);
        //voc_duration[g,i] ~ gamma(speech_rates[g,i], speech_rates[g,i]/voc_dur_child_mean[i,g]);
        target += gamma_lpdf(
          voc_duration[g,i] * speech_rates[g,i] | speech_rates[g,i], 1/voc_dur_child_mean[i,g]
        );
      }
      voc_dur_child_mean[i,g] ~ gamma(voc_dur_alpha[i,group_corpus[g]], voc_dur_alpha[i,group_corpus[g]]/voc_dur_mu[i,group_corpus[g]]);
    }
  }
}
generated quantities {
  int pred[n_clips,n_classes,n_classes];
  matrix[n_classes,n_classes] probs[n_groups];
  matrix[n_classes,n_classes] log_lik[n_clips];

  matrix[n_classes,n_classes] random_bias;
  matrix[n_classes,n_classes] fixed_bias;

  int sim_truth[n_sim,n_classes];
  int sim_vtc[n_sim,n_classes];
  vector[n_classes] lambdas;
  real chi_adu_coef = 0; // null-hypothesis

  for (i in 1:n_classes) {
    for (j in 1:n_classes) {
      if (selected_corpus != 0) {
        fixed_bias[j,i] = corpus_bias[selected_corpus,j,i];
      }
      else {
        fixed_bias[j,i] = 0;
      }
      random_bias[j,i] = normal_rng(0, corpus_sigma[j,i]);
    }
  }

  for (c in 1:n_groups) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
      }
    }
  }

  for (k in 1:n_clips) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        if (k >= n_validation) {
          pred[k,i,j] = binomial_rng(truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i]));
          log_lik[k,i,j] = binomial_lpmf(
            vtc[k,i,j] | truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i])
          );
        }
        else {
          pred[k,i,j] = binomial_rng(
            truth[k,j], inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k],j,i])
          );
          log_lik[k,i,j] = beta_lpdf(probs[group[k],j,i] | alphas[j,i], betas[j,i]);
          log_lik[k,i,j] += binomial_lpmf(
            vtc[k,i,j] | truth[k,j], inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k],j,i])
          );
        }
      }
    }
  }

  real lambda;
  for (k in 1:n_sim) {
    for (i in 2:n_classes) {
      if (selected_corpus != 0) {
        lambda = gamma_rng(speech_rate_alpha[i,selected_corpus], (speech_rate_alpha[i,selected_corpus]/speech_rate_mu[i,selected_corpus])/1000)*9;
      } else {
        lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
      }
      sim_truth[k,i] = poisson_rng(lambda);
    }

    if (selected_corpus != 0) {
      lambda = gamma_rng(speech_rate_alpha[1,selected_corpus], speech_rate_alpha[1,selected_corpus]/speech_rate_mu[1,selected_corpus]/1000)*9;
    } else {
      lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
    }
    sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
  }

  for (k in 1:n_sim) {
    for (i in 1:n_classes) {
      sim_vtc[k,i] = 0;
      for (j in 1:n_classes) {
        real p = logit(beta_rng(alphas[j,i], betas[j,i]));
        if (selected_corpus != 0) {
          p += fixed_bias[j,i];
        }
        else {
          p += random_bias[j,i];
        }
        p = inv_logit(p);
        sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
      }
    }
  }
}
"""
if __name__ == "__main__":
    annotators = pd.read_csv("input/annotators.csv")
    annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))

    with mp.Pool(processes=8) as pool:
        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))

    data = data.sample(frac=1)
    duration = data["duration"].sum()

    vtc = np.moveaxis(
        [[data[f"vtc_{j}_{i}"].values for i in range(4)] for j in range(4)], -1, 0
    )
    truth = np.transpose([data[f"truth_{i}"].values for i in range(4)])

    # speech rates at the child level
    annotators = annotators[~annotators["annotator"].str.startswith("eaf_2021")]
    with mp.Pool(processes=8) as pool:
        speech_rates = pd.concat(pool.map(rates, annotators.to_dict(orient="records")))

    speech_rates.reset_index(inplace=True)
    speech_rates = speech_rates.groupby(["corpus", "child_id"]).sample(1)
    speech_rate_matrix = np.transpose(
        [speech_rates[f"speech_rate_{i}"].values for i in range(4)]
    )
    voc_duration_matrix = np.transpose(
        [speech_rates[f"voc_duration_{i}"].values for i in range(4)]
    )
    speech_rates.to_csv("rates.csv")

    print(vtc.shape)

    fixed_rates = pd.read_csv("output/speech_dist.csv")

    training_set = data.groupby("corpus").agg(
        duration=("duration", "sum"), children=("child", lambda x: x.nunique())
    )
    training_set["duration"] /= 3600
    training_set.to_csv("output/training_set.csv")

    data["corpus"] = data["corpus"].astype("category")
    corpora = data["corpus"].cat.codes.values
    corpora_codes = dict(enumerate(data["corpus"].cat.categories))
    corpora_codes = {v: k for k, v in corpora_codes.items()}

    data = {
        "n_clips": truth.shape[0],
        "n_classes": truth.shape[1],
        "n_groups": data["child"].nunique(),
        "n_corpora": data["corpus"].nunique(),
        "n_validation": max(1, int(truth.shape[0] * args.validation)),
        "n_sim": args.simulated_children,
        "group": 1 + data["child"].astype("category").cat.codes.values,
        "corpus": 1 + corpora,
        "selected_corpus": (
            1 + corpora_codes[args.apply_bias_from]
            if args.apply_bias_from in corpora_codes
            else 0
        ),
        "truth": truth.astype(int),
        "vtc": vtc.astype(int),
        "rates_alphas": fixed_rates["alpha"].values,
        "rates_betas": fixed_rates["beta"].values,
        "speech_rates": speech_rate_matrix.astype(int),
        "voc_duration": voc_duration_matrix,
        "group_corpus": 1 + speech_rates["corpus"].map(corpora_codes).astype(int).values,
        "duration": speech_rates["duration"].values,
        "n_rates": len(speech_rates),
    }

    print(f"clips: {data['n_clips']}")
    print(f"groups: {data['n_groups']}")
    print("true vocs: {}".format(np.sum(data["truth"])))
    print("vtc vocs: {}".format(np.sum(data["vtc"])))
    print("duration: {}".format(duration))
    print("selected corpus: {}".format(data["selected_corpus"]))

    with open(f"output/samples/data_{args.output}.pickle", "wb") as fp:
        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)

    posterior = stan.build(stan_code, data=data)
    fit = posterior.sample(num_chains=args.chains, num_samples=args.samples)
    df = fit.to_frame()
    df.to_parquet(f"output/samples/fit_{args.output}.parquet")
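
# A minimal sketch (not part of the pipeline) of how the saved draws could be
# inspected afterwards; it assumes pystan's flat column naming ("mus.1.1", ...):
#
#   fit = pd.read_parquet("output/samples/fit_corpus_bias.parquet")  # default --output
#   mus = fit[[c for c in fit.columns if c.startswith("mus.")]]
#   print(mus.mean())  # posterior means of the confusion-rate parameters mus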