#!/usr/bin/env python3
"""Fit a hierarchical confusion model between VTC output and human annotations.

The script:

1. reads ``input/annotators.csv`` (one row per corpus/annotator pair),
2. for each pair, counts per child how many "vtc" segments of each speaker
   class overlap human-annotated segments of each speaker class
   (``compute_counts``),
3. feeds those counts to the Stan program ``stan_code`` and samples from it,
4. writes the model input (pickle) and the posterior draws (parquet) under
   ``output/samples/``.
"""

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.metrics import segments_to_annotation

import argparse

import datalad.api
from os.path import join as opj
from os.path import basename, exists

import multiprocessing as mp

import numpy as np
import pandas as pd
import pickle
from pyannote.core import Annotation, Segment, Timeline

import stan

parser = argparse.ArgumentParser(
    description="main model described throughout the notes."
)
# parser.add_argument("--group", default="child", choices=["corpus", "child"])
parser.add_argument("--apply-bias-from", type=str, default="")
parser.add_argument("--chains", default=4, type=int)
parser.add_argument("--samples", default=2000, type=int)
parser.add_argument("--validation", default=0, type=float)
parser.add_argument("--simulated-children", default=40, type=int)
parser.add_argument("--output", default="corpus_bias")
args = parser.parse_args()


def extrude(self, removed, mode: str = "intersection"):
    """Return timeline ``self`` with the parts covered by ``removed`` taken out.

    Implemented as a crop of ``self`` onto the gaps of ``removed`` (computed
    within the extent of ``self``).

    Parameters
    ----------
    self : Timeline
        Timeline to remove segments from.
    removed : Segment or Timeline
        Region(s) to remove; a single ``Segment`` is wrapped in a ``Timeline``.
    mode : str
        "intersection", "loose" or "strict". "loose" and "strict" are swapped
        before being handed to ``crop`` (see comment below); any other value is
        passed through unchanged.

    Returns
    -------
    Whatever ``self.crop(..., mode=...)`` returns.
    """
    if isinstance(removed, Segment):
        removed = Timeline([removed])

    truncating_support = removed.gaps(support=self.extent())

    # loose for truncate means strict for crop and vice-versa
    if mode == "loose":
        mode = "strict"
    elif mode == "strict":
        mode = "loose"

    return self.crop(truncating_support, mode=mode)


def compute_counts(parameters):
    """Build the per-child confusion counts for one corpus/annotator pair.

    Parameters
    ----------
    parameters : dict
        Must provide the keys ``"corpus"`` (corpus name), ``"annotator"``
        (name of the human annotation set) and ``"path"`` (path handed to
        ``ChildProject``).

    Returns
    -------
    pandas.DataFrame
        One row per retained child with the columns

        * ``vtc_{i}_{j}``: for ``i != j``, the number of "vtc" segments of
          speaker ``i`` that are not explained by the annotator's speaker ``i``
          but overlap the annotator's speaker ``j`` (and no third speaker);
          for ``i == j``, the number of "vtc" segments of speaker ``i`` that
          overlap the annotator's speaker ``i``, capped at the number of
          annotator segments for that speaker;
        * ``truth_{i}``: number of annotator segments for speaker ``i``;
        * ``child``: ``"<corpus>_<child_id>"``;
        * ``duration``: summed ``range_offset - range_onset`` over the child's
          rows, divided by 2 and by 1000;
        * ``corpus``: the corpus name.

        ``i`` and ``j`` index ``["CHI", "OCH", "FEM", "MAL"]``.
    """
    corpus = parameters["corpus"]
    annotator = parameters["annotator"]
    speakers = ["CHI", "OCH", "FEM", "MAL"]

    project = ChildProject(parameters["path"])
    am = AnnotationManager(project)
    am.read()

    # Portions of audio covered by both the "vtc" set and the human annotator.
    intersection = AnnotationManager.intersection(am.annotations, ["vtc", annotator])

    intersection["path"] = intersection.apply(
        lambda r: opj(
            project.path, "annotations", r["set"], "converted", r["annotation_filename"]
        ),
        axis=1,
    )
    # Make sure the converted annotation files are available locally.
    datalad.api.get(list(intersection["path"].unique()))

    intersection = intersection.merge(
        project.recordings[["recording_filename", "child_id"]], how="left"
    )
    # Prefix child ids with the corpus name so they stay distinct across corpora.
    intersection["child"] = corpus + "_" + intersection["child_id"].astype(str)
    intersection["duration"] = (
        intersection["range_offset"] - intersection["range_onset"]
    )
    # NOTE(review): the /2 presumably compensates for each portion appearing
    # once per set ("vtc" and annotator), and /1000 presumably converts
    # milliseconds to seconds -- confirm against ChildProject.
    print(corpus, annotator, (intersection["duration"] / 1000 / 2).sum() / 3600)

    data = []
    for child, ann in intersection.groupby("child"):
        # print(corpus, child)

        segments = am.get_collapsed_segments(ann)
        if "speaker_type" not in segments.columns:
            continue

        segments = segments[segments["speaker_type"].isin(speakers)]

        # One timeline per speaker class, for the algorithm and for the human.
        vtc = {
            speaker: segments_to_annotation(
                segments[
                    (segments["set"] == "vtc") & (segments["speaker_type"] == speaker)
                ],
                "speaker_type",
            ).get_timeline()
            for speaker in speakers
        }

        truth = {
            speaker: segments_to_annotation(
                segments[
                    (segments["set"] == annotator)
                    & (segments["speaker_type"] == speaker)
                ],
                "speaker_type",
            ).get_timeline()
            for speaker in speakers
        }

        for speaker_A in speakers:
            # vtc segments of A that overlap a human-annotated segment of A
            vtc[f"{speaker_A}_vocs_explained"] = vtc[speaker_A].crop(
                truth[speaker_A], mode="loose"
            )
            # vtc segments of A left over once the explained ones are removed
            vtc[f"{speaker_A}_vocs_fp"] = extrude(
                vtc[speaker_A], vtc[f"{speaker_A}_vocs_explained"]
            )
            # human-annotated segments of A with no overlapping vtc segment of A
            vtc[f"{speaker_A}_vocs_fn"] = extrude(
                truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode="loose")
            )

            for speaker_B in speakers:
                # false positives of A that overlap a true segment of B ...
                vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = vtc[
                    f"{speaker_A}_vocs_fp"
                ].crop(truth[speaker_B], mode="loose")

                # ... excluding those that also overlap a third speaker C
                for speaker_C in speakers:
                    if speaker_C != speaker_B and speaker_C != speaker_A:
                        vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = extrude(
                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"],
                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"].crop(
                                truth[speaker_C], mode="loose"
                            ),
                        )

        d = {}
        keep_child = True
        for i, speaker_A in enumerate(speakers):
            for j, speaker_B in enumerate(speakers):
                if i != j:
                    z = len(vtc[f"{speaker_A}_vocs_fp_{speaker_B}"])
                else:
                    z = min(
                        len(vtc[f"{speaker_A}_vocs_explained"]), len(truth[speaker_A])
                    )

                d[f"vtc_{i}_{j}"] = z

                # The Stan model uses truth[., j] as the number of binomial
                # trials for vtc[., i, j]; a count above it cannot be modelled,
                # so the whole child is dropped.
                if z > len(truth[speaker_B]):
                    keep_child = False

            d[f"truth_{i}"] = len(truth[speaker_A])

        d["child"] = child
        d["duration"] = ann["duration"].sum() / 2 / 1000

        if keep_child:
            data.append(d)

    return pd.DataFrame(data).assign(
        corpus=corpus,
    )


# Stan program.
#
# Compared with the previous revision of this string:
#   * mus, etas, group_confusion and corpus_sigma carry the bounds required by
#     the distributions/functions they are used with (beta -> (0, 1),
#     pareto(1, .) -> >= 1, logit -> (0, 1), normal scale -> >= 0);
#   * alphas/betas are built with the element-wise product `.*`; `*` between
#     two matrices is a matrix product in Stan, which would mix the per-cell
#     means and concentrations across classes.
stan_code = """
data {
    int n_clips; // number of clips
    int n_groups; // number of groups
    int n_corpora;
    int n_classes; // number of classes

    int group[n_clips];
    int corpus[n_clips];
    int vtc[n_clips,n_classes,n_classes];
    int truth[n_clips,n_classes];

    int n_validation;
    int n_sim;
    int selected_corpus;

    real rates_alphas[n_classes];
    real rates_betas[n_classes];
}

parameters {
    matrix<lower=0,upper=1>[n_classes,n_classes] mus;
    matrix<lower=1>[n_classes,n_classes] etas;
    matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];
    matrix[n_classes,n_classes] corpus_bias[n_corpora];
    matrix<lower=0>[n_classes,n_classes] corpus_sigma;
}

transformed parameters {
    matrix[n_classes,n_classes] alphas;
    matrix[n_classes,n_classes] betas;

    // element-wise: alphas[i,j] = mus[i,j] * etas[i,j]
    alphas = mus .* etas;
    betas = (1-mus) .* etas;
}

model {
    for (k in n_validation:n_clips) {
        for (i in 1:n_classes) {
            for (j in 1:n_classes) {
                vtc[k,i,j] ~ binomial(
                    truth[k,j],
                    inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i])
                );
            }
        }
    }

    for (i in 1:n_classes) {
        for (j in 1:n_classes) {
            mus[i,j] ~ beta(1,1);
            etas[i,j] ~ pareto(1,1.5);
        }
    }

    for (c in 1:n_groups) {
        for (i in 1:n_classes) {
            for (j in 1:n_classes) {
                group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
            }
        }
    }

    for (i in 1:n_classes) {
        for (j in 1:n_classes) {
            for (c in 1:n_corpora) {
                corpus_bias[c,j,i] ~ normal(0, corpus_sigma[j,i]);
            }
            corpus_sigma[j,i] ~ normal(0, 1);
        }
    }
}

generated quantities {
    int pred[n_clips,n_classes,n_classes];
    matrix[n_classes,n_classes] probs[n_groups];
    matrix[n_classes,n_classes] log_lik[n_clips];

    matrix[n_classes,n_classes] random_bias;
    matrix[n_classes,n_classes] fixed_bias;

    int sim_truth[n_sim,n_classes];
    int sim_vtc[n_sim,n_classes];
    vector[n_classes] lambdas;
    real chi_adu_coef = 0; // null-hypothesis

    for (i in 1:n_classes) {
        for (j in 1:n_classes) {
            if (selected_corpus != 0) {
                fixed_bias[j, i] = corpus_bias[selected_corpus, j, i];
            }
            else {
                fixed_bias[j, i] = 0;
            }
            random_bias[j,i] = normal_rng(0, corpus_sigma[j,i]);
        }
    }

    for (c in 1:n_groups) {
        for (i in 1:n_classes) {
            for (j in 1:n_classes) {
                probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
            }
        }
    }

    for (k in 1:n_clips) {
        for (i in 1:n_classes) {
            for (j in 1:n_classes) {
                if (k >= n_validation) {
                    pred[k,i,j] = binomial_rng(truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k], j,i]));
                    log_lik[k,i,j] = binomial_lpmf(
                        vtc[k,i,j] | truth[k,j],
                        inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k], j,i])
                    );
                }
                else {
                    pred[k,i,j] = binomial_rng(
                        truth[k,j],
                        inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k], j,i])
                    );
                    log_lik[k,i,j] = beta_lpdf(probs[group[k],j,i] | alphas[j,i], betas[j,i]);
                    log_lik[k,i,j] += binomial_lpmf(
                        vtc[k,i,j] | truth[k,j],
                        inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k], j,i])
                    );
                }
            }
        }
    }

    real lambda;
    for (k in 1:n_sim) {
        for (i in 2:n_classes) {
            lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
            sim_truth[k,i] = poisson_rng(lambda);
        }
        lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
        sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
    }

    for (k in 1:n_sim) {
        for (i in 1:n_classes) {
            sim_vtc[k,i] = 0;
            for (j in 1:n_classes) {
                real p = logit(beta_rng(alphas[j,i], betas[j,i]));

                if (selected_corpus != 0) {
                    p += fixed_bias[j,i];
                }
                else {
                    p += random_bias[j,i];
                }
                p = inv_logit(p);

                sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
            }
        }
    }
}
"""

if __name__ == "__main__":
    # One record per (corpus, annotator); each corpus is expected under input/<corpus>.
    annotators = pd.read_csv("input/annotators.csv")
    annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))

    with mp.Pool(processes=8) as pool:
        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))

    # Shuffle rows so that the first n_validation rows (held out by the Stan
    # model) are a random subset. No seed is set: the split differs per run.
    data = data.sample(frac=1)
    duration = data["duration"].sum()

    # vtc[k, a, b] == data["vtc_a_b"][k]; truth[k, a] == data["truth_a"][k]
    vtc = np.moveaxis(
        [[data[f"vtc_{j}_{i}"].values for i in range(4)] for j in range(4)], -1, 0
    )
    truth = np.transpose([data[f"truth_{i}"].values for i in range(4)])

    print(vtc.shape)

    # Gamma parameters used by the Stan program to simulate vocalization
    # counts; expects "alpha" and "beta" columns, one row per speaker class.
    rates = pd.read_csv("output/speech_dist.csv")

    training_set = data.groupby("corpus").agg(
        duration=("duration", "sum"), children=("child", lambda x: x.nunique())
    )
    training_set["duration"] /= 3600
    training_set.to_csv("output/training_set.csv")

    # Integer-encode corpora; keep a name -> code mapping for --apply-bias-from.
    data["corpus"] = data["corpus"].astype("category")
    corpora = data["corpus"].cat.codes.values
    corpora_codes = dict(enumerate(data["corpus"].cat.categories))
    corpora_codes = {v: k for k, v in corpora_codes.items()}

    # From here on `data` is the Stan input dictionary (indices are 1-based).
    data = {
        "n_clips": truth.shape[0],
        "n_classes": truth.shape[1],
        "n_groups": data["child"].nunique(),
        "n_corpora": data["corpus"].nunique(),
        # Never below 1 so that the Stan loop `n_validation:n_clips` starts at
        # a valid index.
        "n_validation": max(1, int(truth.shape[0] * args.validation)),
        "n_sim": args.simulated_children,
        "group": 1 + data["child"].astype("category").cat.codes.values,
        "corpus": 1 + corpora,
        # 0 means "no specific corpus": the Stan program then draws a random bias.
        "selected_corpus": (
            1 + corpora_codes[args.apply_bias_from]
            if args.apply_bias_from in corpora_codes
            else 0
        ),
        "truth": truth.astype(int),
        "vtc": vtc.astype(int),
        "rates_alphas": rates["alpha"].values,
        "rates_betas": rates["beta"].values,
    }

    print(f"clips: {data['n_clips']}")
    print(f"groups: {data['n_groups']}")
    print("true vocs: {}".format(np.sum(data["truth"])))
    print("vtc vocs: {}".format(np.sum(data["vtc"])))
    print("duration: {}".format(duration))
    print("selected corpus: {}".format(data["selected_corpus"]))

    with open(f"output/samples/data_{args.output}.pickle", "wb") as fp:
        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)

    posterior = stan.build(stan_code, data=data)
    fit = posterior.sample(num_chains=args.chains, num_samples=args.samples)
    df = fit.to_frame()
    df.to_parquet(f"output/samples/fit_{args.output}.parquet")