#!/usr/bin/env python3

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager
from ChildProject.metrics import segments_to_annotation

import argparse
import datalad.api
from os.path import join as opj
from os.path import basename, exists
import multiprocessing as mp

import numpy as np
import pandas as pd
import pickle

from pyannote.core import Annotation, Segment, Timeline
import stan

parser = argparse.ArgumentParser(
    description="main model described throughout the notes."
)
parser.add_argument("--group", default="child", choices=["corpus", "child"])
parser.add_argument("--chains", default=4, type=int)
parser.add_argument("--samples", default=2000, type=int)
parser.add_argument("--validation", default=0, type=float)
parser.add_argument("--output", default="model3")
args = parser.parse_args()
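
# Example invocation (the script name is illustrative; the input/ and output/
# directory layouts are the ones assumed by the paths used below):
#   python model.py --group child --chains 4 --samples 2000 --output model3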


def extrude(self, removed, mode: str = "intersection"):
    # Return a copy of the timeline `self` with the `removed` regions carved out.
    if isinstance(removed, Segment):
        removed = Timeline([removed])

    truncating_support = removed.gaps(support=self.extent())
    # loose for truncate means strict for crop and vice-versa
    if mode == "loose":
        mode = "strict"
    elif mode == "strict":
        mode = "loose"

    return self.crop(truncating_support, mode=mode)


def compute_counts(parameters):
    corpus = parameters["corpus"]
    annotator = parameters["annotator"]
    speakers = ["CHI", "OCH", "FEM", "MAL"]

    project = ChildProject(parameters["path"])
    am = AnnotationManager(project)
    am.read()

    # keep only the portions annotated by both the VTC and the human annotator
    intersection = AnnotationManager.intersection(am.annotations, ["vtc", annotator])
    intersection["path"] = intersection.apply(
        lambda r: opj(
            project.path, "annotations", r["set"], "converted", r["annotation_filename"]
        ),
        axis=1,
    )
    datalad.api.get(list(intersection["path"].unique()))

    intersection = intersection.merge(
        project.recordings[["recording_filename", "child_id"]], how="left"
    )
    intersection["child"] = corpus + "_" + intersection["child_id"].astype(str)
    intersection["duration"] = (
        intersection["range_offset"] - intersection["range_onset"]
    )
    # total double-annotated duration, in hours (each clip appears once per set)
    print(corpus, annotator, (intersection["duration"] / 1000 / 2).sum() / 3600)

    data = []
    for child, ann in intersection.groupby("child"):
        # print(corpus, child)
        segments = am.get_collapsed_segments(ann)
        if "speaker_type" not in segments.columns:
            continue

        segments = segments[segments["speaker_type"].isin(speakers)]

        vtc = {
            speaker: segments_to_annotation(
                segments[
                    (segments["set"] == "vtc") & (segments["speaker_type"] == speaker)
                ],
                "speaker_type",
            ).get_timeline()
            for speaker in speakers
        }

        truth = {
            speaker: segments_to_annotation(
                segments[
                    (segments["set"] == annotator)
                    & (segments["speaker_type"] == speaker)
                ],
                "speaker_type",
            ).get_timeline()
            for speaker in speakers
        }

        for speaker_A in speakers:
            # VTC segments that overlap a human segment of the same class
            vtc[f"{speaker_A}_vocs_explained"] = vtc[speaker_A].crop(
                truth[speaker_A], mode="loose"
            )
            # false positives: VTC segments with no same-class human counterpart
            vtc[f"{speaker_A}_vocs_fp"] = extrude(
                vtc[speaker_A], vtc[f"{speaker_A}_vocs_explained"]
            )
            # false negatives: human segments missed by the VTC
            vtc[f"{speaker_A}_vocs_fn"] = extrude(
                truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode="loose")
            )

            for speaker_B in speakers:
                # false positives of class A that overlap a human segment of class B
                vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = vtc[
                    f"{speaker_A}_vocs_fp"
                ].crop(truth[speaker_B], mode="loose")

                for speaker_C in speakers:
                    if speaker_C != speaker_B and speaker_C != speaker_A:
                        # discard segments also explained by a third class
                        vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = extrude(
                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"],
                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"].crop(
                                truth[speaker_C], mode="loose"
                            ),
                        )

        d = {}
        for i, speaker_A in enumerate(speakers):
            for j, speaker_B in enumerate(speakers):
                if i != j:
                    z = len(vtc[f"{speaker_A}_vocs_fp_{speaker_B}"])
                else:
                    z = min(
                        len(vtc[f"{speaker_A}_vocs_explained"]), len(truth[speaker_A])
                    )
                d[f"vtc_{i}_{j}"] = z

            d[f"truth_{i}"] = len(truth[speaker_A])

        d["child"] = child
        d["duration"] = ann["duration"].sum() / 2 / 1000
        data.append(d)

    return pd.DataFrame(data).assign(
        corpus=corpus,
    )
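
# The Stan model below estimates speaker-classification confusion rates: for each
# clip, the number of VTC detections of class i that overlap human annotations of
# class j is modeled as a binomial draw from the human count, with a confusion
# probability specific to each group (child or corpus). Confusion probabilities
# are partially pooled through Beta priors parameterized by a mean (mus) and a
# concentration (etas). The generated quantities block also simulates vocalization
# counts (sim_truth, sim_vtc) from gamma-Poisson rates to propagate the estimated
# confusion rates; chi_adu_coef is fixed at 0, i.e. the null hypothesis of no
# dependence of the key child's rate on adult (FEM + MAL) vocalization counts.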
stan_code = """
data {
  int<lower=1> n_clips;   // number of clips
  int<lower=1> n_groups;  // number of groups
  int<lower=1> n_classes; // number of classes
  int group[n_clips];
  int vtc[n_clips,n_classes,n_classes];
  int truth[n_clips,n_classes];

  int<lower=1> n_validation;
  int<lower=1> n_sim;

  real<lower=0> rates_alphas[n_classes];
  real<lower=0> rates_betas[n_classes];
}

parameters {
  matrix<lower=0,upper=1>[n_classes,n_classes] mus;
  matrix<lower=1>[n_classes,n_classes] etas;
  matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];
}

transformed parameters {
  matrix<lower=0>[n_classes,n_classes] alphas;
  matrix<lower=0>[n_classes,n_classes] betas;

  // element-wise mean/concentration parameterization of the Beta priors
  alphas = mus .* etas;
  betas = (1 - mus) .* etas;
}

model {
  for (k in n_validation:n_clips) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        vtc[k,i,j] ~ binomial(truth[k,j], group_confusion[group[k],j,i]);
      }
    }
  }

  for (i in 1:n_classes) {
    for (j in 1:n_classes) {
      mus[i,j] ~ beta(1,1);
      etas[i,j] ~ pareto(1,1.5);
    }
  }

  for (c in 1:n_groups) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
      }
    }
  }
}

generated quantities {
  int pred[n_clips,n_classes,n_classes];
  matrix[n_classes,n_classes] probs[n_groups];
  matrix[n_classes,n_classes] log_lik[n_clips];

  int sim_truth[n_sim,n_classes];
  int sim_vtc[n_sim,n_classes];

  vector[n_classes] lambdas;
  real chi_adu_coef = 0; // null-hypothesis

  for (c in 1:n_groups) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
      }
    }
  }

  for (k in 1:n_clips) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        if (k >= n_validation) {
          pred[k,i,j] = binomial_rng(truth[k,j], group_confusion[group[k],j,i]);
          log_lik[k,i,j] = binomial_lpmf(vtc[k,i,j] | truth[k,j], group_confusion[group[k],j,i]);
        }
        else {
          pred[k,i,j] = binomial_rng(truth[k,j], probs[group[k],j,i]);
          log_lik[k,i,j] = beta_lpdf(probs[group[k],j,i] | alphas[j,i], betas[j,i]);
          log_lik[k,i,j] += binomial_lpmf(vtc[k,i,j] | truth[k,j], probs[group[k],j,i]);
        }
      }
    }
  }

  real lambda;
  for (k in 1:n_sim) {
    for (i in 2:n_classes) {
      lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
      sim_truth[k,i] = poisson_rng(lambda);
    }
    lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
    sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
  }

  for (k in 1:n_sim) {
    for (i in 1:n_classes) {
      sim_vtc[k,i] = 0;
      for (j in 1:n_classes) {
        real p = beta_rng(alphas[j,i], betas[j,i]);
        sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
      }
    }
  }
}
"""

if __name__ == "__main__":
    annotators = pd.read_csv("input/annotators.csv")
    annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))

    # compute the confusion counts of each corpus/annotator pair in parallel
    with mp.Pool(processes=8) as pool:
        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))

    data = data.sample(frac=1)  # shuffle the clips
    duration = data["duration"].sum()

    vtc = np.moveaxis(
        [[data[f"vtc_{j}_{i}"].values for i in range(4)] for j in range(4)], -1, 0
    )
    truth = np.transpose([data[f"truth_{i}"].values for i in range(4)])

    print(vtc.shape)

    rates = pd.read_csv("output/speech_dist.csv")

    training_set = data.groupby("corpus").agg(
        duration=("duration", "sum"), children=("child", lambda x: x.nunique())
    )
    training_set["duration"] /= 3600
    training_set.to_csv("output/training_set.csv")

    data = {
        "n_clips": truth.shape[0],
        "n_classes": truth.shape[1],
        "n_groups": data[args.group].nunique(),
        "n_validation": max(1, int(truth.shape[0] * args.validation)),
        "n_sim": 40,
        "group": 1 + data[args.group].astype("category").cat.codes.values,
        "truth": truth.astype(int),
        "vtc": vtc.astype(int),
        "rates_alphas": rates["alpha"].values,
        "rates_betas": rates["beta"].values,
    }

    print(f"clips: {data['n_clips']}")
    print(f"groups: {data['n_groups']}")
    print("true vocs: {}".format(np.sum(data["truth"])))
    print("vtc vocs: {}".format(np.sum(data["vtc"])))
    print("duration: {}".format(duration))

    with open(f"output/samples/data_{args.output}.pickle", "wb") as fp:
        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)

    posterior = stan.build(stan_code, data=data)
    fit = posterior.sample(num_chains=args.chains, num_samples=args.samples)
    df = fit.to_frame()
    df.to_parquet(f"output/samples/fit_{args.output}.parquet")