@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import segments_to_annotation
+from ChildProject.pipelines.metrics import AclewMetrics
+
+import argparse
+
+import datalad.api
+from os.path import join as opj
+from os.path import basename, exists
+
+import multiprocessing as mp
+
+import numpy as np
+import pandas as pd
+import pickle
+from pyannote.core import Annotation, Segment, Timeline
+
+import stan
+
+parser = argparse.ArgumentParser(
+    description="Main speaker-confusion model described throughout the notes."
+)
+# parser.add_argument("--group", default="child", choices=["corpus", "child"])
+parser.add_argument("--apply-bias-from", type=str, default="")
+parser.add_argument("--chains", default=4, type=int)
+parser.add_argument("--samples", default=2000, type=int)
+parser.add_argument("--validation", default=0, type=float)
+parser.add_argument("--simulated-children", default=40, type=int)
+parser.add_argument("--output", default="corpus_bias")
+args = parser.parse_args()
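+# Minimal usage sketch (assumed script name; the corpus label is only an
+# example and must match a corpus listed in input/annotators.csv):
+#   python model.py --chains 4 --samples 2000 --output corpus_bias
+#   python model.py --apply-bias-from vandam-daylong --output vandam_bias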
+
+
+def extrude(self, removed, mode: str = "intersection"):
+    # Remove `removed` from a pyannote Timeline (`self`) by cropping it to the
+    # gaps that `removed` leaves within the timeline's extent.
+    if isinstance(removed, Segment):
+        removed = Timeline([removed])
+
+    truncating_support = removed.gaps(support=self.extent())
+    # loose for truncate means strict for crop and vice-versa
+    if mode == "loose":
+        mode = "strict"
+    elif mode == "strict":
+        mode = "loose"
+
+    return self.crop(truncating_support, mode=mode)
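+# Illustrative example (hypothetical values): with the default
+# mode="intersection",
+#   extrude(Timeline([Segment(0, 10)]), Segment(2, 4))
+# returns the timeline {[0, 2], [4, 10]}, i.e. the input with [2, 4] carved out.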
+
+
+def compute_counts(parameters):
+    corpus = parameters["corpus"]
+    annotator = parameters["annotator"]
+    speakers = ["CHI", "OCH", "FEM", "MAL"]
+
+    project = ChildProject(parameters["path"])
+    am = AnnotationManager(project)
+    am.read()
+
+    # restrict to the portions annotated by both the VTC and the human annotator
+    intersection = AnnotationManager.intersection(am.annotations, ["vtc", annotator])
+
+    intersection["path"] = intersection.apply(
+        lambda r: opj(
+            project.path, "annotations", r["set"], "converted", r["annotation_filename"]
+        ),
+        axis=1,
+    )
+    datalad.api.get(list(intersection["path"].unique()))
+
+    intersection = intersection.merge(
+        project.recordings[["recording_filename", "child_id"]], how="left"
+    )
+    intersection["child"] = corpus + "_" + intersection["child_id"].astype(str)
+    intersection["duration"] = (
+        intersection["range_offset"] - intersection["range_onset"]
+    )
+    # durations are in milliseconds and each clip appears once per set (/2); print hours
+    print(corpus, annotator, (intersection["duration"] / 1000 / 2).sum() / 3600)
+
+    data = []
+    for child, ann in intersection.groupby("child"):
+        # print(corpus, child)
+
+        segments = am.get_collapsed_segments(ann)
+        if "speaker_type" not in segments.columns:
+            continue
+
+        segments = segments[segments["speaker_type"].isin(speakers)]
+
+        vtc = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == "vtc") & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
+        truth = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == annotator)
+                    & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
+        for speaker_A in speakers:
+            # detections of A that overlap true speech of A ("explained" detections)
+            vtc[f"{speaker_A}_vocs_explained"] = vtc[speaker_A].crop(
+                truth[speaker_A], mode="loose"
+            )
+            # false positives/negatives by set difference
+            vtc[f"{speaker_A}_vocs_fp"] = extrude(
+                vtc[speaker_A], vtc[f"{speaker_A}_vocs_explained"]
+            )
+            vtc[f"{speaker_A}_vocs_fn"] = extrude(
+                truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode="loose")
+            )
+
+            for speaker_B in speakers:
+                # false positives for A that overlap true speech of B
+                vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = vtc[
+                    f"{speaker_A}_vocs_fp"
+                ].crop(truth[speaker_B], mode="loose")
+
+                for speaker_C in speakers:
+                    if speaker_C != speaker_B and speaker_C != speaker_A:
+                        # discard confusions attributable to a third speaker C
+                        vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = extrude(
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"],
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"].crop(
+                                truth[speaker_C], mode="loose"
+                            ),
+                        )
+
+        d = {}
+        keep_child = True
+        for i, speaker_A in enumerate(speakers):
+            for j, speaker_B in enumerate(speakers):
+                if i != j:
+                    z = len(vtc[f"{speaker_A}_vocs_fp_{speaker_B}"])
+                else:
+                    z = min(
+                        len(vtc[f"{speaker_A}_vocs_explained"]), len(truth[speaker_A])
+                    )
+
+                d[f"vtc_{i}_{j}"] = z
+
+                # binomial successes cannot exceed the number of trials
+                if z > len(truth[speaker_B]):
+                    keep_child = False
+
+            d[f"truth_{i}"] = len(truth[speaker_A])
+        d["child"] = child
+
+        # annotated duration in seconds (each clip appears once per set, hence /2)
+        d["duration"] = ann["duration"].sum() / 2 / 1000
+
+        if keep_child:
+            data.append(d)
+
+    return pd.DataFrame(data).assign(
+        corpus=corpus,
+    )
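+# Each row of the returned frame corresponds to one child: vtc_i_j is the
+# number of VTC detections attributed to class i that overlap true speech of
+# class j (classes ordered CHI, OCH, FEM, MAL), and truth_j is the number of
+# true segments of class j; these become the binomial successes and trials of
+# the Stan model below.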
+
+
+def rates(parameters):
+    corpus = parameters["corpus"]
+    annotator = parameters["annotator"]
+    speakers = ["CHI", "OCH", "FEM", "MAL"]
+
+    project = ChildProject(parameters["path"])
+    am = AnnotationManager(project)
+    am.read()
+
+    # ACLEW metrics restricted to 09:00--18:00, aggregated per child
+    pipeline = AclewMetrics(
+        project,
+        vtc=annotator,
+        alice=None,
+        vcm=None,
+        from_time="09:00:00",
+        to_time="18:00:00",
+        by="child_id",
+    )
+    metrics = pipeline.extract()
+    metrics = pd.DataFrame(metrics).assign(corpus=corpus, annotator=annotator)
+    metrics["duration"] = metrics[f"duration_{annotator}"] / 1000 / 3600  # hours
+    metrics = metrics[metrics["duration"] > 0.01]
+
+    # metrics.dropna(subset={f"voc_{speaker.lower()}_ph" for speaker in speakers}&set(metrics.columns), inplace=True)
+
+    for i, speaker in enumerate(speakers):
+        # if f"voc_{speaker.lower()}_ph" not in metrics.columns:
+        #     metrics[f"speech_rate_{i}"] = pd.NA
+        # else:
+        # vocalization counts (per-hour rate times observed hours) and mean durations (s)
+        metrics[f"speech_rate_{i}"] = (metrics[f"voc_{speaker.lower()}_ph"] * metrics["duration"]).fillna(0).astype(int)
+        metrics[f"voc_duration_{i}"] = (metrics[f"avg_voc_dur_{speaker.lower()}"] / 1000).fillna(0)
+
+    return metrics
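+# rates() yields one row per child: speech_rate_i counts vocalizations of
+# class i observed between 09:00 and 18:00, and voc_duration_i is the mean
+# vocalization duration in seconds; both feed the gamma-Poisson part of the
+# Stan model.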
+
+
+stan_code = """
+data {
+    int<lower=1> n_clips;   // number of clips
+    int<lower=1> n_groups;  // number of groups (children)
+    int<lower=1> n_corpora; // number of corpora
+    int<lower=1> n_classes; // number of classes (speaker types)
+    int group[n_clips];
+    int corpus[n_clips];
+    int vtc[n_clips,n_classes,n_classes];
+    int truth[n_clips,n_classes];
+
+    int<lower=1> n_validation;
+    int<lower=1> n_sim;
+    int<lower=0> selected_corpus;
+
+    int<lower=1> n_rates;
+    int<lower=0> speech_rates[n_rates,n_classes];
+    real<lower=0> rates_alphas[n_classes];
+    real<lower=0> rates_betas[n_classes];
+    int group_corpus[n_rates];
+    real<lower=0> duration[n_rates];
+
+    real<lower=0> voc_duration[n_rates,n_classes];
+}
+
+parameters {
+    matrix<lower=0,upper=1>[n_classes,n_classes] mus;
+    matrix<lower=1>[n_classes,n_classes] etas;
+    matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];
+
+    matrix[n_classes,n_classes] corpus_bias[n_corpora];
+    matrix<lower=0>[n_classes,n_classes] corpus_sigma;
+
+    // speech rates
+    matrix<lower=1>[n_classes,n_corpora] speech_rate_alpha;
+    matrix<lower=0>[n_classes,n_corpora] speech_rate_mu;
+
+    matrix<lower=0>[n_classes,n_rates] speech_rate;
+
+    // voc duration
+    matrix<lower=1>[n_classes,n_corpora] voc_dur_alpha;
+    matrix<lower=0>[n_classes,n_corpora] voc_dur_mu;
+
+    matrix<lower=0>[n_classes,n_rates] voc_dur_child_mean;
+    //matrix<lower=0>[n_classes,n_rates] voc_dur_child_alpha;
+}
+
+transformed parameters {
+    matrix<lower=0>[n_classes,n_classes] alphas;
+    matrix<lower=0>[n_classes,n_classes] betas;
+
+    // mean/concentration parameterization of the Beta prior; the products
+    // must be elementwise (.*), not matrix products
+    alphas = mus .* etas;
+    betas = (1 - mus) .* etas;
+}
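+
+// Likelihood sketch for the confusion part (for each clip k and classes i, j):
+//   vtc[k,i,j] ~ Binomial(truth[k,j], inv_logit(logit(group_confusion) + corpus_bias))
+// i.e. a child-level confusion probability shifted by a corpus-level bias on
+// the logit scale.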
+
+model {
+    // clips 1..n_validation-1 are held out (none when --validation is 0)
+    for (k in n_validation:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                vtc[k,i,j] ~ binomial(
+                    truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i])
+                );
+            }
+        }
+    }
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            mus[i,j] ~ beta(1,1);
+            etas[i,j] ~ pareto(1,1.5);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+
+    // corpus-level biases are partially pooled through corpus_sigma
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            for (c in 1:n_corpora) {
+                corpus_bias[c,j,i] ~ normal(0, corpus_sigma[j,i]);
+            }
+            corpus_sigma[j,i] ~ normal(0, 1);
+        }
+    }
+
+    // speech rates
+    for (i in 1:n_classes) {
+        speech_rate_alpha[i,:] ~ normal(1, 1);
+        speech_rate_mu[i,:] ~ exponential(2);
+
+        voc_dur_alpha[i,:] ~ normal(1, 1);
+        voc_dur_mu[i,:] ~ exponential(1);
+
+        //voc_dur_child_alpha[i,:] ~ exponential(1);
+    }
+
+    for (g in 1:n_rates) {
+        for (i in 1:n_classes) {
+            // child-level rates drawn from their corpus-level gamma
+            // (the /1000 rescaling keeps speech_rate_mu of order unity)
+            speech_rate[i,g] ~ gamma(speech_rate_alpha[i,group_corpus[g]], (speech_rate_alpha[i,group_corpus[g]]/speech_rate_mu[i,group_corpus[g]])/1000);
+            speech_rates[g,i] ~ poisson(speech_rate[i,g]*duration[g]);
+
+            if (speech_rates[g,i] > 0) {
+                //voc_duration[g,i] ~ gamma(speech_rates[g,i]*voc_dur_child_alpha[i,g], speech_rates[g,i]*voc_dur_child_alpha[i,g]/voc_dur_child_mean[i,g]);
+                //voc_duration[g,i] ~ gamma(speech_rates[g,i], speech_rates[g,i]/voc_dur_child_mean[i,g]);
+                // total speech time (mean duration * count) modeled as a sum of
+                // speech_rates[g,i] exponential(1/voc_dur_child_mean) durations
+                target += gamma_lpdf(voc_duration[g,i] * speech_rates[g,i] | speech_rates[g,i], 1/voc_dur_child_mean[i,g]);
+            }
+            voc_dur_child_mean[i,g] ~ gamma(voc_dur_alpha[i,group_corpus[g]], voc_dur_alpha[i,group_corpus[g]]/voc_dur_mu[i,group_corpus[g]]);
+        }
+    }
+}
+
+generated quantities {
+    int pred[n_clips,n_classes,n_classes];
+    matrix[n_classes,n_classes] probs[n_groups];
+    matrix[n_classes,n_classes] log_lik[n_clips];
+
+    matrix[n_classes,n_classes] random_bias;
+    matrix[n_classes,n_classes] fixed_bias;
+
+    int sim_truth[n_sim,n_classes];
+    int sim_vtc[n_sim,n_classes];
+    real lambda;
+    real chi_adu_coef = 0; // null hypothesis: CHI rate independent of FEM+MAL
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            if (selected_corpus != 0) {
+                fixed_bias[j,i] = corpus_bias[selected_corpus,j,i];
+            }
+            else {
+                fixed_bias[j,i] = 0;
+            }
+            random_bias[j,i] = normal_rng(0, corpus_sigma[j,i]);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+
+    for (k in 1:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                if (k >= n_validation) {
+                    // fitted clips: posterior predictive with the child's own confusion rates
+                    pred[k,i,j] = binomial_rng(truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i]));
+                    log_lik[k,i,j] = binomial_lpmf(
+                        vtc[k,i,j] | truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i])
+                    );
+                }
+                else {
+                    // held-out clips: confusion rates redrawn from the population prior
+                    pred[k,i,j] = binomial_rng(
+                        truth[k,j], inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k],j,i])
+                    );
+                    log_lik[k,i,j] = beta_lpdf(probs[group[k],j,i] | alphas[j,i], betas[j,i]);
+                    log_lik[k,i,j] += binomial_lpmf(
+                        vtc[k,i,j] | truth[k,j], inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k],j,i])
+                    );
+                }
+            }
+        }
+    }
+
+    for (k in 1:n_sim) {
+        for (i in 2:n_classes) {
+            if (selected_corpus != 0) {
+                // *9: simulated children are observed for 9 hours (09:00 to 18:00)
+                lambda = gamma_rng(speech_rate_alpha[i,selected_corpus], (speech_rate_alpha[i,selected_corpus]/speech_rate_mu[i,selected_corpus])/1000)*9;
+            } else {
+                lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
+            }
+            sim_truth[k,i] = poisson_rng(lambda);
+        }
+        if (selected_corpus != 0) {
+            lambda = gamma_rng(speech_rate_alpha[1,selected_corpus], speech_rate_alpha[1,selected_corpus]/speech_rate_mu[1,selected_corpus]/1000)*9;
+        } else {
+            lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
+        }
+        sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
+    }
+
+    for (k in 1:n_sim) {
+        for (i in 1:n_classes) {
+            sim_vtc[k,i] = 0;
+            for (j in 1:n_classes) {
+                real p = logit(beta_rng(alphas[j,i], betas[j,i]));
+
+                if (selected_corpus != 0) {
+                    p += fixed_bias[j,i];
+                }
+                else {
+                    p += random_bias[j,i];
+                }
+                p = inv_logit(p);
+                sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
+            }
+        }
+    }
+}
+"""
+
+
+if __name__ == "__main__":
+    annotators = pd.read_csv("input/annotators.csv")
+    annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))
+
+    with mp.Pool(processes=8) as pool:
+        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))
+
+    data = data.sample(frac=1)  # shuffle rows so the validation split is random
+    duration = data["duration"].sum()
+
+    vtc = np.moveaxis(
+        [[data[f"vtc_{j}_{i}"].values for i in range(4)] for j in range(4)], -1, 0
+    )
+    truth = np.transpose([data[f"truth_{i}"].values for i in range(4)])
+
+    # speech rates at the child level
+    annotators = annotators[~annotators["annotator"].str.startswith("eaf_2021")]
+    with mp.Pool(processes=8) as pool:
+        speech_rates = pd.concat(pool.map(rates, annotators.to_dict(orient="records")))
+
+    speech_rates.reset_index(inplace=True)
+    # keep a single annotator per child so children are not counted twice
+    speech_rates = speech_rates.groupby(["corpus", "child_id"]).sample(1)
+    speech_rate_matrix = np.transpose([speech_rates[f"speech_rate_{i}"].values for i in range(4)])
+    voc_duration_matrix = np.transpose([speech_rates[f"voc_duration_{i}"].values for i in range(4)])
+
+    speech_rates.to_csv("rates.csv")
+
+    print(vtc.shape)
+
+    fixed_rates = pd.read_csv("output/speech_dist.csv")
+
+    training_set = data.groupby("corpus").agg(
+        duration=("duration", "sum"), children=("child", lambda x: x.nunique())
+    )
+    training_set["duration"] /= 3600  # seconds to hours
+    training_set.to_csv("output/training_set.csv")
+
+    data["corpus"] = data["corpus"].astype("category")
+    corpora = data["corpus"].cat.codes.values
+    corpora_codes = dict(enumerate(data["corpus"].cat.categories))
+    corpora_codes = {v: k for k, v in corpora_codes.items()}
+
+    # Stan uses 1-based indexing, hence the 1 + offsets below (the dict literal
+    # still reads from the `data` DataFrame; the name is rebound only afterwards)
+    data = {
+        "n_clips": truth.shape[0],
+        "n_classes": truth.shape[1],
+        "n_groups": data["child"].nunique(),
+        "n_corpora": data["corpus"].nunique(),
+        "n_validation": max(1, int(truth.shape[0] * args.validation)),
+        "n_sim": args.simulated_children,
+        "group": 1 + data["child"].astype("category").cat.codes.values,
+        "corpus": 1 + corpora,
+        "selected_corpus": (
+            1 + corpora_codes[args.apply_bias_from]
+            if args.apply_bias_from in corpora_codes
+            else 0
+        ),
+        "truth": truth.astype(int),
+        "vtc": vtc.astype(int),
+        "rates_alphas": fixed_rates["alpha"].values,
+        "rates_betas": fixed_rates["beta"].values,
+        "speech_rates": speech_rate_matrix.astype(int),
+        "voc_duration": voc_duration_matrix,
+        "group_corpus": 1 + speech_rates["corpus"].map(corpora_codes).astype(int).values,
+        "duration": speech_rates["duration"].values,
+        "n_rates": len(speech_rates),
+    }
+
+    print(f"clips: {data['n_clips']}")
+    print(f"groups: {data['n_groups']}")
+    print("true vocs: {}".format(np.sum(data["truth"])))
+    print("vtc vocs: {}".format(np.sum(data["vtc"])))
+    print("duration: {}".format(duration))
+
+    print("selected corpus: {}".format(data["selected_corpus"]))
+
+    with open(f"output/samples/data_{args.output}.pickle", "wb") as fp:
+        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)
+
+    posterior = stan.build(stan_code, data=data)
+    fit = posterior.sample(num_chains=args.chains, num_samples=args.samples)
+    df = fit.to_frame()
+    df.to_parquet(f"output/samples/fit_{args.output}.parquet")
+
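+# Posterior draws can then be inspected offline; a minimal sketch (column
+# names follow pystan's to_frame() flattening, e.g. "mus.1.1" -- check them
+# against the actual parquet file):
+#   df = pd.read_parquet("output/samples/fit_corpus_bias.parquet")
+#   print(df.filter(regex=r"^mus\.").mean())  # posterior means of the confusion matrix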