
[DATALAD] Recorded changes

Lucas Gautheron, 1 year ago
commit 94336fd387

+ 491 - 0
code/models/corpus_bias_rates.py

@@ -0,0 +1,491 @@
+#!/usr/bin/env python3
+
+from ChildProject.projects import ChildProject
+from ChildProject.annotations import AnnotationManager
+from ChildProject.metrics import segments_to_annotation
+from ChildProject.pipelines.metrics import AclewMetrics
+
+import argparse
+
+import datalad.api
+from os.path import join as opj
+from os.path import basename, exists
+
+import multiprocessing as mp
+
+import numpy as np
+import pandas as pd
+import pickle
+from pyannote.core import Annotation, Segment, Timeline
+
+import stan
+
+parser = argparse.ArgumentParser(
+    description="main model described throughout the notes."
+)
+# parser.add_argument("--group", default="child", choices=["corpus", "child"])
+parser.add_argument("--apply-bias-from", type=str, default="")
+parser.add_argument("--chains", default=4, type=int)
+parser.add_argument("--samples", default=2000, type=int)
+parser.add_argument("--validation", default=0, type=float)
+parser.add_argument("--simulated-children", default=40, type=int)
+parser.add_argument("--output", default="corpus_bias")
+args = parser.parse_args()
+
+
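+# Remove the `removed` segments from a pyannote Timeline: the complement of
+# Timeline.crop(), obtained by cropping onto the gaps of `removed` within the
+# timeline's extent.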
+def extrude(self, removed, mode: str = "intersection"):
+    if isinstance(removed, Segment):
+        removed = Timeline([removed])
+
+    truncating_support = removed.gaps(support=self.extent())
+    # loose for truncate means strict for crop and vice-versa
+    if mode == "loose":
+        mode = "strict"
+    elif mode == "strict":
+        mode = "loose"
+
+    return self.crop(truncating_support, mode=mode)
+
+
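+# For one (corpus, annotator) pair, compute per-child confusion counts between
+# the VTC output and the human annotation: vtc_i_j is the number of VTC
+# vocalizations attributed to speaker class i that are explained by true speech
+# of class j, and truth_i is the number of true vocalizations of class i.
+# Children for which any vtc_i_j exceeds truth_j are discarded.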
+def compute_counts(parameters):
+    corpus = parameters["corpus"]
+    annotator = parameters["annotator"]
+    speakers = ["CHI", "OCH", "FEM", "MAL"]
+
+    project = ChildProject(parameters["path"])
+    am = AnnotationManager(project)
+    am.read()
+
+    intersection = AnnotationManager.intersection(am.annotations, ["vtc", annotator])
+
+    intersection["path"] = intersection.apply(
+        lambda r: opj(
+            project.path, "annotations", r["set"], "converted", r["annotation_filename"]
+        ),
+        axis=1,
+    )
+    datalad.api.get(list(intersection["path"].unique()))
+
+    intersection = intersection.merge(
+        project.recordings[["recording_filename", "child_id"]], how="left"
+    )
+    intersection["child"] = corpus + "_" + intersection["child_id"].astype(str)
+    intersection["duration"] = (
+        intersection["range_offset"] - intersection["range_onset"]
+    )
+    print(corpus, annotator, (intersection["duration"] / 1000 / 2).sum() / 3600)
+
+    data = []
+    for child, ann in intersection.groupby("child"):
+        # print(corpus, child)
+
+        segments = am.get_collapsed_segments(ann)
+        if "speaker_type" not in segments.columns:
+            continue
+
+        segments = segments[segments["speaker_type"].isin(speakers)]
+
+        vtc = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == "vtc") & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
+        truth = {
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == annotator)
+                    & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
+            for speaker in speakers
+        }
+
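+        # For each speaker class A: VTC vocalizations explained by A's own true
+        # speech, false positives (no overlap with A's true speech), and false
+        # negatives (A's true vocalizations missed by the VTC). A false positive
+        # of A is attributed to class B when it overlaps B's true speech but not
+        # that of any third class C.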
+        for speaker_A in speakers:
+            vtc[f"{speaker_A}_vocs_explained"] = vtc[speaker_A].crop(
+                truth[speaker_A], mode="loose"
+            )
+            vtc[f"{speaker_A}_vocs_fp"] = extrude(
+                vtc[speaker_A], vtc[f"{speaker_A}_vocs_explained"]
+            )
+            vtc[f"{speaker_A}_vocs_fn"] = extrude(
+                truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode="loose")
+            )
+
+            for speaker_B in speakers:
+                vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = vtc[
+                    f"{speaker_A}_vocs_fp"
+                ].crop(truth[speaker_B], mode="loose")
+
+                for speaker_C in speakers:
+                    if speaker_C != speaker_B and speaker_C != speaker_A:
+                        vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = extrude(
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"],
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"].crop(
+                                truth[speaker_C], mode="loose"
+                            ),
+                        )
+
+        d = {}
+        keep_child = True
+        for i, speaker_A in enumerate(speakers):
+            for j, speaker_B in enumerate(speakers):
+                if i != j:
+                    z = len(vtc[f"{speaker_A}_vocs_fp_{speaker_B}"])
+                else:
+                    z = min(
+                        len(vtc[f"{speaker_A}_vocs_explained"]), len(truth[speaker_A])
+                    )
+
+                d[f"vtc_{i}_{j}"] = z
+
+                if z > len(truth[speaker_B]):
+                    keep_child = False
+
+            d[f"truth_{i}"] = len(truth[speaker_A])
+            d["child"] = child
+
+        d["duration"] = ann["duration"].sum() / 2 / 1000
+
+        if keep_child:
+            data.append(d)
+
+    return pd.DataFrame(data).assign(
+        corpus=corpus,
+    )
+
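+
+# For one (corpus, annotator) pair, extract per-child ACLEW metrics from the
+# human annotations (restricted to 09:00-18:00): annotated duration in hours,
+# vocalization counts and average vocalization durations per speaker class.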
+def rates(parameters):
+    corpus = parameters["corpus"]
+    annotator = parameters["annotator"]
+    speakers = ["CHI", "OCH", "FEM", "MAL"]
+
+    project = ChildProject(parameters["path"])
+    am = AnnotationManager(project)
+    am.read()
+
+    pipeline = AclewMetrics(
+        project,
+        vtc=annotator,
+        alice=None,
+        vcm=None,
+        from_time="09:00:00",
+        to_time="18:00:00",
+        by="child_id",
+    )
+    metrics = pipeline.extract()
+    metrics = pd.DataFrame(metrics).assign(corpus=corpus, annotator=annotator)
+    metrics["duration"] = metrics[f"duration_{annotator}"] / 1000 / 3600  # hours
+    metrics = metrics[metrics["duration"] > 0.01]
+
+    # metrics.dropna(subset={f"voc_{speaker.lower()}_ph" for speaker in speakers}&set(metrics.columns), inplace=True)
+
+    for i, speaker in enumerate(speakers):
+        # if f"voc_{speaker.lower()}_ph" not in metrics.columns:
+        #     metrics[f"speech_rate_{i}"] = pd.NA
+        # else:
+        metrics[f"speech_rate_{i}"] = (metrics[f"voc_{speaker.lower()}_ph"]*(metrics["duration"])).fillna(0).astype(int)
+        metrics[f"voc_duration_{i}"] = (metrics[f"avg_voc_dur_{speaker.lower()}"]/1000).fillna(0)
+
+    return metrics
+
+stan_code = """
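+// Joint model:
+// (1) a confusion model relating VTC counts to human-annotated counts, with
+//     child-level confusion probabilities and corpus-level biases on the
+//     logit scale;
+// (2) a speech-rate model: per-child vocalization rates and mean vocalization
+//     durations drawn from corpus-level gamma distributions.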
+data {
+  int<lower=1> n_clips;   // number of clips
+  int<lower=1> n_groups; // number of groups
+  int<lower=1> n_corpora;
+  int<lower=1> n_classes; // number of classes
+  int group[n_clips];
+  int corpus[n_clips];
+  int vtc[n_clips,n_classes,n_classes];
+  int truth[n_clips,n_classes];
+
+  int<lower=1> n_validation;
+  int<lower=1> n_sim;
+  int<lower=0> selected_corpus;
+
+  int<lower=1> n_rates;
+  int<lower=0> speech_rates[n_rates,n_classes];
+  real<lower=0> rates_alphas[n_classes];
+  real<lower=0> rates_betas[n_classes];
+  int group_corpus[n_rates];
+  real<lower=0> duration[n_rates];
+
+  real<lower=0> voc_duration[n_rates,n_classes];
+
+}
+
+parameters {
+  matrix<lower=0,upper=1>[n_classes,n_classes] mus;
+  matrix<lower=1>[n_classes,n_classes] etas;
+  matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];
+
+  matrix[n_classes,n_classes] corpus_bias[n_corpora];
+  matrix<lower=0>[n_classes,n_classes] corpus_sigma;
+
+  // speech rates
+  matrix<lower=1>[n_classes,n_corpora] speech_rate_alpha;
+  matrix<lower=0>[n_classes,n_corpora] speech_rate_mu;
+
+  matrix<lower=0> [n_classes,n_rates] speech_rate;
+
+  // voc duration
+  matrix<lower=1> [n_classes,n_corpora] voc_dur_alpha;
+  matrix<lower=0> [n_classes,n_corpora] voc_dur_mu;
+
+  matrix<lower=0> [n_classes,n_rates] voc_dur_child_mean;
+  //matrix<lower=0> [n_classes,n_rates] voc_dur_child_alpha;
+}
+
+transformed parameters {
+  matrix<lower=0>[n_classes,n_classes] alphas;
+  matrix<lower=0>[n_classes,n_classes] betas;
+
+  // mean/concentration parameterization of the beta priors (elementwise products)
+  alphas = mus .* etas;
+  betas = (1 - mus) .* etas;
+}
+
+model {
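+    // Confusion likelihood: the number of VTC vocalizations of class i explained
+    // by true vocalizations of class j is binomial in the number of true
+    // vocalizations of class j, with a probability combining the child's
+    // confusion rate and the corpus bias on the logit scale.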
+    for (k in n_validation:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                vtc[k,i,j] ~ binomial(
+                    truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k],j,i])
+                );
+            }
+        }
+    }
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            mus[i,j] ~ beta(1,1);
+            etas[i,j] ~ pareto(1,1.5);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            for (c in 1:n_corpora) {
+                corpus_bias[c,j,i] ~ normal(0, corpus_sigma[j,i]);
+            }
+            corpus_sigma[j,i] ~ normal(0, 1);
+        }
+    }
+
+    // speech rates
+    for (i in 1:n_classes) {
+        speech_rate_alpha[i,:] ~ normal(1, 1);
+        speech_rate_mu[i,:] ~ exponential(2);
+
+        voc_dur_alpha[i,:] ~ normal(1, 1);
+        voc_dur_mu[i,:] ~ exponential(1);
+
+        //voc_dur_child_alpha[i,:] ~ exponential(1);
+    }
+
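+    // Observation model for rates and durations: per-child vocalization counts
+    // are Poisson with rate speech_rate (voc/h) times the observed duration (h);
+    // the total speech duration (count x mean duration) is gamma with shape equal
+    // to the count, i.e. individual durations are exponential with mean
+    // voc_dur_child_mean.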
+    for (g in 1:n_rates) {
+        for (i in 1:n_classes) {
+            speech_rate[i,g] ~ gamma(speech_rate_alpha[i,group_corpus[g]], (speech_rate_alpha[i,group_corpus[g]]/speech_rate_mu[i,group_corpus[g]])/1000);
+            speech_rates[g,i] ~ poisson(speech_rate[i,g]*duration[g]);
+
+            if (speech_rates[g,i] > 0) {
+                //voc_duration[g,i] ~ gamma(speech_rates[g,i]*voc_dur_child_alpha[i,g], speech_rates[g,i]*voc_dur_child_alpha[i,g]/voc_dur_child_mean[i,g]);
+                //voc_duration[g,i] ~ gamma(speech_rates[g,i], speech_rates[g,i]/voc_dur_child_mean[i,g]);
+                target += gamma_lpdf(voc_duration[g,i] * speech_rates[g,i] |  speech_rates[g,i], 1/voc_dur_child_mean[i,g]);
+            }
+            voc_dur_child_mean[i,g] ~ gamma(voc_dur_alpha[i,group_corpus[g]], voc_dur_alpha[i,group_corpus[g]]/voc_dur_mu[i,group_corpus[g]]);
+        }
+    }
+}
+
+generated quantities {
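+    // Posterior predictive counts and pointwise log-likelihoods for each clip,
+    // plus simulated children: true vocalization counts are drawn from
+    // corpus-level (or externally provided) rate distributions and passed
+    // through the confusion model, using either the selected corpus's bias or
+    // a random corpus bias, under the null hypothesis chi_adu_coef = 0.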
+    int pred[n_clips,n_classes,n_classes];
+    matrix[n_classes,n_classes] probs[n_groups];
+    matrix[n_classes,n_classes] log_lik[n_clips];
+
+    matrix[n_classes,n_classes] random_bias;
+    matrix[n_classes,n_classes] fixed_bias;
+
+    int sim_truth[n_sim,n_classes];
+    int sim_vtc[n_sim,n_classes];
+    vector[n_classes] lambdas;
+    real chi_adu_coef = 0; // null-hypothesis
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            if (selected_corpus != 0) {
+                fixed_bias[j, i] = corpus_bias[selected_corpus, j, i];
+            }
+            else {
+                fixed_bias[j, i] = 0;
+            }
+            random_bias[j,i] = normal_rng(0, corpus_sigma[j,i]);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
+            }
+        }
+    }
+
+    for (k in 1:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                if (k >= n_validation) {
+                    pred[k,i,j] = binomial_rng(truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k], j,i]));
+                    log_lik[k,i,j] = binomial_lpmf(
+                        vtc[k,i,j] | truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[corpus[k], j,i])
+                    );
+                }
+                else {
+                    pred[k,i,j] = binomial_rng(
+                        truth[k,j], inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k], j,i])
+                    );
+                    log_lik[k,i,j] = beta_lpdf(probs[group[k],j,i] | alphas[j,i], betas[j,i]);
+                    log_lik[k,i,j] += binomial_lpmf(
+                        vtc[k,i,j] | truth[k,j], inv_logit(logit(probs[group[k],j,i]) + corpus_bias[corpus[k], j,i])
+                    );
+                }
+            }
+        }
+    }
+
+    real lambda;
+    for (k in 1:n_sim) {
+        for (i in 2:n_classes) {
+            if (selected_corpus != 0) {
+                lambda = gamma_rng(speech_rate_alpha[i,selected_corpus], (speech_rate_alpha[i,selected_corpus]/speech_rate_mu[i,selected_corpus])/1000)*9;
+            } else {
+                lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
+            }
+            sim_truth[k,i] = poisson_rng(lambda);
+        }
+        if (selected_corpus != 0) {
+            lambda = gamma_rng(speech_rate_alpha[1,selected_corpus], speech_rate_alpha[1,selected_corpus]/speech_rate_mu[1,selected_corpus]/1000)*9;
+        } else {
+            lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
+        }
+        sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
+    }
+
+    for (k in 1:n_sim) {
+        for (i in 1:n_classes) {
+            sim_vtc[k,i] = 0;
+            for (j in 1:n_classes) {
+                real p = logit(beta_rng(alphas[j,i], betas[j,i]));
+
+                if (selected_corpus != 0) {
+                    p += fixed_bias[j,i];
+                }
+                else {
+                    p += random_bias[j,i];
+                }
+                p = inv_logit(p);
+                sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
+            }
+        }
+    }
+}
+"""
+
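+# Pipeline: compute confusion counts and per-child speech rates for every
+# (corpus, annotator) pair listed in input/annotators.csv, assemble the Stan
+# data, and sample from the joint model.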
+if __name__ == "__main__":
+    annotators = pd.read_csv("input/annotators.csv")
+    annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))
+
+    with mp.Pool(processes=8) as pool:
+        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))
+
+    data = data.sample(frac=1)
+    duration = data["duration"].sum()
+
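+    # vtc[k, i, j]: number of VTC vocalizations of class i explained by true
+    # vocalizations of class j in row k; truth[k, j]: true counts per class.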
+    vtc = np.moveaxis(
+        [[data[f"vtc_{j}_{i}"].values for i in range(4)] for j in range(4)], -1, 0
+    )
+    truth = np.transpose([data[f"truth_{i}"].values for i in range(4)])
+
+    # speech rates at the child level
+    annotators = annotators[~annotators['annotator'].str.startswith('eaf_2021')]
+    with mp.Pool(processes=8) as pool:
+        speech_rates = pd.concat(pool.map(rates, annotators.to_dict(orient="records")))
+
+    speech_rates.reset_index(inplace=True)
+    speech_rates = speech_rates.groupby(["corpus", "child_id"]).sample(1)
+    speech_rate_matrix = np.transpose([speech_rates[f"speech_rate_{i}"].values for i in range(4)])
+    voc_duration_matrix = np.transpose([speech_rates[f"voc_duration_{i}"].values for i in range(4)])
+
+    speech_rates.to_csv("rates.csv")
+
+
+    print(vtc.shape)
+
+    fixed_rates = pd.read_csv("output/speech_dist.csv")
+
+    training_set = data.groupby("corpus").agg(
+        duration=("duration", "sum"), children=("child", lambda x: x.nunique())
+    )
+    training_set["duration"] /= 3600
+    training_set.to_csv("output/training_set.csv")
+
+    data["corpus"] = data["corpus"].astype("category")
+    corpora = data["corpus"].cat.codes.values
+    corpora_codes = dict(enumerate(data["corpus"].cat.categories))
+    corpora_codes = {v: k for k, v in corpora_codes.items()}
+
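+    # Assemble the Stan data dictionary (group/corpus indices are 1-based for Stan).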
+    data = {
+        "n_clips": truth.shape[0],
+        "n_classes": truth.shape[1],
+        "n_groups": data["child"].nunique(),
+        "n_corpora": data["corpus"].nunique(),
+        "n_validation": max(1, int(truth.shape[0] * args.validation)),
+        "n_sim": args.simulated_children,
+        "group": 1 + data["child"].astype("category").cat.codes.values,
+        "corpus": 1 + corpora,
+        "selected_corpus": (
+            1 + corpora_codes[args.apply_bias_from]
+            if args.apply_bias_from in corpora_codes
+            else 0
+        ),
+        "truth": truth.astype(int),
+        "vtc": vtc.astype(int),
+        "rates_alphas": fixed_rates["alpha"].values,
+        "rates_betas": fixed_rates["beta"].values,
+        "speech_rates": speech_rate_matrix.astype(int),
+        "voc_duration": voc_duration_matrix,
+        "group_corpus": 1+speech_rates["corpus"].map(corpora_codes).astype(int).values,
+        "duration": speech_rates["duration"].values,
+        "n_rates": len(speech_rates)
+    }
+
+    print(f"clips: {data['n_clips']}")
+    print(f"groups: {data['n_groups']}")
+    print("true vocs: {}".format(np.sum(data["truth"])))
+    print("vtc vocs: {}".format(np.sum(data["vtc"])))
+    print("duration: {}".format(duration))
+
+    print("selected corpus: {}".format(data["selected_corpus"]))
+
+    with open(f"output/samples/data_{args.output}.pickle", "wb") as fp:
+        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)
+
+    posterior = stan.build(stan_code, data=data)
+    fit = posterior.sample(num_chains=args.chains, num_samples=args.samples)
+    df = fit.to_frame()
+    df.to_parquet(f"output/samples/fit_{args.output}.parquet")
+

+ 0 - 0
code/models/speech_distribution.py


+ 237 - 0
code/plots/speech_rate.py

@@ -0,0 +1,237 @@
+#!/usr/bin/env python3
+
+import pandas as pd
+import pickle
+import numpy as np
+from scipy.special import logit, expit
+
+import argparse
+import matplotlib
+import matplotlib.pyplot as plt
+
+from scipy.stats import gamma
+
+from os.path import basename
+
+
+matplotlib.use("pgf")
+matplotlib.rcParams.update(
+    {
+        "pgf.texsystem": "pdflatex",
+        "font.family": "serif",
+        "font.serif": "Times New Roman",
+        "text.usetex": True,
+        "pgf.rcfonts": False,
+    }
+)
+
+
+def set_size(width, fraction=1, ratio=None):
+    fig_width_pt = width * fraction
+    inches_per_pt = 1 / 72.27
+    if ratio is None:
+        ratio = (5**0.5 - 1) / 2
+    fig_width_in = fig_width_pt * inches_per_pt
+    fig_height_in = fig_width_in * ratio
+    return fig_width_in, fig_height_in
+
+
+parser = argparse.ArgumentParser(description="speech_rate")
+parser.add_argument("data")
+parser.add_argument("fit")
+parser.add_argument("output")
+parser.add_argument("--selected-corpus", type=int, default=-1)
+
+args = parser.parse_args()
+
+with open(args.data, "rb") as fp:
+    data = pickle.load(fp)
+
+fit = pd.read_parquet(args.fit)
+
+corpora = pd.read_csv("output/training_set.csv")
+corpora = corpora["corpus"].map(basename).tolist()
+
+speakers = ["CHI", "OCH", "FEM", "MAL"]
+
+n_groups = data["n_groups"]
+n_corpora = data["n_corpora"]
+
+colors = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf']
+
+mu = np.zeros((data['n_corpora'], 4))
+mu_low = np.zeros((data['n_corpora'], 4))
+mu_high = np.zeros((data['n_corpora'], 4))
+
+inputs = np.zeros((data['n_corpora'], len(fit)))
+input = np.zeros(data['n_corpora'])
+input_low = np.zeros(data['n_corpora'])
+input_high = np.zeros(data['n_corpora'])
+
+output_speech =  np.zeros((data['n_corpora'], len(fit)))
+input_speech =  np.zeros((data['n_corpora'], len(fit)))
+
+output_dur = np.zeros(data['n_corpora'])
+output_dur_low = np.zeros(data['n_corpora'])
+output_dur_high = np.zeros(data['n_corpora'])
+
+input_dur = np.zeros(data['n_corpora'])
+input_dur_low = np.zeros(data['n_corpora'])
+input_dur_high = np.zeros(data['n_corpora'])
+
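+# For each corpus and speaker class, summarize the posterior of the population
+# mean vocalization rate (voc/h) and vocalization duration; "output" refers to
+# the key child (CHI) and "input" aggregates the other classes (OCH, FEM, MAL).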
+for c in range(data['n_corpora']):
+    for i in range(4):
+        # alphas = fit[f"speech_rate_alpha.{i+1}.{c+1}"].values
+        # betas = fit[f"speech_rate_beta.{i+1}.{c+1}"].values/1000
+
+        # voc_dur_alphas = fit[f"voc_dur_alpha.{i+1}.{c+1}"].values
+        # voc_dur_betas = fit[f"voc_dur_beta.{i+1}.{c+1}"].values
+
+        mus = 1000*fit[f"speech_rate_mu.{i+1}.{c+1}"].values
+        voc_dur_means = fit[f"voc_dur_mu.{i+1}.{c+1}"].values
+
+        print(corpora[c], voc_dur_means.mean())
+
+        mu[c,i] = np.mean(mus)
+        mu_low[c,i] = np.quantile(mus, q=0.05/2)
+        mu_high[c,i] = np.quantile(mus, q=1-0.05/2)
+
+        if i > 0:
+            inputs[c,:] += mus
+            input_speech[c,:] += voc_dur_means*mus
+        else:
+            output_speech[c,:] = voc_dur_means*mus
+
+    
+    input[c] = inputs[c,:].mean()
+    input_low[c] = np.quantile(inputs[c,:], q=0.05/2)
+    input_high[c] = np.quantile(inputs[c,:], q=1-0.05/2)
+
+    output_dur[c] = output_speech[c,:].mean()/60
+    output_dur_low[c] = np.quantile(output_speech[c,:], q=0.05/2)/60
+    output_dur_high[c] = np.quantile(output_speech[c,:], q=1-0.05/2)/60
+
+    input_dur[c] = input_speech[c,:].mean()/60
+    input_dur_low[c] = np.quantile(input_speech[c,:], q=0.05/2)/60
+    input_dur_high[c] = np.quantile(input_speech[c,:], q=1-0.05/2)/60
+
+
+keep = [c not in ["winnipeg", "tsimane2017"] for c in corpora]
+
+corpora = [corpora[i] for i in range(len(corpora)) if keep[i]]
+mu = mu[keep,:]
+mu_low = mu_low[keep,:]
+mu_high = mu_high[keep,:]
+input = input[keep]
+input_low = input_low[keep]
+input_high = input_high[keep]
+
+input_dur = input_dur[keep]
+input_dur_low = input_dur_low[keep]
+input_dur_high = input_dur_high[keep]
+
+output_dur = output_dur[keep]
+output_dur_low = output_dur_low[keep]
+output_dur_high = output_dur_high[keep]
+
+print(corpora)
+print(mu.shape)
+
+fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
+
+for row in range(2):
+    for col in range(2):
+        i = row+2*col
+        axes[row,col].scatter(np.arange(mu.shape[0]), mu[:,i])
+        axes[row,col].errorbar(np.arange(mu.shape[0]), mu[:,i], (mu[:,i]-mu_low[:,i], mu_high[:,i]-mu[:,i]) ,ls="none")
+        axes[row,col].set_title(speakers[i])
+        axes[row,col].set_ylim(0,1200)
+        axes[row,col].set_xticks(np.arange(mu.shape[0]))
+        axes[row,col].set_xticklabels(corpora)
+        axes[row,col].xaxis.set_tick_params(rotation=90)
+        
+        if col==0:
+            axes[row,col].set_ylabel("voc/h")
+
+fig.suptitle("Latent population mean voc/h\n(human annotations)")
+fig.savefig("output/quantities.png", bbox_inches="tight")
+
+fig, axes = plt.subplots(1, 2, sharex=True, sharey=True)
+
+for col in range(2):
+    if col == 0:
+        axes[col].scatter(np.arange(mu.shape[0]), mu[:,0])
+        axes[col].errorbar(np.arange(mu.shape[0]), mu[:,0], (mu[:,0]-mu_low[:,0], mu_high[:,0]-mu[:,0]) ,ls="none")
+    else:
+        axes[col].scatter(np.arange(len(input)), input)
+        axes[col].errorbar(np.arange(len(input)), input, (input-input_low, input_high-input) ,ls="none")
+
+    axes[col].set_title("output" if col == 0 else "input")
+    axes[col].set_ylim(0,2000)
+    axes[col].set_xticks(np.arange(mu.shape[0]))
+    axes[col].set_xticklabels(corpora)
+    axes[col].xaxis.set_tick_params(rotation=90)
+    
+    if col==0:
+        axes[col].set_ylabel("voc/h")
+
+fig.suptitle("Latent population mean voc/h\n(human annotations)")
+fig.savefig("output/input_output.png", bbox_inches="tight")
+
+
+fig, axes = plt.subplots(1, 2, sharex=True, sharey=True)
+
+for col in range(2):
+    if col == 0:
+        axes[col].scatter(np.arange(len(output_dur)), output_dur)
+        axes[col].errorbar(np.arange(len(output_dur)), output_dur, (output_dur-output_dur_low, output_dur_high-output_dur) ,ls="none")
+    else:
+        axes[col].scatter(np.arange(len(input_dur)), input_dur)
+        axes[col].errorbar(np.arange(len(input_dur)), input_dur, (input_dur-input_dur_low, input_dur_high-input_dur) ,ls="none")
+
+    axes[col].set_title("output" if col == 0 else "input")
+    axes[col].set_ylim(0,60)
+    axes[col].set_xticks(np.arange(mu.shape[0]))
+    axes[col].set_xticklabels(corpora)
+    axes[col].xaxis.set_tick_params(rotation=90)
+    
+    if col==0:
+        axes[col].set_ylabel("min/h")
+
+fig.suptitle("Latent population mean min/h\n(human annotations)")
+fig.savefig("output/input_output_dur.png", bbox_inches="tight")
+
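+# Posterior distribution of child-level vocalization rates (voc/h) implied by
+# the fitted gamma distribution for the selected corpus, with a 90% band across
+# posterior draws.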
+fig, ax = plt.subplots(1,1)
+
+for i in range(4):
+    alphas = fit[f"speech_rate_alpha.{i+1}.{args.selected_corpus}"].values
+    scale = 1000*fit[f"speech_rate_mu.{i+1}.{args.selected_corpus}"].values/alphas
+
+    x = np.linspace(0,500,200,True)
+    pdf = np.zeros((len(x), len(alphas)))
+    for k in range(len(x)):
+        pdf[k,:] = gamma.pdf(x[k], alphas, np.zeros(len(alphas)), scale)
+
+    pdf_low = np.quantile(pdf, q=0.05, axis=1)
+    pdf_high = np.quantile(pdf, q=0.95, axis=1)
+    pdf_mean = np.mean(pdf, axis=1)
+
+    ax.plot(x, pdf_mean, color=colors[i], label=speakers[i])
+    ax.fill_between(x, pdf_low, pdf_high, color=colors[i], alpha=0.2)
+
+    ax.set_xlim(0, 500)
+
+    ax.set_xlabel("voc/h")
+
+    # ax.axvline(np.mean(data), linestyle="--", linewidth=0.5, color="#333", alpha=1)
+    # ax.text(0.5, 4.5, f"{low:.2f} - {high:.2f}", ha="center", va="center")
+
+ax.legend()
+
+corpus = pd.read_csv("output/training_set.csv")["corpus"].map(basename).tolist()[args.selected_corpus-1]
+fig.suptitle(f"voc/h distribution for each kind of speaker ({corpus})")
+fig.savefig(f"output/speech_distribution_{corpus}.png", bbox_inches="tight")
+
+plt.show()
+
+

+ 1 - 0
output/input_output.png

@@ -0,0 +1 @@
+../.git/annex/objects/MF/W1/MD5E-s23360--7ce11f2d6826f87d25d2f9943a4b1153.png/MD5E-s23360--7ce11f2d6826f87d25d2f9943a4b1153.png

+ 1 - 0
output/input_output_dur.png

@@ -0,0 +1 @@
+../.git/annex/objects/vW/Gm/MD5E-s21006--4392730d21b9551dcfb2a7b2631f6179.png/MD5E-s21006--4392730d21b9551dcfb2a7b2631f6179.png

+ 1 - 0
output/quantities.png

@@ -0,0 +1 @@
+../.git/annex/objects/q0/7w/MD5E-s31766--486ddaa61ceae02f8d6f6616c168f5a5.png/MD5E-s31766--486ddaa61ceae02f8d6f6616c168f5a5.png

+ 1 - 0
output/samples/data_corpus_bias_rates_25k_vanuatu1.pickle

@@ -0,0 +1 @@
+../../.git/annex/objects/X6/Qj/MD5E-s40277--6d7a75caac07b683733c9ae82e65b6fc/MD5E-s40277--6d7a75caac07b683733c9ae82e65b6fc

+ 1 - 0
output/samples/data_corpus_bias_rates_25k_vanuatu3.pickle

@@ -0,0 +1 @@
+../../.git/annex/objects/65/M2/MD5E-s40277--e42ac94aa38b65e332a6649797820cd5/MD5E-s40277--e42ac94aa38b65e332a6649797820cd5

+ 1 - 0
output/samples/fit_corpus_bias_rates_25k_vandam1.parquet

@@ -0,0 +1 @@
+../../.git/annex/objects/xM/Qz/MD5E-s2137699394--2eeefdd2b9fd4a88128089ebed3a610e/MD5E-s2137699394--2eeefdd2b9fd4a88128089ebed3a610e

+ 1 - 0
output/samples/fit_corpus_bias_rates_25k_vanuatu1.parquet

@@ -0,0 +1 @@
+../../.git/annex/objects/zj/xg/MD5E-s2244006222--5a023f8ed40aa28d88e5d3f667305e38/MD5E-s2244006222--5a023f8ed40aa28d88e5d3f667305e38

+ 1 - 0
output/samples/fit_corpus_bias_rates_25k_vanuatu3.parquet

@@ -0,0 +1 @@
+../../.git/annex/objects/JG/5j/MD5E-s2243271668--b1341a8046c120093176a7ce5a661e4f/MD5E-s2243271668--b1341a8046c120093176a7ce5a661e4f

+ 1 - 0
output/speech_distribution_vanuatu.pdf

@@ -0,0 +1 @@
+../.git/annex/objects/xJ/z7/MD5E-s49926--ab9204d0cf18b182f4ee23a6a0b727db.pdf/MD5E-s49926--ab9204d0cf18b182f4ee23a6a0b727db.pdf

+ 1 - 0
output/speech_distribution_vanuatu.png

@@ -0,0 +1 @@
+../.git/annex/objects/Qf/zX/MD5E-s57063--a64a59538d1e775d0d0207174d19e917.png/MD5E-s57063--a64a59538d1e775d0d0207174d19e917.png