
example analysis

Lucas Gautheron, 9 months ago
commit 30be6fbfde
3 changed files with 231 additions and 77 deletions
  1. code/models/analysis.py (+80 −70)
  2. code/models/analysis.stan (+147 −0)
  3. input/annotators.csv (+4 −7)

code/models/analysis.py (+80 −70)

@@ -6,6 +6,7 @@ import numpy as np
 from ChildProject.projects import ChildProject
 from ChildProject.annotations import AnnotationManager
 from ChildProject.metrics import segments_to_grid, segments_to_annotation
+from ChildProject.pipelines.metrics import AclewMetrics
 from ChildProject.utils import TimeInterval
 
 from cmdstanpy import CmdStanModel
@@ -42,14 +43,15 @@ parser = argparse.ArgumentParser()
 parser.add_argument("--corpora", default=["input/bergelson"], nargs="+")
 parser.add_argument("--duration", type=int, help="duration in hours", default=8)
 parser.add_argument("--run")
-parser.add_argument("--chains", type=int, default=4)
-parser.add_argument("--warmup", type=int, default=500)
-parser.add_argument("--samples", type=int, default=2500)
+parser.add_argument("--chains", type=int, default=1)
+parser.add_argument("--warmup", type=int, default=250)
+parser.add_argument("--samples", type=int, default=1000)
 parser.add_argument("--threads-per-chain", type=int, default=4)
-parser.add_argument("--models", nargs='+')
+parser.add_argument("--models", default=["analysis"], nargs='+')
+parser.add_argument("--validation", type=float, default=0)
 args = parser.parse_args()
 
-
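+# Speaker classes: CHI (key child), OCH (other children), FEM (female adults), MAL (male adults).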
+speakers = ["CHI", "OCH", "FEM", "MAL"]
 
 def extrude(self, removed, mode: str = "intersection"):
     if isinstance(removed, Segment):
@@ -68,7 +70,6 @@ def extrude(self, removed, mode: str = "intersection"):
 def compute_counts(parameters):
     corpus = parameters["corpus"]
     annotator = parameters["annotator"]
-    speakers = ["CHI", "OCH", "FEM", "MAL"]
 
     project = ChildProject(parameters["path"])
     am = AnnotationManager(project)
@@ -177,10 +178,44 @@ def compute_counts(parameters):
         corpus=corpus,
     )
 
+def rates(parameters):
+    corpus = parameters["corpus"]
+    annotator = parameters["annotator"]
+
+    project = ChildProject(parameters["path"])
+    am = AnnotationManager(project)
+    am.read()
+
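+    # AclewMetrics is ChildProject's ACLEW metrics pipeline; it derives
+    # per-speaker vocalization metrics (e.g. voc_fem_ph, vocalizations per
+    # hour) from the given annotator's segments, aggregated by child_id.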
+    pipeline = AclewMetrics(
+        project,
+        vtc=annotator,
+        alice=None,
+        vcm=None,
+        from_time="10:00:00",
+        to_time="18:00:00",
+        by="child_id",
+    )
+    metrics = pipeline.extract()
+    metrics = pd.DataFrame(metrics).assign(corpus=corpus, annotator=annotator)
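+    # duration_<set> is reported in milliseconds; convert to hours and drop
+    # children with effectively no annotated audio.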
+    metrics["duration"] = metrics[f"duration_{annotator}"]/1000/3600
+    metrics = metrics[metrics["duration"] > 0.01]
+
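+    # Convert hourly rates back into integer counts: voc_<spk>_ph is
+    # vocalizations per hour, so rate * duration (in hours) recovers the
+    # expected number of vocalizations over the annotated window.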
+    for i, speaker in enumerate(speakers):
+        metrics[f"speech_rate_{i}"] = (metrics[f"voc_{speaker.lower()}_ph"] * metrics["duration"]).fillna(0).astype(int)
+
+    return metrics
 
 def run_model(data, run, model_name):
     model = CmdStanModel(
-        stan_file=f"analyses/aggregates_{model_name}.stan",
+        stan_file=f"code/models/{model_name}.stan",
         cpp_options={"STAN_THREADS": "TRUE"},
         compile="force",
     )
@@ -248,6 +283,17 @@ def compile_recordings(corpus):
 
         duration = args.duration * 3600 * 1000
 
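+        # Converted annotation files live in a DataLad dataset; build their
+        # paths and fetch any that are not yet available locally.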
+        _annotations["path"] = _annotations.apply(
+            lambda r: opj(
+                project.path, "annotations", r["set"], "converted", r["annotation_filename"]
+            ),
+            axis=1,
+        )
+
+        missing_annotations = _annotations[~_annotations["path"].map(exists)]
+        if len(missing_annotations):
+            datalad.api.get(list(missing_annotations["path"].unique()))
+
         segments = am.get_segments(_annotations)
         segments["segment_onset"] -= segments["segment_onset"].min()
         segments = segments[segments["segment_onset"] >= 0]
@@ -257,23 +303,17 @@ def compile_recordings(corpus):
             continue
 
         segments = segments[segments["speaker_type"].isin(["CHI", "OCH", "FEM", "MAL"])]
-        # segments["segment_onset"] += segments["session_offset"]
-
-        adu_onsets = segments[segments["speaker_type"].isin(["FEM", "MAL"])][
-            "segment_onset"
-        ].values
-        chi_onsets = segments[segments["speaker_type"] == "CHI"]["segment_onset"].values
-
-        recs.append(
-            {
-                "recording": recording_filename,
-                "adu": len(adu_onsets),
-                "chi": len(chi_onsets),
-                "age": age,
-                "children": f"{corpus}_{child_id}",
-                "corpus": project.children["experiment"].iloc[0]
-            }
-        )
+
+        rec = {
+            f"vtc_{i}": len(segments[segments["speaker_type"] == speaker])
+            for i, speaker in enumerate(speakers)
+        }
+        rec["recording"] = recording_filename
+        rec["children"] = f"{corpus}_{child_id}"
+        rec["corpus"] = basename(corpus)
+        rec["age"] = age
+
+        recs.append(rec)
 
     recs = pd.DataFrame(recs)
     return recs
@@ -285,7 +325,7 @@ if __name__ == "__main__":
     annotators = pd.read_csv("input/annotators.csv")
     annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))
 
-    with mp.Pool(processes=8) as pool:
+    with mp.Pool(processes=args.chains*args.threads_per_chain) as pool:
         data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))
 
     data = data.sample(frac=1)
@@ -298,27 +338,17 @@ if __name__ == "__main__":
 
     # speech rates at the child level
     annotators = annotators[~annotators['annotator'].str.startswith('eaf_2021')]
-    with mp.Pool(processes=8) as pool:
+    with mp.Pool(processes=args.chains*args.threads_per_chain) as pool:
         speech_rates = pd.concat(pool.map(rates, annotators.to_dict(orient="records")))
 
     speech_rates.reset_index(inplace=True)
     speech_rates = speech_rates.groupby(["corpus", "child_id"]).sample(1)
     speech_rate_matrix = np.transpose([speech_rates[f"speech_rate_{i}"].values for i in range(4)])
-    voc_duration_matrix = np.transpose([speech_rates[f"voc_duration_{i}"].values for i in range(4)])
 
     speech_rates.to_csv("rates.csv")
 
-
     print(vtc.shape)
 
-    fixed_rates = pd.read_csv("output/speech_dist.csv")
-
-    training_set = data.groupby("corpus").agg(
-        duration=("duration", "sum"), children=("child", lambda x: x.nunique())
-    )
-    training_set["duration"] /= 3600
-    training_set.to_csv("output/training_set.csv")
-
     data["corpus"] = data["corpus"].astype("category")
     corpora = data["corpus"].cat.codes.values
     corpora_codes = dict(enumerate(data["corpus"].cat.categories))
@@ -330,54 +360,34 @@ if __name__ == "__main__":
         "n_groups": data["child"].nunique(),
         "n_corpora": data["corpus"].nunique(),
         "n_validation": max(1, int(truth.shape[0] * args.validation)),
-        "n_sim": args.simulated_children,
         "group": 1 + data["child"].astype("category").cat.codes.values,
-        "corpus": 1 + corpora,
-        "selected_corpus": (
-            1 + corpora_codes[args.apply_bias_from]
-            if args.apply_bias_from in corpora_codes
-            else 0
-        ),
+        "conf_corpus": 1 + corpora,
         "truth": truth.astype(int),
         "vtc": vtc.astype(int),
+        "speech_rates": speech_rate_matrix.astype(int),
         "group_corpus": 1+speech_rates["corpus"].map(corpora_codes).astype(int).values,
-        "duration": speech_rates["duration"].values,
+        "durations": speech_rates["duration"].values,
         "n_rates": len(speech_rates)
     }
 
     n_recs = len(recs)
+
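+    # One corpus index per child (1-based, as Stan expects), taken from each
+    # child's first recording.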
+    children_corpus = recs.groupby("children").agg(corpus=("corpus", "first")).sort_index()
+    children_corpus = 1+children_corpus.corpus.map(corpora_codes).astype(int).values
+
     analysis_data = {
         "n_recs": n_recs,
         "n_children": len(recs["children"].unique()),
         "children": recs["children"],
-        "adu": recs["adu"],
-        "chi": recs["chi"],
+        "vocs": np.transpose([recs[f"vtc_{i}"].values for i in range(4)]),
         "age": recs["age"],
-        "corpus": recs["corpus"]
+        "corpus": children_corpus,
+        "duration": args.duration,
     }
 
-data = analysis_data.update(confusion_data)
-
-output = {}
-
-for model_name in args.models:
-    output[model_name] = run_model(data, args.run, model_name)
-
-for model in output:
-    print(model)
-    print("total_evidence:", output[model]["total_evidence"])
-    print("evidence:", output[model]["evidence"])
-    print("priors:", output[model]["priors"])
-
-    samples = output[model]["samples"]
-
-    if "beta_a" in samples:
-        plt.clf()
-        plt.hist(samples["beta_a"], bins=np.linspace(-0.5,0.5,100), histtype="step")
-        plt.savefig(f"output/{args.run}_{model}_beta_a.png", bbox_inches="tight")
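+    # dict.update() returns None, so the two dicts are merged explicitly.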
+    data = {**analysis_data, **confusion_data}
 
-    if "lambda" in samples:
-        plt.clf()
-        plt.hist(samples["lambda"], bins=np.linspace(-0.5,0.5,100), histtype="step")
-        plt.savefig(f"output/{args.run}_{model}_lambda.png", bbox_inches="tight")
+    output = {}
 
+    for model_name in args.models:
+        output[model_name] = run_model(data, args.run, model_name)

code/models/analysis.stan (+147 −0)

@@ -0,0 +1,147 @@
+// TODO
+// use speech rates to set priors on truth_vocs
+data {
+    int<lower=1> n_classes; // number of classes
+
+    // analysis data block
+    int<lower=1> n_recs;
+    int<lower=1> n_children;
+
+    array[n_recs] int<lower=1> children;
+    array[n_recs] real<lower=1> age;
+    array[n_recs, n_classes] int<lower=0> vocs;
+    array[n_children] int<lower=1> corpus;
+
+    real<lower=0> duration;
+
+    // speaker confusion data block
+    int<lower=1> n_clips;   // number of clips
+    int<lower=1> n_groups; // number of groups
+    int<lower=1> n_corpora;
+    array [n_clips] int group;
+    array [n_clips] int conf_corpus;
+    array [n_clips,n_classes,n_classes] int vtc;
+    array [n_clips,n_classes] int truth;
+
+    int<lower=1> n_validation;
+
+    // actual speech rates
+    int<lower=1> n_rates;
+    array [n_rates,n_classes] int<lower=0> speech_rates;
+    array [n_rates] int group_corpus;
+    array [n_rates] real<lower=0> durations;
+}
+
+parameters {
+    matrix<lower=0> [n_recs, n_classes] truth_vocs;
+    array [n_children] matrix<lower=0,upper=1>[n_classes,n_classes] actual_confusion_baseline;
+
+    // confusion parameters
+    matrix<lower=0,upper=1>[n_classes,n_classes] mus;
+    matrix<lower=1>[n_classes,n_classes] etas;
+    array [n_groups] matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion;
+
+    array [n_corpora] matrix[n_classes,n_classes] corpus_bias;
+    matrix<lower=0>[n_classes,n_classes] corpus_sigma;
+
+    // speech rates
+    matrix<lower=1>[n_classes,n_corpora] speech_rate_alpha;
+    matrix<lower=0>[n_classes,n_corpora] speech_rate_mu;
+    matrix<lower=0> [n_classes,n_rates] speech_rate;
+}
+
+transformed parameters {
+    array [n_children] matrix<lower=0,upper=1>[n_classes,n_classes] actual_confusion;
+
+    for (c in 1:n_children) {
+        actual_confusion[c] = inv_logit(logit(actual_confusion_baseline[c])+corpus_bias[corpus[c]]);
+    }
+}
+
+model {
+    // actual model
+    vector [n_classes] expect;
+    vector [n_classes] sd;
+
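+    // Observed counts vocs[k,i] aggregate binomial thinnings of the true
+    // counts: each true vocalization of class j is detected as class i with
+    // probability actual_confusion[children[k],j,i]; the sum over j is
+    // approximated by a normal with moment-matched mean and variance.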
+    for (k in 1:n_recs) {
+        expect = rep_vector(0, n_classes);
+        sd = rep_vector(0, n_classes);
+
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                expect[i] += truth_vocs[k,j]*actual_confusion[children[k],j,i];
+                sd[i] += truth_vocs[k,j]*actual_confusion[children[k],j,i]*(1-actual_confusion[children[k],j,i]);
+            }
+        }
+        vocs[k,:] ~ normal(expect, sqrt(sd));
+    }
+
+    for (c in 1:n_children) {
+        for (i in 1:n_classes) {
+            actual_confusion_baseline[c,i] ~ beta_proportion(mus[i,:], etas[i,:]);
+        }
+    }
+
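+    // Prior on the true counts: gamma with corpus-level shape, whose mean
+    // is 1000 * speech_rate_mu per hour scaled by the recording duration
+    // (the same 1/1000 scaling used in the speech-rate block below).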
+    for (k in 1:n_recs) {
+        truth_vocs[k,:] ~ gamma(
+            speech_rate_alpha[:,corpus[children[k]]],
+            (speech_rate_alpha[:,corpus[children[k]]]./speech_rate_mu[:,corpus[children[k]]])/1000/duration
+        );
+    }
+
+    // speaker confusion model
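+    // vtc[k,i,j]: segments of human-labelled class j that the VTC labelled
+    // as i; binomial given the human count, with a corpus-level bias on the
+    // logit scale.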
+    for (k in n_validation:n_clips) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                vtc[k,i,j] ~ binomial(
+                    truth[k,j], inv_logit(logit(group_confusion[group[k],j,i]) + corpus_bias[conf_corpus[k],j,i])
+                );
+            }
+        }
+    }
+
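+    // Priors on the population confusion matrix: diagonal entries (correct
+    // classifications) are pushed towards 1 and off-diagonal entries towards
+    // 0; etas control how tightly groups concentrate around these means.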
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            if (i==j) {
+                mus[i,j] ~ beta(2,1);
+            }
+            else {
+                mus[i,j] ~ beta(1,2);
+            }
+            etas[i,j] ~ pareto(1,1.5);
+        }
+    }
+
+    for (c in 1:n_groups) {
+        for (i in 1:n_classes) {
+            for (j in 1:n_classes) {
+                group_confusion[c,i,j] ~ beta_proportion(mus[i,j], etas[i,j]);
+            }
+        }
+    }
+
+    for (i in 1:n_classes) {
+        for (j in 1:n_classes) {
+            for (c in 1:n_corpora) {
+                corpus_bias[c,j,i] ~ normal(0, corpus_sigma[j,i]);
+            }
+            corpus_sigma[j,i] ~ normal(0, 1);
+        }
+    }
+
+    // speech rates
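+    // Gamma-Poisson model: each child's latent hourly rate is drawn from a
+    // corpus-level gamma (prior mean 1000 * speech_rate_mu) and generates
+    // the observed counts over that child's annotated duration.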
+    for (i in 1:n_classes) {
+        speech_rate_alpha[i,:] ~ normal(1, 1);
+        speech_rate_mu[i,:] ~ exponential(2);
+    }
+
+    for (g in 1:n_rates) {
+        for (i in 1:n_classes) {
+            speech_rate[i,g] ~ gamma(
+                speech_rate_alpha[i,group_corpus[g]],
+                (speech_rate_alpha[i,group_corpus[g]]/speech_rate_mu[i,group_corpus[g]])/1000
+            );
+            speech_rates[g,i] ~ poisson(speech_rate[i,g]*durations[g]);
+        }
+    }
+}

input/annotators.csv (+4 −7)

@@ -9,10 +9,7 @@ tsimane2017,eaf_2021/CD
 tsimane2017,eaf_2021/CM
 tsimane2017,textgrid/mm
 tsimane2017,eaf/nk
-namibia,textgrid/m1
-namibia,textgrid/mm
-namibia,textgrid/ak
-EL1000/bergelson,eaf/an1
-EL1000/warlaumont,eaf/an1
-EL1000/winnipeg,eaf/an1
-vandam,cha/an1
+bergelson,eaf/an1
+warlaumont,eaf/an1
+winnipeg,eaf/an1
+lucid,eaf/an1