Forráskód Böngészése

[DATALAD] Recorded changes

Lucas Gautheron 3 hónapja
szülő
commit
18c36893f1
2 módosított fájl, 99 hozzáadás és 0 törlés
  1. 79 0
      votes_distrib.py
  2. 20 0
      votes_distrib.stan

+ 79 - 0
votes_distrib.py

@@ -0,0 +1,79 @@
+import pandas as pd
+import numpy as np
+from matplotlib import pyplot as plt 
+import seaborn as sns
+
+import argparse
+
+def order(x,l):
+    if x == "NO-LABEL":
+        return 2
+    if x in ["NA","Junk"]:
+        return 1
+    else:
+        return -sorted(l, reverse=True).index(x)
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--key")
+parser.add_argument("--normalize", action="store_true", default=False)
+args = parser.parse_args()
+
+majority_col = f"majority_label_{args.key}"
+individual_col = f"labels_{args.key}"
+
+df = pd.concat([
+    pd.read_csv("speech-maturity-dataset/data/babblecor/babblecor.csv"),
+    pd.read_csv("speech-maturity-dataset/data/maturity1/maturity1.csv"),
+    pd.read_csv("speech-maturity-dataset/data/maturity2/maturity2.csv"),
+])
+df["clip_id"] = df.index.astype(int)+1
+
+df.dropna(axis=0, subset=[majority_col], inplace=True)
+
+df[individual_col] = df[individual_col].str.split(",")
+df["n_responses"] = df[individual_col].map(len)
+df = df.explode(individual_col)
+
+df = df.groupby(["clip_id",individual_col]).agg(
+    n=("child_id", "count"),
+    majority_label=(majority_col, "first"),
+    n_responses=("n_responses", "first")
+).reset_index()
+
+df = df.pivot(index="clip_id", columns=individual_col,values="n").fillna(0)
+
+print(df)
+
+data = {
+    "N": len(df),
+    "C": len(df.columns),
+    "votes": df.values.astype(int),
+}
+
+from cmdstanpy import CmdStanModel
+
+model = CmdStanModel(
+    stan_file=f"votes_distrib.stan",
+)
+fit = model.sample(
+    data=data,
+    chains=4,
+    threads_per_chain=1,
+    iter_sampling=1000,
+    iter_warmup=250,
+    show_console=True,
+)
+
+vars = fit.stan_variables()
+samples = {}
+for (k, v) in vars.items():
+    samples[k] = v
+
+# if args.normalize:
+#     df["n"] /= df["n_responses"]
+
+# # df = df.pivot(index=["clip_id"], columns=individual_col, values="n").reset_index()
+# df = df.groupby([individual_col, "n"]).agg(
+#     clip_id = ("n", "count")
+# )
+# df.to_csv(f"votes_distrib_{args.key}.csv")

+ 20 - 0
votes_distrib.stan

@@ -0,0 +1,20 @@
+// Dirichlet-multinomial model for the rows of an N x C table of vote counts.
+// All rows share a single vector of concentration parameters `alphas`.
+data {
+    int<lower=1> N;                 // number of rows of `votes`
+    int<lower=1> C;                 // number of columns of `votes`
+    array[N,C] int<lower=0> votes;  // non-negative integer counts
+}
+
+parameters {
+    vector<lower=0>[C] alphas;      // concentration parameters, one per column
+    //array[N] simplex[C] p;
+}
+
+model {
+    for (i in 1:N) {
+        // Stan's built-in distribution is named dirichlet_multinomial
+        // (available from Stan 2.34); `multinomial_dirichlet` is not a Stan
+        // function and makes the program fail to compile.
+        votes[i] ~ dirichlet_multinomial(alphas);
+        // Alternative kept for reference: explicit per-row probabilities p[i].
+        //votes[i] ~ multinomial(p[i]);
+        //p[i] ~ dirichlet(alphas);
+    }
+
+    // Prior on the concentration parameters.
+    alphas ~ exponential(1);
+}