|
@@ -0,0 +1,79 @@
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+from matplotlib import pyplot as plt
|
|
|
+import seaborn as sns
|
|
|
+
|
|
|
+import argparse
|
|
|
+
|
|
|
def order(x, l):
    """Return a sort rank for label ``x`` relative to the labels in ``l``.

    Special labels are pinned to fixed positive ranks: ``"NO-LABEL"`` maps
    to 2 and ``"NA"`` / ``"Junk"`` map to 1. Every other label maps to minus
    its position in ``l`` sorted in descending order, so the largest label
    gets 0 and smaller labels get increasingly negative ranks.

    Args:
        x: The label to rank.
        l: Iterable of the known labels; must contain ``x`` unless ``x`` is
            one of the special labels.

    Returns:
        An integer rank: 2, 1, or a non-positive value.

    Raises:
        ValueError: If ``x`` is not a special label and is not in ``l``.
    """
    if x == "NO-LABEL":
        return 2
    if x in ("NA", "Junk"):
        return 1
    # Position in the descending ordering, negated so that ordinary labels
    # always rank below the special ones (which are strictly positive).
    return -sorted(l, reverse=True).index(x)
|
|
|
+
|
|
|
# Command-line interface. --key selects which pair of label columns is read
# from the CSV files; it is required because the column names below are built
# from it and would otherwise silently become "..._None".
parser = argparse.ArgumentParser()
parser.add_argument(
    "--key",
    required=True,
    help="suffix of the label columns to use "
    "(reads majority_label_<key> and labels_<key>)",
)
parser.add_argument(
    "--normalize",
    action="store_true",
    default=False,
    help="only referenced by the disabled aggregation code at the end of the script",
)
args = parser.parse_args()

# Column holding the per-clip majority label, and column holding the
# comma-separated individual labels for the same clip.
majority_col = f"majority_label_{args.key}"
individual_col = f"labels_{args.key}"
|
|
|
+
|
|
|
# Stack the three annotation tables into one frame. ignore_index=True gives
# the combined frame a fresh 0..N-1 index; without it every CSV keeps its own
# 0-based index, so clips from different files would share a clip_id and be
# merged by the groupby below.
df = pd.concat(
    [
        pd.read_csv("speech-maturity-dataset/data/babblecor/babblecor.csv"),
        pd.read_csv("speech-maturity-dataset/data/maturity1/maturity1.csv"),
        pd.read_csv("speech-maturity-dataset/data/maturity2/maturity2.csv"),
    ],
    ignore_index=True,
)
# 1-based identifier, unique per row of the combined table.
df["clip_id"] = df.index.astype(int) + 1

# Keep only clips that have a majority label for the selected key.
df.dropna(axis=0, subset=[majority_col], inplace=True)

# Turn the comma-separated label string into one row per individual label,
# remembering how many labels each clip received.
df[individual_col] = df[individual_col].str.split(",")
df["n_responses"] = df[individual_col].map(len)
df = df.explode(individual_col)

# Count votes per (clip, label). "count" only counts non-null child_id values.
# NOTE(review): assumes child_id is populated on every row -- confirm.
df = df.groupby(["clip_id", individual_col]).agg(
    n=("child_id", "count"),
    majority_label=(majority_col, "first"),
    n_responses=("n_responses", "first"),
).reset_index()

# Wide table: one row per clip, one column per label, cells are vote counts
# (0 where a label received no vote for that clip).
df = df.pivot(index="clip_id", columns=individual_col, values="n").fillna(0)

print(df)
|
|
|
+
|
|
|
from cmdstanpy import CmdStanModel

# Input passed to the Stan program: N rows (clips) and C columns (labels) of
# the vote table, plus the table itself as an integer matrix.
data = {
    "N": len(df),
    "C": len(df.columns),
    "votes": df.values.astype(int),
}

model = CmdStanModel(stan_file="votes_distrib.stan")
fit = model.sample(
    data=data,
    chains=4,
    threads_per_chain=1,
    iter_sampling=1000,
    iter_warmup=250,
    show_console=True,
)

# Draws for every Stan variable, keyed by variable name. Copied into a plain
# dict without binding the name ``vars``, which would shadow the builtin.
samples = dict(fit.stan_variables())
|
|
|
+
|
|
|
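# NOTE: disabled legacy aggregation, kept for reference. It expects the
# long-format frame produced by the groupby/agg step (columns "n" and
# "n_responses"), i.e. the frame before the pivot, and it is the only place
# that reads args.normalize.
# if args.normalize:
#     df["n"] /= df["n_responses"]

# # df = df.pivot(index=["clip_id"], columns=individual_col, values="n").reset_index()
# df = df.groupby([individual_col, "n"]).agg(
#     clip_id = ("n", "count")
# )
# df.to_csv(f"votes_distrib_{args.key}.csv")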
|