#!/usr/bin/env python
import argparse
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
def order(x, l):
    """Return an integer rank for label *x* relative to the labels in *l*.

    The special labels are pinned to fixed positive ranks; every other label
    gets a non-positive rank derived from its position in *l* sorted in
    descending order.

    Args:
        x: The label to rank.
        l: Collection of sortable labels that contains *x* (unless *x* is one
            of the special labels handled below).

    Returns:
        ``2`` for ``"NO-LABEL"``, ``1`` for ``"NA"`` or ``"Junk"``, otherwise
        the negated index of *x* in ``sorted(l, reverse=True)`` (so the
        largest label maps to ``0``, the next one to ``-1``, and so on).

    Raises:
        ValueError: If *x* is not a special label and is not present in *l*.
    """
    if x == "NO-LABEL":
        return 2
    if x in ("NA", "Junk"):
        return 1
    # Position in the descending ordering, negated so that larger labels
    # receive larger (closer to zero) ranks.
    return -sorted(l, reverse=True).index(x)
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser()
# --key selects which pair of columns is read from the CSV files:
# "majority_label_<key>" and "labels_<key>". It is required because both
# column names are built from it.
parser.add_argument("--key", required=True)
parser.add_argument("--threads-per-chain", type=int, default=4)
# NOTE: --normalize is parsed but not used by any active code below; its only
# use is in the commented-out section at the end of the script.
parser.add_argument("--normalize", action="store_true", default=False)
args = parser.parse_args()

majority_col = f"majority_label_{args.key}"
individual_col = f"labels_{args.key}"

# ---------------------------------------------------------------------------
# Load the annotation tables and build the clip x label vote-count matrix
# ---------------------------------------------------------------------------
data_root = Path("speech-maturity-dataset/data")
corpora = ("babblecor", "maturity1", "maturity2")

# ignore_index=True gives the concatenated frame a single 0..n-1 index.
# Without it each CSV keeps its own 0-based index, so the clip_id derived
# from the index below would repeat across corpora and the groupby on
# clip_id would merge votes belonging to different clips.
df = pd.concat(
    [pd.read_csv(data_root / name / f"{name}.csv") for name in corpora],
    ignore_index=True,
)
df["clip_id"] = df.index.astype(int) + 1

# Keep only the rows that have a majority label for the selected key.
df.dropna(axis=0, subset=[majority_col], inplace=True)

# The per-annotator labels are stored as one comma-separated string per row;
# split them, remember how many there were, then expand to one row per label.
df[individual_col] = df[individual_col].str.split(",")
df["n_responses"] = df[individual_col].map(len)
df = df.explode(individual_col)

# Count how many times each label was given to each clip.
df = df.groupby(["clip_id", individual_col]).agg(
    n=("child_id", "count"),
    majority_label=(majority_col, "first"),
    n_responses=("n_responses", "first"),
).reset_index()

# One row per clip, one column per label; labels never given to a clip get 0.
df = df.pivot(index="clip_id", columns=individual_col, values="n").fillna(0)
print(df)

# Inputs handed to the Stan program: number of rows (clips), number of
# columns (distinct labels) and the integer count matrix itself.
data = {
    "N": len(df),
    "C": len(df.columns),
    "votes": df.values.astype(int),
}

# ---------------------------------------------------------------------------
# Fit the Stan model and store the posterior draws
# ---------------------------------------------------------------------------
from cmdstanpy import CmdStanModel

model = CmdStanModel(
    stan_file="votes_distrib.stan",
    cpp_options={"STAN_THREADS": "TRUE"},
)

fit = model.sample(
    data=data,
    chains=2,
    threads_per_chain=args.threads_per_chain,
    iter_sampling=1000,
    iter_warmup=250,
    show_console=True,
)

# stan_variables() maps each Stan variable name to an array of draws; save
# every one of them under its own name in a single .npz archive.
samples = dict(fit.stan_variables())
np.savez("votes_distrib.npz", **samples)

# Disabled code kept from the original script (the only consumer of
# --normalize):
# if args.normalize:
#     df["n"] /= df["n_responses"]
# # df = df.pivot(index=["clip_id"], columns=individual_col, values="n").reset_index()
# df = df.groupby([individual_col, "n"]).agg(
#     clip_id = ("n", "count")
# )
# df.to_csv(f"votes_distrib_{args.key}.csv")