Lucas Gautheron 3 months ago
parent
commit
5338592ffe
2 changed files with 2706 additions and 1 deletions
  1. 62 1
      threshold.py
  2. 2644 0
      votes_distrib.eps

+ 62 - 1
threshold.py

@@ -1 +1,62 @@
-/annex/objects/MD5E-s1459--58acfe400b29daf088dd4488aaa81c39.py
+import pandas as pd
+import numpy as np
+from matplotlib import pyplot as plt 
+import seaborn as sns
+
+
+df = pd.read_csv("data/maturity2/maturity2.csv")
+df = df[df["selection"]=="key_chi"]
+df["child_age"] /= 365
+
+df = df[~df["majority_label_age"].isin(["Junk",""])]
+
+df = df.groupby(["child_id", "child_age", "majority_label_age"]).agg(
+    n=("majority_label_age", "count")
+)
+
+df = df.reset_index()
+df = df.pivot(columns="majority_label_age", index=["child_id","child_age"], values="n")
+df["ratio"] = df["Baby"]/(df["Baby"]+df["Child"])
+
+df = df.reset_index()
+df = df[df["ratio"]>=0]
+
+from cmdstanpy import CmdStanModel
+
+data = {
+    "N": len(df),
+    "age": df["child_age"].astype(float).values,
+    "ratio": df["ratio"].astype(float).values
+}
+
+model = CmdStanModel(
+    stan_file=f"model.stan",
+)
+fit = model.sample(
+    data=data,
+    chains=4,
+    threads_per_chain=1,
+    iter_sampling=2000,
+    iter_warmup=500,
+    show_console=True,
+)
+
+vars = fit.stan_variables()
+samples = {}
+for (k, v) in vars.items():
+    samples[k] = v
+
+m = samples["lp"].max()
+q = np.exp(samples["lp"]-m).mean(axis=0)
+p = q/q.sum()
+    
+fig, ax1 = plt.subplots()
+ax1.scatter(df["child_age"], df["ratio"], label="Data")
+ax1.set_xlabel("Key child age (in years)")
+ax1.set_ylabel("Baby/(Baby+Child)")
+ax2 = ax1.twinx()
+ax2.plot(np.linspace(0,5,len(p)),q, color="black", label="Threshold probability density")
+
+fig.legend()
+
+fig.savefig("output/ratio_vs_age.png", bbox_inches="tight")

File diff suppressed because it is too large
+ 2644 - 0
votes_distrib.eps