1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162 |
- import pandas as pd
- import numpy as np
- from matplotlib import pyplot as plt
- import seaborn as sns
- df = pd.read_csv("data/maturity2/maturity2.csv")
- df = df[df["selection"]=="key_chi"]
- df["child_age"] /= 365
- df = df[~df["majority_label_age"].isin(["Junk",""])]
- df = df.groupby(["child_id", "child_age", "majority_label_age"]).agg(
- n=("majority_label_age", "count")
- )
- df = df.reset_index()
- df = df.pivot(columns="majority_label_age", index=["child_id","child_age"], values="n")
- df["ratio"] = df["Baby"]/(df["Baby"]+df["Child"])
- df = df.reset_index()
- df = df[df["ratio"]>=0]
- from cmdstanpy import CmdStanModel
- data = {
- "N": len(df),
- "age": df["child_age"].astype(float).values,
- "ratio": df["ratio"].astype(float).values
- }
- model = CmdStanModel(
- stan_file=f"model.stan",
- )
- fit = model.sample(
- data=data,
- chains=4,
- threads_per_chain=1,
- iter_sampling=2000,
- iter_warmup=500,
- show_console=True,
- )
- vars = fit.stan_variables()
- samples = {}
- for (k, v) in vars.items():
- samples[k] = v
- m = samples["lp"].max()
- q = np.exp(samples["lp"]-m).mean(axis=0)
- p = q/q.sum()
-
- fig, ax1 = plt.subplots()
- ax1.scatter(df["child_age"], df["ratio"], label="Data")
- ax1.set_xlabel("Key child age (in years)")
- ax1.set_ylabel("Baby/(Baby+Child)")
- ax2 = ax1.twinx()
- ax2.plot(np.linspace(0,5,len(p)),q, color="black", label="Threshold probability density")
- fig.legend()
- fig.savefig("output/ratio_vs_age.png", bbox_inches="tight")
|