# Fits an ecological-inference ("ei") Stan model to author topic portfolios
# and saves the resulting draws (or MLE point estimates) as a .npz archive.
# Standard library
import argparse
import pickle
from os.path import basename, join as opj

# Third-party
import numpy as np
import pandas as pd
import seaborn as sns
from cmdstanpy import CmdStanModel
from matplotlib import pyplot as plt

# Command-line interface: input locations, author/sample sizes, and which
# Stan model variant to fit.
parser = argparse.ArgumentParser()
parser.add_argument("--input", default="output/etm_20_pretrained")
parser.add_argument("--portfolios", default=None)
parser.add_argument("--resources", default=None)
parser.add_argument("--n", type=int, default=4000)
parser.add_argument("--min-pop", type=int, default=100)
parser.add_argument("--stack-rows", type=int, default=1)
parser.add_argument("--chains", type=int, default=1)
parser.add_argument("--threads-per-chain", type=int, default=4)
parser.add_argument("--samples", type=int, default=500)
parser.add_argument("--warmup", type=int, default=300)
parser.add_argument("--model", default="control_nu")
parser.add_argument("--mle", action="store_true", default=False)
args = parser.parse_args()
# Resolve input files: portfolios default to the aggregate CSV under --input,
# pooled resources to the matching parquet file.
portfolios = (
    opj(args.input, "aggregate.csv") if args.portfolios is None else args.portfolios
)
n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))

df = pd.read_csv(portfolios)

# Keep rows whose total start_* counts reach --min-pop.
df = df[df[[f"start_{k+1}" for k in range(n_topics)]].sum(axis=1) >= args.min_pop]

# Cap the number of authors at --n to keep sampling tractable.
if args.n < len(df):
    df = df.head(n=args.n)
n_authors = len(df)

print(opj(args.input, f"ei_samples_{args.model}_{basename(portfolios)}.npz"))

resources = pd.read_parquet(
    opj(args.input, "pooled_resources.parquet") if args.resources is None else args.resources
)
print(len(resources))

# Attach pooled resources per author; the (inner) merge must not drop anyone.
# An explicit check replaces the previous `assert`, which is stripped under -O.
df = df.merge(resources, on="bai")
if len(df) != n_authors:
    raise ValueError(
        f"merging resources changed the number of authors "
        f"({n_authors} -> {len(df)}); some authors may lack resource rows"
    )
print(n_authors)
start_cols = [f"start_{k+1}" for k in range(n_topics)]
end_cols = [f"end_{k+1}" for k in range(n_topics)]
expertise_cols = [f"expertise_{k+1}" for k in range(n_topics)]

# Stan input: per-author start/end topic counts, expertise, and covariates.
# NOTE: selecting columns already yields a 2-D array, so the previous
# np.stack(...) wrappers around .values were no-ops and are dropped.
data = {
    "NR": df[start_cols].to_numpy().astype(int),
    "NC": df[end_cols].to_numpy().astype(int),
    "expertise": df[expertise_cols].to_numpy(),
    "R": n_topics,
    "C": n_topics,
    "n_units": len(df),
    "threads": args.threads_per_chain,
}
# "pooled_resources" cells each hold a per-author vector; stack into a matrix.
data["cov"] = np.stack(df["pooled_resources"])

# Drop "junk" topics with zero start AND end counts across all authors, and
# shrink the row/column dimensions of the transition problem accordingly.
junk = np.sum(data["NR"] + data["NC"], axis=0) == 0
for col in ["NR", "NC", "cov", "expertise"]:
    data[col] = data[col][:, ~junk]
data["R"] -= junk.sum()
data["C"] -= junk.sum()
# NOTE(review): row-normalization of the covariates was tried and disabled;
# the dead self-assignment `data["cov"] = data["cov"]` is removed. Original:
# data["cov"] /= np.maximum(data["cov"].sum(axis=1)[:, np.newaxis], 1)
# Diagnostic plot 1: cross-correlation of end-period counts and covariates.
fig, ax = plt.subplots()
sns.heatmap(
    np.corrcoef(data["NC"].T, data["cov"].T),
    vmin=-0.5,
    vmax=0.5,
    cmap="RdBu",
    ax=ax,
)
plt.show()

# Diagnostic plot 2: covariate vs. share of start-period counts, one scatter
# series per topic.
fig, ax = plt.subplots()
row_totals = data["NR"].sum(axis=1)
for topic in range(data["R"]):
    ax.scatter(data["cov"][:, topic], data["NR"][:, topic] / row_totals)
plt.show()
expertise = data["expertise"]
# nu[i, j]: conditional probability that an author with above-average
# expertise in topic i is also above average in topic j, i.e.
# P(above_j & above_i) / P(above_i).
# The column means and threshold masks are hoisted out of the nested
# comprehension, which previously recomputed them O(R^2) times.
above = expertise > expertise.mean(axis=0)  # (n_units, R) boolean
p_above = above.mean(axis=0)                # per-topic P(above-average)
data["nu"] = np.array([
    [(above[:, i] & above[:, j]).mean() / p_above[i] for j in range(data["R"])]
    for i in range(data["R"])
])
# Compile the chosen Stan model variant with threading enabled.
model = CmdStanModel(
    stan_file=f"code/ei_cov_softmax_{args.model}.stan",
    cpp_options={"STAN_THREADS": "TRUE"},
)

if not args.mle:
    # Full posterior sampling.
    fit = model.sample(
        data=data,
        chains=args.chains,
        threads_per_chain=args.threads_per_chain,
        iter_sampling=args.samples,
        iter_warmup=args.warmup,
        show_console=True,
        max_treedepth=11,
        # step_size=0.1
    )
else:
    # Point estimate only (--mle).
    fit = model.optimize(
        data=data,
        show_console=True,
        iter=5000,
    )
# Collect the fitted Stan variables into a plain dict of arrays.
# (`dict(...)` replaces a manual copy loop; the old names `vars` and `format`
# shadowed builtins.)
samples = dict(fit.stan_variables())

# Output stem is ei_mle_* for --mle runs, ei_samples_* otherwise; tag the
# file with the portfolios basename when a non-default one was supplied.
kind = "mle" if args.mle else "samples"
if args.portfolios is None:
    out_path = opj(args.input, f"ei_{kind}_{args.model}.npz")
else:
    out_path = opj(args.input, f"ei_{kind}_{args.model}_{basename(portfolios)}.npz")
np.savez_compressed(out_path, **samples)