123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140 |
- import numpy as np
- import pandas as pd
- from scipy.stats import entropy
- from sklearn.linear_model import LinearRegression
- from matplotlib import pyplot as plt
- import matplotlib
- from matplotlib import pyplot as plt
# Render figures through LaTeX (PGF backend, xelatex) with Times New Roman,
# so tick labels may use TeX markup such as \textbf{...}.
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
# Bug fix: the original called .join() on the existing preamble string —
# which uses the preamble as a *separator* and throws the result away — so
# neither package was ever added to the preamble. Assign explicitly.
# Also, amssymb is a LaTeX package, not a font: load it with \usepackage,
# not \setmainfont.
plt.rcParams["text.latex.preamble"] = "\n".join([
    r"\usepackage{amsmath}",
    r"\usepackage{amssymb}",
])
- import seaborn as sns
- import argparse
- from os.path import join as opj
- import pickle
- from cmdstanpy import CmdStanModel
# Command-line interface: --input is the directory holding all data files.
parser = argparse.ArgumentParser()
parser.add_argument("--input")
args = parser.parse_args()

# Topic labels, with topics whose label contains "Junk" filtered out.
# `junk` (boolean mask over the original topic rows) is reused further down
# to drop the same columns from the per-author matrices.
topics = pd.read_csv(opj(args.input, "topics.csv"))
junk = topics["label"].str.contains("Junk")
topics = topics.loc[~junk, "label"].tolist()
# Number of topics = number of rows in topics.csv. `junk` has one entry per
# topic row, so reuse its length instead of re-reading the CSV.
# (Also removed a stray `fig, ax = plt.subplots()` here: it was never used
# and was shadowed by the heatmap figure below, leaking an empty figure.)
n_topics = len(junk)

df = pd.read_csv(opj(args.input, "aggregate.csv"))
resources = pd.read_parquet(opj(args.input, "pooled_resources.parquet"))
df = df.merge(resources, on="bai")

# Per-author, per-topic counts at career start (NR) and end (NC), and the
# per-topic expertise weights.
NR = np.stack(df[[f"start_{k+1}" for k in range(n_topics)]].values).astype(int)
NC = np.stack(df[[f"end_{k+1}" for k in range(n_topics)]].values).astype(int)
expertise = np.stack(df[[f"expertise_{k+1}" for k in range(n_topics)]].values)

# Drop the junk-topic columns from every per-topic array.
NR = NR[:, ~junk]
NC = NC[:, ~junk]
S = np.stack(df["pooled_resources"])
S = S[:, ~junk]
expertise = expertise[:, ~junk]

# Row-normalise into per-author probability distributions over topics.
x = NR / NR.sum(axis=1)[:, np.newaxis]
y = NC / NC.sum(axis=1)[:, np.newaxis]  # NOTE(review): y is unused below
S = S / S.sum(axis=1)[:, np.newaxis]
# Topic-similarity matrix nu (precomputed elsewhere); assumed to match the
# junk-filtered topic count — TODO confirm against the producing script.
nu = np.load(opj(args.input, "nu_expertise_symmetric.npy"))

# Diversity = exp(Shannon entropy): the "effective number of topics" in
# each author's distribution.
df["research_diversity"] = np.exp(entropy(x, axis=1))
# NOTE(review): social_diversity intentionally uses the *unfiltered*
# pooled_resources (junk columns included), matching the original code.
df["social_diversity"] = np.exp(entropy(np.stack(df["pooled_resources"]), axis=1))
df["intellectual_diversity"] = np.exp(entropy(expertise, axis=1))

# Power proxy: total pooled resources per author.
df["social_magnitude"] = np.stack(df["pooled_resources"]).sum(axis=1)

# Rao-Stirling diversity: 1 - sum_ij nu[i,j] * p_i * p_j, where p is the
# author's topic distribution (expertise, resp. social resource shares).
expertise_matrix = np.einsum("ki,kj->kij", expertise, expertise)
social_expertise_matrix = np.einsum("ki,kj->kij", S, S)
df["intellectual_stirling"] = 1 - np.einsum("ij,kij->k", nu, expertise_matrix)
df["social_stirling"] = 1 - np.einsum("ij,kij->k", nu, social_expertise_matrix)
# Authors with no social ties / no expertise get NaN diversity; treat as 0.
df.fillna({
    "social_stirling": 0,
    "social_diversity": 0,
    "intellectual_diversity": 0,
}, inplace=True)

# "Excess" social measures: the residual of each social measure after
# regressing out its intellectual counterpart, i.e. social structure beyond
# what intellectual breadth alone predicts. The two regressions were
# duplicated inline in the original; fold them into one loop.
for social, intellectual in [
    ("social_diversity", "intellectual_diversity"),
    ("social_stirling", "intellectual_stirling"),
]:
    fit = LinearRegression().fit(df[[intellectual]], df[social])
    df[f"excess_{social}"] = df[social] - fit.predict(df[[intellectual]])

brokerage = pd.read_csv(opj(args.input, "brokerage.csv"))
df = df.merge(brokerage, on="bai")
# (debug print(df) removed)

# Any remaining NaNs (e.g. missing brokerage values) default to 0.
df.fillna(0, inplace=True)
# Pairwise Pearson correlations between the capital measures, drawn as an
# upper-triangular annotated heatmap and saved next to the input data.
measures = ["intellectual_diversity", "intellectual_stirling", "excess_social_diversity", "excess_social_stirling", "social_magnitude", "brokerage"]
labels = ["\\textbf{Intellectual diversity}", "Intellectual diversity (Stirling)", "\\textbf{Excess social diversity}", "Excess social diversity (Stirling)", "\\textbf{Power} (magnitude)", "Power (brokerage)"]

# DataFrame.corr computes the same Pearson coefficients as the original
# per-pair np.corrcoef loop (diagonal exactly 1), without O(k^2) Python
# calls or the per-cell debug prints.
R = df[measures].corr().to_numpy()
# Blank the strict lower triangle so each pair is shown once.
R[np.tril_indices_from(R, k=-1)] = np.nan

fig, ax = plt.subplots(figsize=(4, 3.2))
sns.heatmap(
    R,
    cmap="Reds",
    vmin=0,
    vmax=1,
    xticklabels=labels,
    yticklabels=labels,
    ax=ax,
    annot=R,
    fmt=".2f",
    annot_kws={"fontsize": 6},
    square=True,
)
ax.yaxis.set_tick_params(rotation=0)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
fig.savefig(
    opj(args.input, "capital_measures.eps"),  # f-prefix removed: no placeholders
    bbox_inches="tight",
)

# Persist mean/std of every column for the summary table.
df.agg(['mean', 'std']).to_csv(opj(args.input, "capital_measures.csv"))
|