# comparative_analysis.py (24 KB)
  1. from cProfile import label
  2. import numpy as np
  3. import pandas as pd
  4. from scipy.stats import entropy
  5. import ot
  6. from sklearn.linear_model import LinearRegression
  7. from matplotlib import pyplot as plt
  8. import matplotlib
  9. matplotlib.use("pgf")
  10. matplotlib.rcParams.update(
  11. {
  12. "pgf.texsystem": "xelatex",
  13. "font.family": "serif",
  14. "font.serif": "Times New Roman",
  15. "text.usetex": True,
  16. "pgf.rcfonts": False,
  17. }
  18. )
  19. plt.rcParams["text.latex.preamble"].join([
  20. r"\usepackage{amsmath}",
  21. r"\setmainfont{amssymb}",
  22. ])
  23. from textwrap import wrap
  24. import argparse
  25. from os.path import join as opj, exists
  26. import pickle
  27. from cmdstanpy import CmdStanModel
  28. parser = argparse.ArgumentParser()
  29. parser.add_argument("--input")
  30. parser.add_argument("--suffix", default=None)
  31. parser.add_argument("--metric", default="change", choices=["change", "disruption", "diversification", "diversification_stirling", "entered", "exited"])
  32. parser.add_argument("--diversity", default="entropy", choices=["entropy", "stirling"])
  33. parser.add_argument("--power", choices=["magnitude", "brokerage"], default="magnitude")
  34. parser.add_argument("--model", default="", choices=["", "bare"])
  35. parser.add_argument("--compact", action="store_true", default=False)
  36. args = parser.parse_args()
  37. def age():
  38. if not exists(opj(args.input, "age.csv")):
  39. articles = pd.read_parquet("../semantics/inspire-harvest/database/articles.parquet")[["article_id", "date_created", "pacs_codes", "curated", "accelerators"]]
  40. articles["article_id"] = articles.article_id.astype(int)
  41. articles = articles[articles["date_created"].str.len() >= 4]
  42. articles["year"] = articles["date_created"].str[:4].astype(int)
  43. articles["age"] = 2015-articles["date_created"].str[:4].astype(int)
  44. age = articles[["article_id", "age"]].copy()
  45. articles = articles[(articles["year"]>=2000)&(articles["year"]<2010)]
  46. _articles = pd.read_csv(opj(args.input, "articles.csv"))
  47. articles = _articles.merge(articles, how="inner")
  48. authors = pd.read_parquet("../semantics/inspire-harvest/database/articles_authors.parquet")
  49. authors["article_id"] = authors.article_id.astype(int)
  50. n_authors = authors.groupby("article_id").agg(n_authors=("bai", "count")).reset_index()
  51. articles = articles.merge(n_authors, how="left", left_on="article_id", right_on="article_id")
  52. # exclude large collaborations (experiments, software, etc.)
  53. articles = articles[articles.accelerators.map(len)==0]
  54. articles = articles[articles["n_authors"]<10]
  55. references = pd.read_parquet("../semantics/inspire-harvest/database/articles_references.parquet")
  56. references = references[references["cites"]!=references["cited"]]
  57. references = references.groupby("cited").agg(citations=("cites", "count")).reset_index()
  58. references["cited"] = references.cited.astype(int)
  59. references = references[references["cited"].isin(articles.article_id)]
  60. articles = articles.merge(references, how="outer", left_on="article_id", right_on="cited")
  61. articles.dropna(subset=["year"], inplace=True)
  62. articles.fillna({"citations": 0}, inplace=True)
  63. articles["citations_per_author"] = articles["citations"]/articles["n_authors"]
  64. del references
  65. age = age.merge(authors, how="inner", left_on="article_id", right_on="article_id")
  66. age = age.groupby("bai").agg(age=("age", "max")).reset_index()
  67. age.to_csv(opj(args.input, "age.csv"))
  68. else:
  69. age = pd.read_csv(opj(args.input, "age.csv"))
  70. return age
  71. def institution_stability():
  72. if exists(opj(args.input, "institutional_stability.csv")):
  73. return pd.read_csv(opj(args.input, "institutional_stability.csv"), index_col="bai")
  74. affiliations = pd.read_parquet("../semantics/inspire-harvest/database/affiliations.parquet")
  75. affiliations["article_id"] = affiliations.article_id.astype(int)
  76. articles = pd.read_parquet("../semantics/inspire-harvest/database/articles.parquet")[["article_id", "date_created"]]
  77. articles = articles[articles["date_created"].str.len() >= 4]
  78. articles["year"] = articles["date_created"].str[:4].astype(int) - 2000
  79. articles["article_id"] = articles.article_id.astype(int)
  80. articles = articles[articles["year"] <= 2019 - 2000]
  81. articles = articles[articles["year"] >= 0]
  82. affiliations["article_id"] = affiliations.article_id.astype(int)
  83. affiliations = affiliations.merge(articles, how="inner", left_on="article_id", right_on="article_id")
  84. affiliations = affiliations[affiliations["bai"].isin(df["bai"])]
  85. authors_last = affiliations.groupby("bai").agg(last_article=("year", "max"))
  86. hosts = affiliations.sort_values(["bai", "institution_id", "year"]).groupby(["bai", "institution_id"]).agg(
  87. first=("year", "min"),
  88. last=("year", "max")
  89. )
  90. hosts["duration"] = hosts["last"]-hosts["first"]
  91. stability = hosts.groupby("bai").agg(stability=("duration", "max"), last=("last", "max"), first=("first", "min"))
  92. stability = stability.merge(authors_last, left_index=True, right_index=True)
  93. stability["stable"] = stability["stability"]>=(stability["last"]-stability["first"]-1)
  94. stability.to_csv(opj(args.input, "institutional_stability.csv"))
  95. return stability
  96. suffix = f"_{args.suffix}" if args.suffix is not None else ""
  97. topics = pd.read_csv(opj(args.input, "topics.csv"))
  98. junk = topics["label"].str.contains("Junk")
  99. topics = topics[~junk]["label"].tolist()
  100. fig, ax = plt.subplots()
  101. n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))
  102. df = pd.read_csv(opj(args.input, "aggregate.csv"))
  103. resources = pd.read_parquet(opj(args.input, "pooled_resources.parquet"))
  104. df = df.merge(resources, left_on="bai", right_on="bai")
  105. NR = np.stack(df[[f"start_{k+1}" for k in range(n_topics)]].values).astype(int)
  106. NC = np.stack(df[[f"end_{k+1}" for k in range(n_topics)]].values).astype(int)
  107. expertise = np.stack(df[[f"expertise_{k+1}" for k in range(n_topics)]].values)
  108. S = np.stack(df["pooled_resources"])
  109. brokerage = pd.read_csv("output/authors_brokerage.csv")
  110. df = df.merge(brokerage, left_on="bai", right_on="bai")
  111. NR = NR[:,~junk]
  112. NC = NC[:,~junk]
  113. expertise = expertise[:,~junk]
  114. S = S[:,~junk]
  115. x = NR/NR.sum(axis=1)[:,np.newaxis]
  116. y = NC/NC.sum(axis=1)[:,np.newaxis]
  117. S_distrib = S/S.sum(axis=1)[:,np.newaxis]
  118. R = np.array([
  119. [((expertise[:,i]>expertise[:,i].mean())&(expertise[:,j]>expertise[:,j].mean())).mean()/(expertise[:,i]>expertise[:,i].mean()).mean() for j in range(len(topics))]
  120. for i in range(len(topics))
  121. ])
  122. change = np.abs(y-x).sum(axis=1)/2
  123. diversification = (np.exp(entropy(y, axis=1))-np.exp(entropy(x, axis=1)))/x.shape[1]
  124. x_matrix = np.einsum("ki,kj->kij", x, x)
  125. y_matrix = np.einsum("ki,kj->kij", y, y)
  126. x_stirling = 1-np.einsum("ij,kij->k", R, x_matrix)
  127. y_stirling = 1-np.einsum("ij,kij->k", R, y_matrix)
  128. disruption = np.zeros(len(change))
  129. for a in range(len(change)):
  130. disruption[a] = ot.emd2(x[a,:].copy(order='C'), y[a,:].copy(order='C'), 1-R, processes=4)
  131. alpha = 1
  132. exited = ((x>alpha*x.mean(axis=0))&(y<alpha*y.mean(axis=0))).sum(axis=1)
  133. entered = ((x<alpha*x.mean(axis=0))&(y>alpha*y.mean(axis=0))).sum(axis=1)
  134. fig, ax = plt.subplots(figsize=[6.4, 3.2])
  135. ax.hist(change, bins=np.linspace(0,1,50), histtype="step")
  136. ax.set_xlabel(f"Change score $c_a = \\frac{{1}}{{2}}\\sum_k |y_{{ak}}-x_{{ak}}|$")
  137. ax.set_ylabel("\\# of scientists")
  138. fig.savefig(opj(args.input, "change_score.eps"), bbox_inches="tight")
  139. print("change 50%% interval: ", np.quantile(change,q=0.25), np.quantile(change,q=1-0.25))
  140. fig, ax = plt.subplots(figsize=[6.4, 3.2])
  141. ax.hist(diversification, bins=np.linspace(-0.5,0.5,50), histtype="step")
  142. ax.set_xlabel(f"Diversification score $\\Delta_a$")
  143. ax.set_ylabel("\\# of scientists")
  144. fig.savefig(opj(args.input, "diversification_score.eps"), bbox_inches="tight")
  145. fig, ax = plt.subplots()
  146. ax.hist(disruption, bins=np.linspace(0,1,50), histtype="step")
  147. ax.set_xlabel(f"Disruption score $d_a$")
  148. ax.set_ylabel("\\# of scientists")
  149. fig.savefig(opj(args.input, "disruption_score.eps"), bbox_inches="tight")
  150. df["change_score"] = change
  151. df["disruption_score"] = disruption
  152. df["diversification_score"] = diversification
  153. df["diversification_stirling_score"] = y_stirling-x_stirling
  154. df["entered_score"] = (entered>0).astype(int)
  155. df["exited_score"] = (exited>0).astype(int)
  156. df["origin"] = np.argmax(x, axis=1)
  157. df["target"] = np.argmax(y, axis=1)
  158. df["origin_value"] = x.max(axis=1)
  159. df["target_value"] = y.max(axis=1)
  160. df["origin_final_value"] = np.array(y[a,df.loc[a, "origin"]] for a in range(x.shape[0]))
  161. df["target_initial_value"] = np.array(x[a,df.loc[a, "target"]] for a in range(x.shape[0]))
  162. df["origin_label"] = df["origin"].apply(lambda k: topics[k])
  163. df["target_label"] = df["target"].apply(lambda k: topics[k])
  164. df["origin_label"] = df.apply(lambda row: row["origin_label"] + (f" ({row['origin_value']:.2f})" if row["origin"]==row["target"] else f" ({row['origin_value']:.2f}$\\to${row['origin_final_value']:.2f})"), axis=1)
  165. df["target_label"] = df.apply(lambda row: row["target_label"] + (f" ({row['target_value']:.2f})" if row["origin"]==row["target"] else f" ({row['target_initial_value']:.2f}$\\to${row['target_value']:.2f})"), axis=1)
  166. df["social_entropy"] = np.exp(entropy(S,axis=1))
  167. df["intellectual_entropy"] = np.exp(entropy(expertise,axis=1))
  168. expertise_matrix = np.einsum("ki,kj->kij", expertise, expertise)
  169. social_expertise_matrix = np.einsum("ki,kj->kij", S_distrib, S_distrib)
  170. df["intellectual_stirling"] = 1-np.einsum("ij,kij->k", R, expertise_matrix)
  171. df["social_stirling"] = 1-np.einsum("ij,kij->k", R, social_expertise_matrix)
  172. stability = institution_stability()
  173. df = df.merge(stability, left_on="bai", right_index=True)
  174. df = df.merge(age(), left_on="bai", right_on="bai")
  175. df["primary_research_area"] = x.argmax(axis=1)
  176. df["social_diversity"] = df[f"social_{args.diversity}"].fillna(0)
  177. df["intellectual_diversity"] = df[f"intellectual_{args.diversity}"].fillna(0)
  178. df["res_social_diversity"] = df["social_diversity"]-LinearRegression().fit(df[["intellectual_diversity"]], df["social_diversity"]).predict(df[["intellectual_diversity"]])
  179. data = {
  180. "N": len(df),
  181. "K": x.shape[1],
  182. "m": df[f"{args.metric}_score"],
  183. # "soc_cap": np.log(1+S.sum(axis=1)),
  184. "soc_cap": S.sum(axis=1) if args.power == "magnitude" else df["brokerage"].values,
  185. "soc_div": df["social_diversity"],
  186. "int_div": df["intellectual_diversity"],
  187. "res_soc_div": df["res_social_diversity"],
  188. "x": x,
  189. "initial_div": np.exp(entropy(x, axis=1)),
  190. "primary_research_area": df["primary_research_area"],
  191. "stable": df["stable"].astype(float).values,
  192. "age": df["age"].values
  193. }
  194. fig, ax = plt.subplots(figsize=[6.4, 3.2])
  195. ax.hist(change[df["primary_research_area"] != 4], bins=np.linspace(0,1,25), histtype="step", label=f"Others ($\\mu={change[df['primary_research_area'] != 4].mean():.2f}$)", density=True)
  196. ax.hist(change[df["primary_research_area"] == 4], bins=np.linspace(0,1,25), histtype="step", label=f"Collider physics ($\\mu={change[df['primary_research_area'] == 4].mean():.2f}$)", density=True)
  197. ax.set_xlabel(f"Change score $c_a = \\frac{{1}}{{2}}\\sum_k |y_{{ak}}-x_{{ak}}|$")
  198. ax.set_ylabel("\\# of scientists")
  199. ax.legend(loc='upper right', bbox_to_anchor=(1, 1.2))
  200. fig.savefig(opj(args.input, "change_score_collider_physics.eps"), bbox_inches="tight")
  201. fig, ax = plt.subplots(figsize=[6.4, 3.2])
  202. ax.hist(disruption[df["primary_research_area"] != 4], bins=np.linspace(0,1,25), histtype="step", label=f"Others ($\\mu={disruption[df['primary_research_area'] != 4].mean():.2f}$)", density=True)
  203. ax.hist(disruption[df["primary_research_area"] == 4], bins=np.linspace(0,1,25), histtype="step", label=f"Collider physics ($\\mu={disruption[df['primary_research_area'] == 4].mean():.2f}$)", density=True)
  204. ax.set_xlabel(f"Disruption score $d_a$")
  205. ax.set_ylabel("\\# of scientists")
  206. ax.legend(loc='upper right', bbox_to_anchor=(1, 1.2))
  207. fig.savefig(opj(args.input, "disruption_score_collider_physics.eps"), bbox_inches="tight")
  208. if not exists(opj(args.input, f"samples_{args.metric}_{args.diversity}_{args.power}.npz")):
  209. model = CmdStanModel(
  210. stan_file=f"code/{args.metric}.stan" if args.model==""
  211. else f"code/{args.metric}_{args.model}_{args.power}.stan",
  212. )
  213. fit = model.sample(
  214. data=data,
  215. chains=4,
  216. iter_sampling=10000,
  217. iter_warmup=1000,
  218. show_console=True
  219. )
  220. vars = fit.stan_variables()
  221. samples = {}
  222. for (k, v) in vars.items():
  223. samples[k] = v
  224. np.savez_compressed(opj(args.input, f"samples_{args.metric}_{args.diversity}_{args.power}.npz"), **samples)
  225. samples = np.load(opj(args.input, f"samples_{args.metric}_{args.diversity}_{args.power}.npz"))
  226. labels = [
  227. "Intellectual capital (diversity)",
  228. "Social capital (diversity)",
  229. "Social capital (power)",
  230. "Stable affiliation",
  231. "Academic age",
  232. ]
  233. labels = [f"\\textbf{{{label}}}" for label in labels]
  234. labels += topics
  235. names = [
  236. "beta_int_div", "beta_soc_div", "beta_soc_cap", "beta_stable", "beta_age",
  237. ]
  238. if args.metric not in ["entered", "exited"]:
  239. mu = np.array([samples[name].mean() for name in names] + [(samples["beta_x"][:,i]*samples["tau"]).mean() for i in range(x.shape[1])])
  240. low = np.array([np.quantile(samples[name], q=0.05/2) for name in names] + [np.quantile(samples["beta_x"][:,i]*samples["tau"], q=0.05/2) for i in range(x.shape[1])])
  241. up = np.array([np.quantile(samples[name], q=1-0.05/2) for name in names] + [np.quantile(samples["beta_x"][:,i]*samples["tau"], q=1-0.05/2) for i in range(x.shape[1])])
  242. sig = up*low>0
  243. prob = np.array([(samples[name]*np.sign(samples[name].mean())<0).mean() for name in names] + [((samples["beta_x"][:,i]*np.sign(samples["beta_x"][:,i].mean()))<0).mean() for i in range(x.shape[1])])
  244. keep = sig | (np.arange(len(sig))<len(names))
  245. mu = mu[keep]
  246. low = low[keep]
  247. up = up[keep]
  248. prob = prob[keep]
  249. sign = ["<" if _mu>0 else ">" for i, _mu in enumerate(mu)]
  250. labels = [label for i, label in enumerate(labels) if keep[i]]
  251. n_vars = len(labels)
  252. # effect of capital and controls
  253. fig, ax = plt.subplots(figsize=[6.4, 0.4*(1+n_vars)])
  254. ax.scatter(mu, np.arange(len(labels))[::-1])
  255. ax.errorbar(mu, np.arange(len(labels))[::-1], xerr=(mu-low,up-mu), ls="none", capsize=4, elinewidth=1)
  256. ax.set_yticks(np.arange(len(labels))[::-1], labels)
  257. for i, p in enumerate(prob):
  258. if p>1e-4 and np.abs(p-0.5)>0.4:
  259. ax.text(
  260. -0.02 if mu[i]>0 else 0.02,
  261. np.arange(len(labels))[::-1][i],
  262. f"\\scriptsize $\\mu(\\beta)={mu[i]:.2g}, P(\\beta{sign[i]}0)={p:.2g}$",
  263. ha="right" if mu[i]>0 else "left",
  264. va="center"
  265. )
  266. elif p<0.05/2 or p>1-0.05/2:
  267. ax.text(
  268. -0.02 if mu[i]>0 else 0.02,
  269. np.arange(len(labels))[::-1][i],
  270. f"\\scriptsize $\\mu(\\beta)={mu[i]:.2g}$",
  271. ha="right" if mu[i]>0 else "left",
  272. va="center"
  273. )
  274. ax.set_xlabel(f"Effect on {args.metric}")
  275. ax.axvline(0, color="black")
  276. fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}.eps"), bbox_inches="tight")
  277. # average change score per research area
  278. ratio = args.metric != "diversification"
  279. labels = topics
  280. if ratio:
  281. mu = np.array([(samples["mu_x"][:,i]/samples["mu_pop"]).mean() for i in range(x.shape[1])])
  282. low = np.array([np.quantile(samples["mu_x"][:,i]/samples["mu_pop"], q=0.05/2) for i in range(x.shape[1])])
  283. up = np.array([np.quantile(samples["mu_x"][:,i]/samples["mu_pop"], q=1-0.05/2) for i in range(x.shape[1])])
  284. sig = (up-1)*(low-1)>0
  285. else:
  286. mu = np.array([(samples["mu_x"][:,i]-samples["mu_pop"]).mean() for i in range(x.shape[1])])
  287. low = np.array([np.quantile(samples["mu_x"][:,i]-samples["mu_pop"], q=0.05/2) for i in range(x.shape[1])])
  288. up = np.array([np.quantile(samples["mu_x"][:,i]-samples["mu_pop"], q=1-0.05/2) for i in range(x.shape[1])])
  289. sig = (up)*(low)>0
  290. keep = sig
  291. mu = mu[keep]
  292. low = low[keep]
  293. up = up[keep]
  294. labels = [label for i, label in enumerate(labels) if keep[i]]
  295. fig, ax = plt.subplots(figsize=[6.4, 3.2])
  296. ax.scatter(mu, np.arange(len(labels))[::-1])
  297. ax.errorbar(mu, np.arange(len(labels))[::-1], xerr=(mu-low,up-mu), ls="none", capsize=4, elinewidth=1)
  298. ax.set_yticks(np.arange(len(labels))[::-1], labels)
  299. fig, ax = plt.subplots(figsize=[6.4, 3.2])
  300. df["m_ratio"] = df[f"{args.metric}_score"]/df[f"{args.metric}_score"].mean()
  301. research_areas = df.groupby("primary_research_area").agg(
  302. mu=("m_ratio", "mean"),
  303. low=("m_ratio", lambda x: np.quantile(x, q=0.05/2)),
  304. up=("m_ratio", lambda x: np.quantile(x, q=1-0.05/2)),
  305. label=("origin_label", lambda x: x.iloc[0])
  306. ).reset_index()
  307. ax.scatter(research_areas["mu"], research_areas.index)
  308. ax.errorbar(research_areas["mu"], research_areas.index, xerr=(research_areas["mu"]-research_areas["low"],research_areas["up"]-research_areas["low"]), ls="none", capsize=4, elinewidth=1)
  309. ax.set_yticks(research_areas.index, research_areas["label"])
  310. ax.set_xlabel(f"Ratio to average {args.metric} score" if ratio else f"Difference with average {args.metric} score")
  311. ax.axvline(1 if ratio else 0, color="black")
  312. fig.savefig(opj(args.input, f"{args.metric}_research_area.eps"), bbox_inches="tight")
  313. else:
  314. labels = [
  315. "Intellectual capital (diversity)",
  316. "Social capital (diversity)",
  317. "Social capital (power)",
  318. "Stable affiliation",
  319. "Academic age",
  320. ]
  321. if not args.compact:
  322. labels = [f"\\textbf{{{label}}}" for label in labels]
  323. labels += topics
  324. samples = [
  325. np.load(opj(args.input, f"samples_entered_{args.diversity}_{args.power}.npz")),
  326. np.load(opj(args.input, f"samples_exited_{args.diversity}_{args.power}.npz"))
  327. ]
  328. mu = [None, None]
  329. low = [None, None]
  330. up = [None, None]
  331. sig = [None, None]
  332. prob = [None, None]
  333. for i in range(2):
  334. mu[i] = np.array([samples[i][name].mean() for name in names] + [(samples[i]["beta_x"][:,j]*samples[i]["tau"]).mean() for j in range(x.shape[1])])
  335. low[i] = np.array([np.quantile(samples[i][name], q=0.05/2) for name in names] + [np.quantile(samples[i]["beta_x"][:,j]*samples[i]["tau"], q=0.05/2) for j in range(x.shape[1])])
  336. up[i] = np.array([np.quantile(samples[i][name], q=1-0.05/2) for name in names] + [np.quantile(samples[i]["beta_x"][:,j]*samples[i]["tau"], q=1-0.05/2) for j in range(x.shape[1])])
  337. sig[i] = up[i]*low[i]>0
  338. prob[i] = np.array([(samples[i][name]*np.sign(samples[i][name].mean())<0).mean() for name in names] + [((samples[i]["beta_x"][:,j]*np.sign(samples[i]["beta_x"][:,j].mean()))<0).mean() for j in range(x.shape[1])])
  339. if args.compact:
  340. keep = (np.arange(len(sig[0]))<len(names))
  341. else:
  342. keep = sig[0] | sig[1] | (np.arange(len(sig[0]))<len(names))
  343. for i in range(2):
  344. mu[i] = mu[i][keep]
  345. low[i] = low[i][keep]
  346. up[i] = up[i][keep]
  347. prob[i] = prob[i][keep]
  348. sign = [["<" if _mu>0 else ">" for j, _mu in enumerate(mu[i])] for i in range(2)]
  349. labels = [label for i, label in enumerate(labels) if keep[i]]
  350. n_vars = len(labels)
  351. if args.compact:
  352. labels = [
  353. '\n'.join(map(lambda x: f"\\textbf{{{x}}}", wrap(label, width=15))) if i < 4
  354. else
  355. '\n'.join(wrap(label, width=15))
  356. for i, label in enumerate(labels)
  357. ]
  358. print(labels)
  359. # effect of capital and controls
  360. fig, ax = plt.subplots(figsize=[4.8 if args.compact else 6.4, 0.52*(1+n_vars)])
  361. colors = ['#377eb8', '#ff7f00']
  362. legend = ["entered new research area", "exited research area"]
  363. if args.compact:
  364. ax.set_xlim(-0.9, 1.25)
  365. for j in range(2):
  366. dy = -0.125 if j else +0.125
  367. ax.scatter(mu[j], np.arange(len(labels))[::-1]+dy, color=colors[j])
  368. ax.errorbar(mu[j], np.arange(len(labels))[::-1]+dy, xerr=(mu[j]-low[j],up[j]-mu[j]), ls="none", capsize=4, elinewidth=1, color=colors[j], label=legend[j])
  369. for i, p in enumerate(prob[j]):
  370. significant = p<0.05/2
  371. if p>1e-4 and np.abs(p-0.5)>0.4 and significant:
  372. ax.text(
  373. -0.02 if mu[j][i]>0 else 0.02,
  374. np.arange(len(labels))[::-1][i]+dy,
  375. f"\\scriptsize $\\mu(\\beta)={mu[j][i]:.2g},P(\\beta{sign[j][i]}0)={p:.2g}$",
  376. ha="right" if mu[j][i]>0 else "left",
  377. va="center"
  378. )
  379. elif p>1e-4 and np.abs(p-0.5)>0.4 and (not significant):
  380. ax.text(
  381. -0.02 if mu[j][i]>0 else 0.02,
  382. np.arange(len(labels))[::-1][i]+dy,
  383. f"\\scriptsize $P(\\beta{sign[j][i]}0)={p:.2g}$",
  384. ha="right" if mu[j][i]>0 else "left",
  385. va="center"
  386. )
  387. elif significant:
  388. ax.text(
  389. -0.02 if mu[j][i]>0 else 0.02,
  390. np.arange(len(labels))[::-1][i]+dy,
  391. f"\\scriptsize $\\mu(\\beta)={mu[j][i]:.2g}$",
  392. ha="right" if mu[j][i]>0 else "left",
  393. va="center"
  394. )
  395. ax.set_yticks(np.arange(len(labels))[::-1], labels)
  396. ax.set_xlabel(f"Effect size (log odds ratio)")
  397. ax.axvline(0, color="black")
  398. if args.compact:
  399. ax.legend(loc='upper right', bbox_to_anchor=(1, 1.3))
  400. else:
  401. ax.legend(loc='upper right', bbox_to_anchor=(1, 1.2))
  402. fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}{'_compact' if args.compact else ''}.eps"), bbox_inches="tight")
  403. table = df[["bai", "stable", f"{args.metric}_score", "intellectual_entropy", "social_entropy", "origin_label", "target_label"]].sort_values(f"{args.metric}_score", ascending=False)
  404. table.to_csv(opj(args.input, f"{args.metric}_scores.csv"))
  405. table["bai"] = table["bai"].str.replace(".1", "")
  406. table["bai"] = table["bai"].str.replace(r"^([A-Z])\.", r"\1.~")
  407. table["bai"] = table["bai"].str.replace(r"\.\~([A-Z])\.", r".~\1.~")
  408. table["bai"] = table["bai"].str.replace(r"([a-zA-Z]{2,})\.", r"\1 ")
  409. table["bai"] = table.apply(lambda r: r["bai"] if not r["stable"] else f"{r['bai']} ($\\ast$)", axis=1)
  410. table["target_label"] += "EOL"
  411. latex = table.head(20).to_latex(
  412. columns=["bai", f"{args.metric}_score", "intellectual_entropy", "social_entropy", "origin_label", "target_label"],
  413. header=["Physicist", "$c_a$", "$D(\\bm{I_a})$", "$D(\\bm{S_a})$", "Previous main area", "Current main area"],
  414. index=False,
  415. multirow=True,
  416. multicolumn=True,
  417. column_format='p{0.15\\textwidth}|c|c|c|b{0.25\\textwidth}|b{0.25\\textwidth}',
  418. escape=False,
  419. float_format=lambda x: f"{x:.2f}",
  420. caption="Physicists with the highest change scores $c_a$. $D(\\bm{I_a})$ and $D(\\bm{S_a})$ measure the diversity of intellectual and social capital. Numbers in parentheses indicate the share of attention dedicated to each research area during each time-period. Asterisks ($\\ast$) indicate physicists with a permanent position.",
  421. label=f"table:top_{args.metric}",
  422. position="H"
  423. )
  424. latex = latex.replace('EOL \\\\\n', '\\\\ \\hline\n')
  425. with open(opj(args.input, f"top_{args.metric}.tex"), "w+") as fp:
  426. fp.write(latex)
  427. latex = table.sort_values(f"{args.metric}_score", ascending=True).head(20).to_latex(
  428. columns=["bai", f"{args.metric}_score", "intellectual_entropy", "social_entropy", "origin_label", "target_label"],
  429. header=["Physicist", "$c_a$", "$D(\\bm{I_a})$", "$D(\\bm{S_a})$", "Previous main area", "Current main area"],
  430. index=False,
  431. multirow=True,
  432. multicolumn=True,
  433. column_format='p{0.15\\textwidth}|c|c|c|b{0.25\\textwidth}|b{0.25\\textwidth}',
  434. escape=False,
  435. float_format=lambda x: f"{x:.2f}",
  436. caption="Physicists with the lowest change scores $c_a$. $D(\\bm{I_a})$ and $D(\\bm{S_a})$ measure the diversity of intellectual and social capital. Numbers in parentheses indicate the share of attention dedicated to each research area. Asterisks ($\\ast$) indicate physicists with a permanent position.",
  437. label=f"table:low_{args.metric}",
  438. position="H"
  439. )
  440. latex = latex.replace('EOL \\\\\n', '\\\\ \\hline\n')
  441. with open(opj(args.input, f"low_{args.metric}.tex"), "w+") as fp:
  442. fp.write(latex)