# capital_measures.py
  1. import numpy as np
  2. import pandas as pd
  3. from scipy.stats import entropy
  4. from sklearn.linear_model import LinearRegression
  5. from matplotlib import pyplot as plt
  6. from matplotlib.gridspec import GridSpec
  7. import matplotlib
  8. matplotlib.use("pgf")
  9. matplotlib.rcParams.update(
  10. {
  11. "pgf.texsystem": "xelatex",
  12. "font.family": "serif",
  13. "font.serif": "Times New Roman",
  14. "text.usetex": True,
  15. "pgf.rcfonts": False,
  16. }
  17. )
  18. plt.rcParams["text.latex.preamble"].join([
  19. r"\usepackage{amsmath}",
  20. r"\setmainfont{amssymb}",
  21. ])
  22. import seaborn as sns
  23. import argparse
  24. from os.path import join as opj
  25. import pickle
  26. from cmdstanpy import CmdStanModel
  27. parser = argparse.ArgumentParser()
  28. parser.add_argument("--input")
  29. args = parser.parse_args()
  30. topics = pd.read_csv(opj(args.input, "topics.csv"))
  31. junk = topics["label"].str.contains("Junk")
  32. topics = topics[~junk]["label"].tolist()
  33. fig, ax = plt.subplots()
  34. n_topics = len(pd.read_csv(opj(args.input, "topics.csv")))
  35. df = pd.read_csv(opj(args.input, "aggregate.csv"))
  36. age = pd.read_csv(opj(args.input, "age.csv"))
  37. stability = pd.read_csv(opj(args.input, "institutional_stability.csv"))
  38. resources = pd.read_parquet(opj(args.input, "pooled_resources.parquet"))
  39. df = df.merge(resources, left_on="bai", right_on="bai")
  40. NR = np.stack(df[[f"start_{k+1}" for k in range(n_topics)]].values).astype(int)
  41. NC = np.stack(df[[f"end_{k+1}" for k in range(n_topics)]].values).astype(int)
  42. expertise = np.stack(df[[f"expertise_{k+1}" for k in range(n_topics)]].values)
  43. NR = NR[:,~junk]
  44. NC = NC[:,~junk]
  45. S = np.stack(df["pooled_resources"])
  46. S = S[:,~junk]
  47. expertise = expertise[:,~junk]
  48. x = NR/NR.sum(axis=1)[:,np.newaxis]
  49. y = NC/NC.sum(axis=1)[:,np.newaxis]
  50. S = S/S.sum(axis=1)[:,np.newaxis]
  51. # R = np.array([
  52. # [((expertise[:,i]>expertise[:,i].mean())&(expertise[:,j]>expertise[:,j].mean())).mean()/((expertise[:,i]>expertise[:,i].mean())|(expertise[:,j]>expertise[:,j].mean())).mean() for j in range(len(topics))]
  53. # for i in range(len(topics))
  54. # ])
  55. nu = np.load(opj(args.input, "nu_expertise_symmetric.npy"))
  56. print(nu)
  57. df["research_diversity"] = np.exp(entropy(x, axis=1))
  58. df["social_diversity"] = np.exp(entropy(np.stack(df["pooled_resources"]),axis=1))
  59. df["intellectual_diversity"] = np.exp(entropy(expertise,axis=1))
  60. # df["social_magnitude"] = np.log(1+np.stack(df["pooled_resources"]).sum(axis=1))
  61. df["social_magnitude"] = np.stack(df["pooled_resources"]).sum(axis=1)
  62. expertise_matrix = np.einsum("ki,kj->kij", expertise, expertise)
  63. social_expertise_matrix = np.einsum("ki,kj->kij", S, S)
  64. df["intellectual_stirling"] = 1-np.einsum("ij,kij->k", nu, expertise_matrix)
  65. df["social_stirling"] = 1-np.einsum("ij,kij->k", nu, social_expertise_matrix)
  66. df.fillna({
  67. "social_stirling": 0,
  68. "social_diversity": 0,
  69. "intellectual_diversity": 0
  70. }, inplace=True)
  71. df["excess_social_diversity"] = df["social_diversity"]-LinearRegression().fit(df[["intellectual_diversity"]], df["social_diversity"]).predict(df[["intellectual_diversity"]])
  72. df["excess_social_stirling"] = df["social_stirling"]-LinearRegression().fit(df[["intellectual_stirling"]], df["social_stirling"]).predict(df[["intellectual_stirling"]])
  73. brokerage = pd.read_csv(opj(args.input, "brokerage_2000_2009.csv"))
  74. df = df.merge(brokerage, left_on="bai", right_on="bai")
  75. print(df)
  76. # df["brokerage"] = np.log(1+df["brokerage"])
  77. df.fillna(0, inplace=True)
  78. measures = ["intellectual_diversity", "intellectual_stirling", "excess_social_diversity", "excess_social_stirling", "social_magnitude", "brokerage"]
  79. labels = ["\\textbf{Intellectual diversity}", "Intellectual diversity (Stirling)", "\\textbf{Excess social diversity}", "Excess social diversity (Stirling)", "\\textbf{Power} (magnitude)", "Power (brokerage)"]
  80. R = np.zeros((len(measures), len(measures)))
  81. for i, a in enumerate(measures):
  82. for j, b in enumerate(measures):
  83. if i == j:
  84. R[i,j] = 1
  85. else:
  86. R[i,j] = np.corrcoef(df[a], df[b])[0, 1]
  87. print(R[i,j])
  88. fig, ax = plt.subplots(figsize=(4,3.2))
  89. R[np.tril_indices(R.shape[0],k=-1)] = np.nan
  90. sns.heatmap(
  91. R,
  92. cmap="Reds",
  93. vmin=0,
  94. vmax=1,
  95. xticklabels=labels,
  96. yticklabels=labels,
  97. ax=ax,
  98. annot=R,
  99. fmt=".2f",
  100. annot_kws={"fontsize": 6},
  101. square=True
  102. )
  103. # ax.xaxis.set_tick_params(rotation=45)
  104. ax.yaxis.set_tick_params(rotation=0)
  105. ax.set_xticklabels(ax.get_xticklabels(), rotation = 45, ha="right")
  106. fig.savefig(
  107. opj(args.input, f"capital_measures.eps"),
  108. bbox_inches="tight",
  109. )
  110. df[["research_diversity", "intellectual_diversity", "social_diversity"]].agg(['mean', 'std']).to_csv(opj(args.input, "capital_measures.csv"))
  111. fig = plt.figure(figsize=[6.4, 3.2])
  112. df = df.merge(age, how="inner", left_on="bai", right_on="bai")
  113. df = df.merge(stability, how="inner", left_on="bai", right_on="bai")
  114. gs = GridSpec(1, 2, width_ratios=[2.25, 1])
  115. ax_hist = fig.add_subplot(gs[0])
  116. ax_violin = fig.add_subplot(gs[1])
  117. df["age"] += 4
  118. ax_hist.hist(df["age"], bins=np.arange(0, 60, 2), histtype="step")
  119. ax_hist.set_xlabel("Academic age in 2019")
  120. ax_hist.set_ylabel("Number of physicists")
  121. n_stable = len(df[df["stable"]==True])
  122. n_unstable = len(df[df["stable"]==False])
  123. x_stable = n_stable/len(df)
  124. x_unstable = n_unstable/len(df)
  125. print(x_stable, x_unstable)
  126. df["stable"] = df["stable"].map({
  127. False: f"Unstable $(n={n_unstable})$",
  128. True: f"Stable $(n={n_stable})$"
  129. })
  130. g = sns.violinplot(
  131. data=df,
  132. y="age",
  133. hue="stable",
  134. split=True,
  135. gap=.1,
  136. inner="quart",
  137. ax=ax_violin,
  138. alpha=0.5,
  139. common_norm=True,
  140. bw_adjust=0.75
  141. )
  142. # check axes and find which is have legend
  143. sns.move_legend(ax_violin, title="Affiliation", loc='lower right', bbox_to_anchor=(1.1, 1), frameon=False)
  144. ax_violin.set_ylabel("Academic age in 2019")
  145. plt.subplots_adjust(wspace=0.25, hspace=0)
  146. fig.savefig(opj(args.input, "age.pdf"), bbox_inches="tight")