capital_measures.py

import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.linear_model import LinearRegression

import matplotlib

matplotlib.use("pgf")

from matplotlib import pyplot as plt

matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
plt.rcParams["text.latex.preamble"] = "\n".join([
    r"\usepackage{amsmath}",
    r"\usepackage{amssymb}",
])
import seaborn as sns
import argparse
from os.path import join as opj
import pickle
from cmdstanpy import CmdStanModel
parser = argparse.ArgumentParser()
parser.add_argument("--input")
args = parser.parse_args()

# Read the topic list once; topics labelled "Junk" are dropped everywhere below.
topics_df = pd.read_csv(opj(args.input, "topics.csv"))
n_topics = len(topics_df)
junk = topics_df["label"].str.contains("Junk").values
topics = topics_df.loc[~junk, "label"].tolist()
df = pd.read_csv(opj(args.input, "aggregate.csv"))
resources = pd.read_parquet(opj(args.input, "pooled_resources.parquet"))
df = df.merge(resources, on="bai")

NR = np.stack(df[[f"start_{k+1}" for k in range(n_topics)]].values).astype(int)
NC = np.stack(df[[f"end_{k+1}" for k in range(n_topics)]].values).astype(int)
expertise = np.stack(df[[f"expertise_{k+1}" for k in range(n_topics)]].values)
NR = NR[:, ~junk]
NC = NC[:, ~junk]
S = np.stack(df["pooled_resources"])
S = S[:, ~junk]
expertise = expertise[:, ~junk]
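
# Shapes, for reference (A = number of authors, T = number of non-junk topics):
#   NR, NC: (A, T) integer topic counts; expertise, S: (A, T) topic weights.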
# Normalize each author's counts into distributions over topics.
x = NR / NR.sum(axis=1)[:, np.newaxis]
y = NC / NC.sum(axis=1)[:, np.newaxis]
S = S / S.sum(axis=1)[:, np.newaxis]
# R = np.array([
#     [((expertise[:, i] > expertise[:, i].mean()) & (expertise[:, j] > expertise[:, j].mean())).mean()
#      / ((expertise[:, i] > expertise[:, i].mean()) | (expertise[:, j] > expertise[:, j].mean())).mean()
#      for j in range(len(topics))]
#     for i in range(len(topics))
# ])
nu = np.load(opj(args.input, "nu_expertise_symmetric.npy"))
print(nu)
  55. df["research_diversity"] = np.exp(entropy(x, axis=1))
  56. df["social_diversity"] = np.exp(entropy(np.stack(df["pooled_resources"]),axis=1))
  57. df["intellectual_diversity"] = np.exp(entropy(expertise,axis=1))
  58. # df["social_magnitude"] = np.log(1+np.stack(df["pooled_resources"]).sum(axis=1))
  59. df["social_magnitude"] = np.stack(df["pooled_resources"]).sum(axis=1)
# Stirling-type diversity: 1 - sum_{ij} nu_{ij} w_i w_j, with nu the topic
# similarity matrix loaded above and w the author's topic weights.
expertise_matrix = np.einsum("ki,kj->kij", expertise, expertise)
social_expertise_matrix = np.einsum("ki,kj->kij", S, S)
df["intellectual_stirling"] = 1 - np.einsum("ij,kij->k", nu, expertise_matrix)
df["social_stirling"] = 1 - np.einsum("ij,kij->k", nu, social_expertise_matrix)
df.fillna({
    "social_stirling": 0,
    "social_diversity": 0,
    "intellectual_diversity": 0,
}, inplace=True)
  69. df["excess_social_diversity"] = df["social_diversity"]-LinearRegression().fit(df[["intellectual_diversity"]], df["social_diversity"]).predict(df[["intellectual_diversity"]])
  70. df["excess_social_stirling"] = df["social_stirling"]-LinearRegression().fit(df[["intellectual_stirling"]], df["social_stirling"]).predict(df[["intellectual_stirling"]])
brokerage = pd.read_csv(opj(args.input, "brokerage.csv"))
df = df.merge(brokerage, on="bai")
print(df)
# df["brokerage"] = np.log(1+df["brokerage"])
df.fillna(0, inplace=True)
measures = [
    "intellectual_diversity", "intellectual_stirling",
    "excess_social_diversity", "excess_social_stirling",
    "social_magnitude", "brokerage",
]
labels = [
    "\\textbf{Intellectual diversity}", "Intellectual diversity (Stirling)",
    "\\textbf{Excess social diversity}", "Excess social diversity (Stirling)",
    "\\textbf{Power}", "Brokerage",
]

# Pairwise Pearson correlations between the capital measures.
R = np.zeros((len(measures), len(measures)))
for i, a in enumerate(measures):
    for j, b in enumerate(measures):
        if i == j:
            R[i, j] = np.nan
        else:
            R[i, j] = np.corrcoef(df[a], df[b])[0, 1]
            print(R[i, j])
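
# Equivalent pandas one-liner (Pearson is pandas' default correlation method):
#   R = df[measures].corr().values
#   np.fill_diagonal(R, np.nan)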
fig, ax = plt.subplots(figsize=(4, 3.2))
R[np.tril_indices(R.shape[0])] = np.nan  # mask the diagonal and lower triangle
sns.heatmap(
    R[:-1, 1:],
    cmap="Reds",
    vmin=0,
    vmax=1,
    xticklabels=labels[1:],
    yticklabels=labels[:-1],
    ax=ax,
    annot=R[:-1, 1:],
    fmt=".2f",
    annot_kws={"fontsize": 6},
    square=True,
)
# ax.xaxis.set_tick_params(rotation=45)
ax.yaxis.set_tick_params(rotation=0)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha="right")
# The pgf backend can only write .pdf/.png/.pgf, so the figure is saved as PDF.
fig.savefig(
    opj(args.input, "capital_measures.pdf"),
    bbox_inches="tight",
)
# Summary statistics for the measure columns only (df also carries non-numeric
# identifier columns such as "bai", which "mean"/"std" cannot aggregate).
df[measures].agg(["mean", "std"]).to_csv(opj(args.input, "capital_measures.csv"))