change_effects_summary.py 6.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175
  1. from cProfile import label
  2. import numpy as np
  3. import pandas as pd
  4. from scipy.stats import entropy
  5. from sklearn.linear_model import LinearRegression
  6. from matplotlib import pyplot as plt
  7. import matplotlib
  8. matplotlib.use("pgf")
  9. matplotlib.rcParams.update(
  10. {
  11. "pgf.texsystem": "xelatex",
  12. "font.family": "serif",
  13. "font.serif": "Times New Roman",
  14. "text.usetex": True,
  15. "pgf.rcfonts": False,
  16. }
  17. )
  18. plt.rcParams["text.latex.preamble"].join([
  19. r"\usepackage{amsmath}",
  20. r"\setmainfont{amssymb}",
  21. ])
  22. import argparse
  23. from os.path import join as opj, exists
  24. import pickle
  25. parser = argparse.ArgumentParser()
  26. parser.add_argument("--input")
  27. args = parser.parse_args()
  28. topics = pd.read_csv(opj(args.input, "topics.csv"))
  29. junk = topics["label"].str.contains("Junk")
  30. topics = topics[~junk]["label"].tolist()
  31. n_topics = len(topics)
  32. labels = [
  33. "Intellectual capital (diversity)",
  34. "Social capital (diversity)",
  35. "Social capital (power)",
  36. "Stable affiliation",
  37. "Academic age",
  38. "Productivity (co-authored)",
  39. "Productivity (solo-authored)",
  40. ]
  41. labels = [f"\\textbf{{{label}}}" for label in labels]
  42. labels += topics
  43. n_vars = len(labels)
  44. label_position = {label: i for i, label in enumerate(labels)}
  45. names = [
  46. "beta_int_div", "beta_soc_div", "beta_soc_cap", "beta_stable", "beta_age", "beta_productivity", "beta_productivity_solo"
  47. ]
  48. nice_names = {
  49. "change": "Change score ($c_a$)",
  50. "disruption": "Disruption score ($d_a$)",
  51. "entered": "Entered a new research area",
  52. "exited": "Exited a research area"
  53. }
  54. def get_effects(metric, diversity, power):
  55. filename = opj(args.input, f"samples_{metric}_{diversity}_{power}.npz")
  56. if not exists(filename):
  57. print(f"samples not found: {filename}")
  58. return pd.DataFrame([])
  59. samples = np.load(filename)
  60. mu = np.array([samples[name].mean() for name in names] + [(samples["beta_x"][:,i]*samples["tau"]).mean() for i in range(n_topics)])
  61. low = np.array([np.quantile(samples[name], q=0.05/2) for name in names] + [np.quantile(samples["beta_x"][:,i]*samples["tau"], q=0.05/2) for i in range(n_topics)])
  62. up = np.array([np.quantile(samples[name], q=1-0.05/2) for name in names] + [np.quantile(samples["beta_x"][:,i]*samples["tau"], q=1-0.05/2) for i in range(n_topics)])
  63. sig = up*low>0
  64. sign = mu>0
  65. prob = np.array([(samples[name]*np.sign(samples[name].mean())<0).mean() for name in names] + [((samples["beta_x"][:,i]*np.sign(samples["beta_x"][:,i].mean()))<0).mean() for i in range(n_topics)])
  66. vars = []
  67. model = None
  68. if diversity == "entropy":
  69. model = "Reference" if power=="magnitude" else "$P=\\text{Brokerage}$"
  70. else:
  71. model = "$D=\\text{Stirling}$"
  72. for i in range(n_vars):
  73. plus = up[i]-mu[i]
  74. minus = mu[i]-low[i]
  75. sign_char = "+" if sign[i] else ""
  76. s = (f"{mu[i]:.2g}").replace("-", "")
  77. if len(s)<5 and "e" not in s:
  78. if sig[i]:
  79. string = f"$\\bm{{{sign_char}{mu[i]:.2g}}}\\substack{{+{plus:.2g} \\\\ -{minus:.2g}}}$"
  80. else:
  81. string = f"${sign_char}{mu[i]:.2g}\\substack{{+{plus:.2g} \\\\ -{minus:.2g}}}$"
  82. else:
  83. if sig[i]:
  84. string = f"$\\bm{{{sign_char}{mu[i]:.1g}}}\\substack{{+{plus:.1g} \\\\ -{minus:.1g}}}$"
  85. else:
  86. string = f"${sign_char}{mu[i]:.1g}\\substack{{+{plus:.1g} \\\\ -{minus:.1g}}}$"
  87. vars.append({
  88. "Dep. variable": nice_names[metric],
  89. "Model": model,
  90. "mu": mu[i],
  91. "low": low[i],
  92. "up": up[i],
  93. "sig": sig[i]>0,
  94. "Predictor": labels[i],
  95. "string": string,
  96. })
  97. print(metric, model)
  98. return pd.DataFrame(vars)
  99. vars = []
  100. metrics = ["change", "disruption"]
  101. for metric in metrics:
  102. vars.append(get_effects(metric, "entropy", "magnitude"))
  103. vars.append(get_effects(metric, "stirling", "magnitude"))
  104. vars.append(get_effects(metric, "entropy", "brokerage"))
  105. vars = pd.concat(vars)
  106. print(vars)
  107. vars = vars.pivot(columns=["Dep. variable", "Model"], index="Predictor", values="string")
  108. vars.sort_index(key=lambda x: x.map(label_position), inplace=True)
  109. latex = vars.to_latex(
  110. escape=False,
  111. multicolumn_format="c",
  112. caption="Effect of each variable on (a) the change score and (b) the disruption score for each model. The reference model uses entropy as the diversity measure $D$ and the magnitude of intellectual capital as a measure of power $P$. Values indicate the mean posterior effect size and the 95\\% credible interval. Significant effects are shown in bold.",
  113. label="table:summary_change_disruption",
  114. position="H"
  115. )
  116. latex = latex.replace("\\\nHadrons", "\\\n\\hline Hadrons")
  117. latex = latex.replace("\\begin{tabular}", "\\renewcommand{\\arraystretch}{2}\\fontsize{6}{7}\\selectfont\\begin{tabular}")
  118. latex = latex.replace("\\end{tabular}", "\\end{tabular}\\normalsize\\renewcommand{\\arraystretch}{1}")
  119. with open(opj(args.input, f"summary_change_disruption.tex"), "w+") as fp:
  120. fp.write(latex)
  121. vars = []
  122. metrics = ["entered", "exited"]
  123. for metric in metrics:
  124. vars.append(get_effects(metric, "entropy", "magnitude"))
  125. vars.append(get_effects(metric, "stirling", "magnitude"))
  126. vars.append(get_effects(metric, "entropy", "brokerage"))
  127. vars = pd.concat(vars)
  128. print(vars)
  129. vars = vars.pivot(columns=["Dep. variable", "Model"], index="Predictor", values="string")
  130. vars.sort_index(key=lambda x: x.map(label_position), inplace=True)
  131. latex = vars.to_latex(
  132. escape=False,
  133. multicolumn_format="c",
  134. caption="Effect of each variable on (a) the probability of having entered a new research area and (b) the probability of having exited a research area, for each model. The reference model uses entropy as the diversity measure $D$ and the magnitude of intellectual capital as a measure of power $P$. Values indicate the mean posterior effect size and the 95\\% credible interval. Significant effects are shown in bold.",
  135. label="table:summary_entered_exited",
  136. position="H"
  137. )
  138. latex = latex.replace("\\\nHadrons", "\\\n\\hline Hadrons")
  139. latex = latex.replace("\\begin{tabular}", "\\renewcommand{\\arraystretch}{2}\\fontsize{6}{7}\\selectfont\\begin{tabular}")
  140. latex = latex.replace("\\end{tabular}", "\\end{tabular}\\normalsize\\renewcommand{\\arraystretch}{1}")
  141. with open(opj(args.input, f"summary_entered_exited.tex"), "w+") as fp:
  142. fp.write(latex)