"""Summarise posterior effect sizes as LaTeX tables.

Reads ``topics.csv`` and the ``samples_<metric>_<diversity>_<power>.npz``
archives from the directory given by ``--input`` and writes
``summary_change_disruption.tex`` and ``summary_entered_exited.tex`` into that
same directory.
"""

import argparse
import pickle
from cProfile import label  # NOTE(review): unused; looks like an accidental auto-import
from os.path import join as opj, exists

import numpy as np
import pandas as pd
from scipy.stats import entropy
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot as plt
import matplotlib

matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
# NOTE(review): ``str.join`` returns a new string which is discarded here, so
# this statement does not modify the rcParams. Kept as-is because the intended
# preamble cannot be confirmed from this script (it renders no figure).
plt.rcParams["text.latex.preamble"].join([
    r"\usepackage{amsmath}",
    r"\setmainfont{amssymb}",
])

parser = argparse.ArgumentParser()
parser.add_argument("--input")
args = parser.parse_args()

# Topic labels, excluding every topic whose label contains "Junk".
topics = pd.read_csv(opj(args.input, "topics.csv"))
junk = topics["label"].str.contains("Junk")
topics = topics[~junk]["label"].tolist()
n_topics = len(topics)

# Row labels of the tables: the five named predictors (in bold) followed by
# one row per topic.
labels = [
    "Intellectual capital (diversity)",
    "Social capital (diversity)",
    "Social capital (power)",
    "Stable affiliation",
    "Academic age",
]
labels = [f"\\textbf{{{predictor}}}" for predictor in labels]
labels += topics
n_vars = len(labels)
label_position = {predictor: i for i, predictor in enumerate(labels)}

# Keys of the named coefficients in the sample archives, in the same order as
# the first five entries of ``labels``.
names = [
    "beta_int_div",
    "beta_soc_div",
    "beta_soc_cap",
    "beta_stable",
    "beta_age",
]

nice_names = {
    "change": "Change score ($c_a$)",
    "disruption": "Disruption score ($d_a$)",
    "entered": "Entered a new research area",
    "exited": "Exited a research area",
}

# Two-sided level used for the credible intervals (95%).
ALPHA = 0.05

# (diversity, power) combinations reported for every metric, in column order.
MODEL_VARIANTS = [
    ("entropy", "magnitude"),
    ("stirling", "magnitude"),
    ("entropy", "brokerage"),
]


def _format_effect(mu, low, up, significant):
    """Return the LaTeX cell for one effect: mean with +/- interval offsets.

    The mean is wrapped in ``bm`` when ``significant`` is true. Two significant
    digits are used unless the two-digit rendering of the mean (minus sign
    removed) is five characters or longer or uses exponent notation, in which
    case one significant digit is used for all three numbers.
    """
    plus = up - mu
    minus = mu - low
    sign_char = "+" if mu > 0 else ""
    digits = f"{mu:.2g}".replace("-", "")
    precision = 2 if len(digits) < 5 and "e" not in digits else 1
    center = f"{sign_char}{mu:.{precision}g}"
    if significant:
        center = f"\\bm{{{center}}}"
    return (
        f"${center}"
        f"\\substack{{+{plus:.{precision}g} \\\\ -{minus:.{precision}g}}}$"
    )


def get_effects(metric, diversity, power):
    """Summarise the posterior samples of one model as one row per predictor.

    Args:
        metric: key of ``nice_names``; also part of the archive file name.
        diversity: diversity measure in the file name; ``"entropy"`` selects
            the reference/brokerage labels, anything else the Stirling label.
        power: power measure in the file name; with entropy diversity,
            ``"magnitude"`` is labelled "Reference", anything else "Brokerage".

    Returns:
        A DataFrame with columns "Dep. variable", "Model", "mu", "low", "up",
        "sig", "Predictor" and "string", or an empty DataFrame (after printing
        a message) when the archive does not exist.
    """
    filename = opj(args.input, f"samples_{metric}_{diversity}_{power}.npz")
    if not exists(filename):
        print(f"samples not found: {filename}")
        return pd.DataFrame([])

    # NpzFile reads an array from disk on every ``[]`` access, so load each
    # array exactly once and close the archive as soon as we are done.
    with np.load(filename) as samples:
        draws = [samples[name] for name in names]
        beta_x = samples["beta_x"]
        tau = samples["tau"]

    # Topic effects are the per-topic coefficients scaled by ``tau``.
    # assumes beta_x is (n_draws, n_topics) and tau is per-draw — TODO confirm
    draws += [beta_x[:, i] * tau for i in range(n_topics)]

    mu = np.array([d.mean() for d in draws])
    low = np.array([np.quantile(d, q=ALPHA / 2) for d in draws])
    up = np.array([np.quantile(d, q=1 - ALPHA / 2) for d in draws])

    # An effect is flagged when both interval bounds share the same sign.
    sig = up * low > 0

    if diversity == "entropy":
        model = "Reference" if power == "magnitude" else "$P=\\text{Brokerage}$"
    else:
        model = "$D=\\text{Stirling}$"

    rows = [
        {
            "Dep. variable": nice_names[metric],
            "Model": model,
            "mu": mu[i],
            "low": low[i],
            "up": up[i],
            "sig": bool(sig[i]),
            "Predictor": labels[i],
            "string": _format_effect(mu[i], low[i], up[i], sig[i]),
        }
        for i in range(n_vars)
    ]

    print(metric, model)
    return pd.DataFrame(rows)


def write_summary_table(metrics, caption, table_label, filename):
    """Write one LaTeX table comparing all model variants for ``metrics``.

    Args:
        metrics: metric keys; each contributes one column group.
        caption: LaTeX caption of the table.
        table_label: LaTeX label of the table.
        filename: output file name, created inside ``args.input``.

    Nothing is written (a message is printed instead) when none of the sample
    archives for ``metrics`` could be found.
    """
    effects = pd.concat(
        [
            get_effects(metric, diversity, power)
            for metric in metrics
            for diversity, power in MODEL_VARIANTS
        ]
    )
    print(effects)

    if effects.empty:
        # Every archive was missing: there is nothing to pivot.
        print(f"no samples available for {metrics}; skipping {filename}")
        return

    table = effects.pivot(
        columns=["Dep. variable", "Model"], index="Predictor", values="string"
    )
    # Keep the rows in the order of ``labels`` rather than alphabetical order.
    table.sort_index(key=lambda x: x.map(label_position), inplace=True)

    latex = table.to_latex(
        escape=False,
        multicolumn_format="c",
        caption=caption,
        label=table_label,
        position="H",
    )

    # Insert a rule before the row starting with "Hadrons".
    latex = latex.replace("\\\nHadrons", "\\\n\\hline Hadrons")
    # Shrink the font and stretch the rows for the tabular only.
    latex = latex.replace(
        "\\begin{tabular}",
        "\\renewcommand{\\arraystretch}{2}\\fontsize{6}{7}\\selectfont\\begin{tabular}",
    )
    latex = latex.replace(
        "\\end{tabular}",
        "\\end{tabular}\\normalsize\\renewcommand{\\arraystretch}{1}",
    )

    with open(opj(args.input, filename), "w", encoding="utf-8") as fp:
        fp.write(latex)


write_summary_table(
    ["change", "disruption"],
    caption=(
        "Effect of each variable on (a) the change score and (b) the "
        "disruption score for each model. The reference model uses entropy "
        "as the diversity measure $D$ and the magnitude of intellectual "
        "capital as a measure of power $P$. Values indicate the mean "
        "posterior effect size and the 95\\% credible interval. Significant "
        "effects are shown in bold."
    ),
    table_label="table:summary_change_disruption",
    filename="summary_change_disruption.tex",
)

write_summary_table(
    ["entered", "exited"],
    caption=(
        "Effect of each variable on (a) the probability of having entered a "
        "new research area and (b) the probability of having exited a "
        "research area, for each model. The reference model uses entropy as "
        "the diversity measure $D$ and the magnitude of intellectual capital "
        "as a measure of power $P$. Values indicate the mean posterior "
        "effect size and the 95\\% credible interval. Significant effects "
        "are shown in bold."
    ),
    table_label="table:summary_entered_exited",
    filename="summary_entered_exited.tex",
)