from cProfile import label import numpy as np import pandas as pd from scipy.stats import entropy import ot from sklearn.linear_model import LinearRegression from matplotlib import pyplot as plt import matplotlib matplotlib.use("pgf") matplotlib.rcParams.update( { "pgf.texsystem": "xelatex", "font.family": "serif", "font.serif": "Times New Roman", "text.usetex": True, "pgf.rcfonts": False, } ) plt.rcParams["text.latex.preamble"].join([ r"\usepackage{amsmath}", r"\setmainfont{amssymb}", ]) from textwrap import wrap import argparse from os.path import join as opj, exists import pickle from cmdstanpy import CmdStanModel parser = argparse.ArgumentParser() parser.add_argument("--input") parser.add_argument("--dataset", default="inspire-harvest/database") parser.add_argument("--suffix", default=None) parser.add_argument("--portfolios", default=None) parser.add_argument("--output", default="") parser.add_argument("--metric", default="change", choices=["change", "disruption", "diversification", "diversification_stirling", "entered", "exited", "exited_total_power_effect"]) parser.add_argument("--diversity", default="entropy", choices=["entropy", "stirling"]) parser.add_argument("--power", choices=["magnitude", "brokerage"], default="magnitude") parser.add_argument("--model", default="", choices=["", "bare"]) parser.add_argument("--compact", action="store_true", default=False) parser.add_argument("--fla", action="store_true", default=False) parser.add_argument("--early-period", type=str, default="2000-2009") parser.add_argument("--end", type=int, default=2019) args = parser.parse_args() early_period = list(map(int, args.early_period.split("-"))) fla = "_fla" if args.fla else "" def age(): if not exists(opj(args.input, f"age_{args.early_period}.csv")): articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[["article_id", "date_created", "pacs_codes", "curated", "accelerators"]] articles["article_id"] = articles.article_id.astype(int) articles = 
articles[articles["date_created"].str.len() >= 4] articles["year"] = articles["date_created"].str[:4].astype(int) articles["age"] = early_period[1]+1-articles["date_created"].str[:4].astype(int) age = articles[["article_id", "age"]].copy() articles = articles[(articles["year"]>=early_period[0])&(articles["year"]= 4] articles["year"] = articles["date_created"].str[:4].astype(int) articles["article_id"] = articles.article_id.astype(int) articles = articles[articles["year"] <= args.end] articles = articles[articles["year"] >= int(early_period[0])] affiliations["article_id"] = affiliations.article_id.astype(int) affiliations = affiliations.merge(articles, how="inner", left_on="article_id", right_on="article_id") #affiliations = affiliations[affiliations["bai"].isin(df["bai"])] authors_last = affiliations.groupby("bai").agg(last_article=("year", "max")) hosts = affiliations.sort_values(["bai", "institution_id", "year"]).groupby(["bai", "institution_id"]).agg( first=("year", "min"), last=("year", "max") ) hosts["duration"] = hosts["last"]-hosts["first"] stability = hosts.groupby("bai").agg(stability=("duration", "max"), last=("last", "max"), first=("first", "min")) stability = stability.merge(authors_last, left_index=True, right_index=True) stability["stable"] = stability["stability"]>=(stability["last"]-stability["first"]-1) stability.to_csv(opj(args.input, f"institutional_stability_{args.early_period}.csv")) return stability def productivity(): if exists(opj(args.input, f"productivity_{args.early_period}.csv")): return pd.read_csv(opj(args.input, f"productivity_{args.early_period}.csv"), index_col="bai") articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[["article_id", "date_created", "categories"]] articles["article_id"] = articles.article_id.astype(int) articles = articles[articles["date_created"].str.len() >= 4] articles["year"] = articles["date_created"].str[:4].astype(int) articles = articles[articles["categories"].map(lambda x: "Phenomenology-HEP" 
in x or "Theory-HEP" in x)] articles = articles[(articles["year"]>=early_period[0])&(articles["year"]expertise[:,i].mean())&(expertise[:,j]>expertise[:,j].mean())).mean()/(expertise[:,i]>expertise[:,i].mean()).mean() for j in range(len(topics))] for i in range(len(topics)) ]) change = np.abs(y-x).sum(axis=1)/2 diversification = (np.exp(entropy(y, axis=1))-np.exp(entropy(x, axis=1)))/x.shape[1] x_matrix = np.einsum("ki,kj->kij", x, x) y_matrix = np.einsum("ki,kj->kij", y, y) x_stirling = 1-np.einsum("ij,kij->k", R, x_matrix) y_stirling = 1-np.einsum("ij,kij->k", R, y_matrix) if exists(opj(args.input, f"cost_knowledge_bounded.npz")): cost_matrix = np.load(opj(args.input, f"cost_knowledge_bounded.npz"))["C"].mean(axis=0) print(cost_matrix.sum()) cost_matrix = cost_matrix*(1-np.eye(x.shape[1])).sum()/cost_matrix.sum() else: cost_matrix = 1-np.eye(x.shape[1]) disruption = np.zeros(len(change)) for a in range(len(change)): # disruption[a] = ot.emd2(x[a,:].copy(order='C'), y[a,:].copy(order='C'), 1-R, processes=4) disruption[a] = ot.emd2(x[a,:].copy(order='C'), y[a,:].copy(order='C'), cost_matrix, processes=4) alpha = 1 exited = ((x>alpha*x.mean(axis=0))&(yalpha*y.mean(axis=0))).sum(axis=1) fig, ax = plt.subplots(figsize=[6.4, 3.2]) ax.hist(change, bins=np.linspace(0,1,50), histtype="step", color = '#377eb8', label="Change score $c_a$") ax.hist(disruption, bins=np.linspace(0,1,50), histtype="step", color = '#ff7f00', label="Cognitive distance $d_a$") ax.set_xlabel(f"Change score $c_a$ and cognitive distance $d_a$") ax.set_ylabel("\\# of scientists") ax.legend() fig.savefig(opj(args.input, f"change_disruption_score{fla}{args.output}.eps"), bbox_inches="tight") print("change 50%% interval: ", np.quantile(change,q=0.25), np.quantile(change,q=1-0.25)) fig, ax = plt.subplots(figsize=[6.4, 3.2]) ax.hist(diversification, bins=np.linspace(-0.5,0.5,50), histtype="step") ax.set_xlabel(f"Diversification score $\\Delta_a$") ax.set_ylabel("\\# of scientists") 
fig.savefig(opj(args.input, f"diversification_score{fla}{args.output}.eps"), bbox_inches="tight")

# Stand-alone histogram of the `disruption` scores.
fig, ax = plt.subplots()
ax.hist(disruption, bins=np.linspace(0, 1, 50), histtype="step")
ax.set_xlabel(f"Disruption score $d_a$")
ax.set_ylabel("\\# of scientists")
fig.savefig(opj(args.input, f"disruption_score{fla}{args.output}.eps"), bbox_inches="tight")

# Per-scientist outcome variables; one of them is selected later through
# f"{args.metric}_score".
df["change_score"] = change
df["disruption_score"] = disruption
df["diversification_score"] = diversification
df["diversification_stirling_score"] = y_stirling - x_stirling
# Binary indicators: 1 when at least one research area was entered / exited.
df["entered_score"] = (entered > 0).astype(int)
df["exited_score"] = (exited > 0).astype(int)
# NOTE(review): identical to exited_score; presumably only the Stan model
# selected by --metric differs -- confirm.
df["exited_total_power_effect_score"] = (exited > 0).astype(int)

# Dominant research area in the initial (x) and final (y) distributions.
df["origin"] = np.argmax(x, axis=1)
df["target"] = np.argmax(y, axis=1)
df["origin_value"] = x.max(axis=1)
df["target_value"] = y.max(axis=1)

# Share of the origin area in the final distribution and of the target area in
# the initial distribution. The rows of x / y are matched to df positionally
# (as for the argmax columns above), so index by position. The previous
# np.array(<generator>) built a 0-d object array wrapping the generator instead
# of one value per row, which then broke the ":.2f" formatting below.
row_positions = np.arange(x.shape[0])
df["origin_final_value"] = y[row_positions, df["origin"].to_numpy()]
df["target_initial_value"] = x[row_positions, df["target"].to_numpy()]

df["origin_label"] = df["origin"].apply(lambda k: topics[k])
df["target_label"] = df["target"].apply(lambda k: topics[k])


def _origin_label(row):
    """Return the origin-area label annotated with its share (before -> after when the main area changed)."""
    if row["origin"] == row["target"]:
        return row["origin_label"] + f" ({row['origin_value']:.2f})"
    return row["origin_label"] + f" ({row['origin_value']:.2f}$\\to${row['origin_final_value']:.2f})"


def _target_label(row):
    """Return the target-area label annotated with its share (before -> after when the main area changed)."""
    if row["origin"] == row["target"]:
        return row["target_label"] + f" ({row['target_value']:.2f})"
    return row["target_label"] + f" ({row['target_initial_value']:.2f}$\\to${row['target_value']:.2f})"


df["origin_label"] = df.apply(_origin_label, axis=1)
df["target_label"] = df.apply(_target_label, axis=1)

# Diversity of social (S) and intellectual (expertise) capital, measured as the
# exponential of the entropy of each row.
df["social_entropy"] = np.exp(entropy(S, axis=1))
df["intellectual_entropy"] = np.exp(entropy(expertise, axis=1))

# Alternative diversity measure: 1 - sum_ij R[i, j] * p[i] * p[j], computed from
# the per-scientist outer product of each distribution with itself.
expertise_matrix = np.einsum("ki,kj->kij", expertise, expertise)
social_expertise_matrix = np.einsum("ki,kj->kij", S_distrib, S_distrib)
df["intellectual_stirling"] = 1 - np.einsum("ij,kij->k", R, expertise_matrix)
df["social_stirling"] = 1 - np.einsum("ij,kij->k", R, social_expertise_matrix)

# normalize productivity per time active
during the early time period df["time_active"] = np.minimum(df["age"], early_period[1]+1-early_period[0]) df["productivity"] /= df["time_active"] df["productivity_solo"] /= df["time_active"] df["primary_research_area"] = x.argmax(axis=1) df["social_diversity"] = df[f"social_{args.diversity}"].fillna(0) df["intellectual_diversity"] = df[f"intellectual_{args.diversity}"].fillna(0) df["res_social_diversity"] = df["social_diversity"]-LinearRegression().fit(df[["intellectual_diversity"]], df["social_diversity"]).predict(df[["intellectual_diversity"]]) data = { "N": len(df), "K": x.shape[1], "m": df[f"{args.metric}_score"], # "soc_cap": np.log(1+S.sum(axis=1)) if args.power == "magnitude" else np.log(1+df["brokerage"].values), "soc_cap": S.sum(axis=1) if args.power == "magnitude" else df["brokerage"].values, "soc_div": df["social_diversity"], "int_div": df["intellectual_diversity"], "res_soc_div": df["res_social_diversity"], "productivity": df["productivity"], "productivity_solo": df["productivity_solo"], "x": x, "initial_div": np.exp(entropy(x, axis=1)), "primary_research_area": df["primary_research_area"], "stable": df["stable"].astype(float).values, "age": df["age"].values } fig, ax = plt.subplots(figsize=[6.4, 3.2]) ax.hist(change[df["primary_research_area"] != 4], bins=np.linspace(0,1,25), histtype="step", label=f"Others ($\\mu={change[df['primary_research_area'] != 4].mean():.2f}$)", density=True) ax.hist(change[df["primary_research_area"] == 4], bins=np.linspace(0,1,25), histtype="step", label=f"Collider physics ($\\mu={change[df['primary_research_area'] == 4].mean():.2f}$)", density=True) ax.set_xlabel(f"Change score $c_a = \\frac{{1}}{{2}}\\sum_k |y_{{ak}}-x_{{ak}}|$") ax.set_ylabel("\\# of scientists") ax.legend(loc='upper right', bbox_to_anchor=(1, 1.2)) fig.savefig(opj(args.input, f"change_score_collider_physics{fla}{args.output}.eps"), bbox_inches="tight") fig, ax = plt.subplots(figsize=[6.4, 3.2]) ax.hist(disruption[df["primary_research_area"] != 4], 
bins=np.linspace(0,1,25), histtype="step", label=f"Others ($\\mu={disruption[df['primary_research_area'] != 4].mean():.2f}$)", density=True) ax.hist(disruption[df["primary_research_area"] == 4], bins=np.linspace(0,1,25), histtype="step", label=f"Collider physics ($\\mu={disruption[df['primary_research_area'] == 4].mean():.2f}$)", density=True) ax.set_xlabel(f"Cognitive distance $d_a$") ax.set_ylabel("\\# of scientists") ax.legend(loc='upper right', bbox_to_anchor=(1, 1.2)) fig.savefig(opj(args.input, f"disruption_score_collider_physics{fla}{args.output}.eps"), bbox_inches="tight") if not exists(opj(args.input, f"samples_{args.metric}_{args.diversity}_{args.power}{fla}{args.output}.npz")): model = CmdStanModel( stan_file=f"code/{args.metric}.stan" if args.model=="" else f"code/{args.metric}_{args.model}_{args.power}.stan", ) fit = model.sample( data=data, chains=4, iter_sampling=10000, iter_warmup=1000, show_console=True ) vars = fit.stan_variables() samples = {} for (k, v) in vars.items(): samples[k] = v np.savez_compressed(opj(args.input, f"samples_{args.metric}_{args.diversity}_{args.power}{fla}{args.output}.npz"), **samples) samples = np.load(opj(args.input, f"samples_{args.metric}_{args.diversity}_{args.power}{fla}{args.output}.npz")) labels = [ "Intellectual capital (diversity)", "Social capital (diversity)", "Social capital (power)", "Stable affiliation", "Academic age", "Productivity (co-authored)", "Productivity (solo-authored)", ] labels = [f"\\textbf{{{label}}}" for label in labels] labels += topics names = [ "beta_int_div", "beta_soc_div", "beta_soc_cap", "beta_stable", "beta_age", "beta_productivity", "beta_productivity_solo" ] if args.metric not in ["entered", "exited"] and args.metric not in ["change", "disruption"]: mu = np.array([samples[name].mean() for name in names] + [(samples["beta_x"][:,i]*samples["tau"]).mean() for i in range(x.shape[1])]) low = np.array([np.quantile(samples[name], q=0.05/2) for name in names] + 
[np.quantile(samples["beta_x"][:,i]*samples["tau"], q=0.05/2) for i in range(x.shape[1])]) up = np.array([np.quantile(samples[name], q=1-0.05/2) for name in names] + [np.quantile(samples["beta_x"][:,i]*samples["tau"], q=1-0.05/2) for i in range(x.shape[1])]) sig = up*low>0 prob = np.array([(samples[name]*np.sign(samples[name].mean())<0).mean() for name in names] + [((samples["beta_x"][:,i]*np.sign(samples["beta_x"][:,i].mean()))<0).mean() for i in range(x.shape[1])]) keep = sig | (np.arange(len(sig))0 else ">" for i, _mu in enumerate(mu)] labels = [label for i, label in enumerate(labels) if keep[i]] n_vars = len(labels) # effect of capital and controls fig, ax = plt.subplots(figsize=[6.4, 0.4*(1+n_vars)]) ax.scatter(mu, np.arange(len(labels))[::-1]) ax.errorbar(mu, np.arange(len(labels))[::-1], xerr=(mu-low,up-mu), ls="none", capsize=4, elinewidth=1) ax.set_yticks(np.arange(len(labels))[::-1], labels) for i, p in enumerate(prob): if p>1e-4 and np.abs(p-0.5)>0.4: ax.text( -0.02 if mu[i]>0 else 0.02, np.arange(len(labels))[::-1][i], f"\\scriptsize $\\mu(\\beta)={mu[i]:.2g}, P(\\beta{sign[i]}0)={p:.2g}$", ha="right" if mu[i]>0 else "left", va="center" ) elif p<0.05/2 or p>1-0.05/2: ax.text( -0.02 if mu[i]>0 else 0.02, np.arange(len(labels))[::-1][i], f"\\scriptsize $\\mu(\\beta)={mu[i]:.2g}$", ha="right" if mu[i]>0 else "left", va="center" ) ax.set_xlabel(f"Effect on {args.metric}") ax.axvline(0, color="black") low, high = ax.get_xlim() bound = max(abs(low), abs(high)) ax.set_xlim(-bound, bound) fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}{fla}{args.output}.eps"), bbox_inches="tight") # average change score per research area ratio = args.metric != "diversification" labels = topics if ratio: mu = np.array([(samples["mu_x"][:,i]/samples["mu_pop"]).mean() for i in range(x.shape[1])]) low = np.array([np.quantile(samples["mu_x"][:,i]/samples["mu_pop"], q=0.05/2) for i in range(x.shape[1])]) up = 
np.array([np.quantile(samples["mu_x"][:,i]/samples["mu_pop"], q=1-0.05/2) for i in range(x.shape[1])]) sig = (up-1)*(low-1)>0 else: mu = np.array([(samples["mu_x"][:,i]-samples["mu_pop"]).mean() for i in range(x.shape[1])]) low = np.array([np.quantile(samples["mu_x"][:,i]-samples["mu_pop"], q=0.05/2) for i in range(x.shape[1])]) up = np.array([np.quantile(samples["mu_x"][:,i]-samples["mu_pop"], q=1-0.05/2) for i in range(x.shape[1])]) sig = (up)*(low)>0 keep = sig mu = mu[keep] low = low[keep] up = up[keep] labels = [label for i, label in enumerate(labels) if keep[i]] fig, ax = plt.subplots(figsize=[6.4, 3.2]) ax.scatter(mu, np.arange(len(labels))[::-1]) ax.errorbar(mu, np.arange(len(labels))[::-1], xerr=(mu-low,up-mu), ls="none", capsize=4, elinewidth=1) ax.set_yticks(np.arange(len(labels))[::-1], labels) fig, ax = plt.subplots(figsize=[6.4, 3.2]) df["m_ratio"] = df[f"{args.metric}_score"]/df[f"{args.metric}_score"].mean() research_areas = df.groupby("primary_research_area").agg( mu=("m_ratio", "mean"), low=("m_ratio", lambda x: np.quantile(x, q=0.05/2)), up=("m_ratio", lambda x: np.quantile(x, q=1-0.05/2)), label=("origin_label", lambda x: x.iloc[0]) ).reset_index() low, high = ax.get_xlim() bound = max(abs(low), abs(high)) ax.set_xlim(-bound, bound) ax.scatter(research_areas["mu"], research_areas.index) ax.errorbar(research_areas["mu"], research_areas.index, xerr=(research_areas["mu"]-research_areas["low"],research_areas["up"]-research_areas["low"]), ls="none", capsize=4, elinewidth=1) ax.set_yticks(research_areas.index, research_areas["label"]) ax.set_xlabel(f"Ratio to average {args.metric} score" if ratio else f"Difference with average {args.metric} score") ax.axvline(1 if ratio else 0, color="black") fig.savefig(opj(args.input, f"{args.metric}_research_area{fla}{args.output}.eps"), bbox_inches="tight") elif args.metric in ["change", "disruption"]: labels = [ "Intellectual capital (diversity)", "Social capital (diversity)", "Social capital (power)", "Stable 
affiliation", "Academic age", "Productivity (co-authored)", "Productivity (solo-authored)", ] names = [ "beta_int_div", "beta_soc_div", "beta_soc_cap", "beta_stable", "beta_age", "beta_productivity", "beta_productivity_solo" ] if not args.compact: labels = [f"\\textbf{{{label}}}" for label in labels] labels += topics samples = [ np.load(opj(args.input, f"samples_change_{args.diversity}_{args.power}{fla}{args.output}.npz")), np.load(opj(args.input, f"samples_disruption_{args.diversity}_{args.power}{fla}{args.output}.npz")) ] mu = [None, None] low = [None, None] up = [None, None] sig = [None, None] prob = [None, None] for i in range(2): mu[i] = np.array([samples[i][name].mean() for name in names] + [(samples[i]["beta_x"][:,j]*samples[i]["tau"]).mean() for j in range(x.shape[1])]) low[i] = np.array([np.quantile(samples[i][name], q=0.05/2) for name in names] + [np.quantile(samples[i]["beta_x"][:,j]*samples[i]["tau"], q=0.05/2) for j in range(x.shape[1])]) up[i] = np.array([np.quantile(samples[i][name], q=1-0.05/2) for name in names] + [np.quantile(samples[i]["beta_x"][:,j]*samples[i]["tau"], q=1-0.05/2) for j in range(x.shape[1])]) sig[i] = up[i]*low[i]>0 prob[i] = np.array([(samples[i][name]*np.sign(samples[i][name].mean())<0).mean() for name in names] + [((samples[i]["beta_x"][:,j]*np.sign(samples[i]["beta_x"][:,j].mean()))<0).mean() for j in range(x.shape[1])]) if args.compact: keep = (np.arange(len(sig[0]))0 else ">" for j, _mu in enumerate(mu[i])] for i in range(2)] labels = [label for i, label in enumerate(labels) if keep[i]] n_vars = len(labels) if args.compact: labels = [ '\n'.join(map(lambda x: f"\\textbf{{{x}}}", wrap(label, width=15))) if i < 4 else '\n'.join(wrap(label, width=15)) for i, label in enumerate(labels) ] print(labels) # effect of capital and controls fig, ax = plt.subplots(figsize=[4.8 if args.compact else 6.4, 0.52*(1+n_vars)]) colors = ['#377eb8', '#ff7f00'] legend = ["change ($c_a$)", "cognitive distance ($d_a$)"] for j in range(2): R2 = 
samples[j]["R2"].mean() dy = -0.125 if j else +0.125 ax.scatter(mu[j], np.arange(len(labels))[::-1]+dy, color=colors[j]) ax.errorbar(mu[j], np.arange(len(labels))[::-1]+dy, xerr=(mu[j]-low[j],up[j]-mu[j]), ls="none", capsize=4, elinewidth=1, color=colors[j], label=f"{legend[j]}, $R^2={R2:.2f}$") for i, p in enumerate(prob[j]): significant = p<0.05/2 if p>1e-4 and np.abs(p-0.5)>0.4 and significant: ax.text( -0.02 if mu[j][i]>0 else 0.02, np.arange(len(labels))[::-1][i]+dy, f"\\scriptsize $\\mu(\\beta)={mu[j][i]:.2g},P(\\beta{sign[j][i]}0)={p:.2g}$", ha="right" if mu[j][i]>0 else "left", va="center" ) elif p>1e-4 and np.abs(p-0.5)>0.4 and (not significant): ax.text( -0.02 if mu[j][i]>0 else 0.02, np.arange(len(labels))[::-1][i]+dy, f"\\scriptsize $P(\\beta{sign[j][i]}0)={p:.2g}$", ha="right" if mu[j][i]>0 else "left", va="center" ) elif significant: ax.text( -0.02 if mu[j][i]>0 else 0.02, np.arange(len(labels))[::-1][i]+dy, f"\\scriptsize $\\mu(\\beta)={mu[j][i]:.2g}$", ha="right" if mu[j][i]>0 else "left", va="center" ) low, high = ax.get_xlim() bound = max(abs(low), abs(high)) ax.set_xlim(-bound, bound) ax.set_yticks(np.arange(len(labels))[::-1], labels) ax.set_xlabel(f"Effect size (standard deviations)") ax.axvline(0, color="black") if args.compact: ax.legend(loc='upper right', bbox_to_anchor=(1, 1.3)) else: ax.legend(loc='upper right', bbox_to_anchor=(1, 1.2)) fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}{'_compact' if args.compact else ''}{fla}{args.output}.eps"), bbox_inches="tight") fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}{'_compact' if args.compact else ''}{fla}{args.output}.pdf"), bbox_inches="tight") fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}{'_compact' if args.compact else ''}{fla}{args.output}.png"), bbox_inches="tight", dpi=300) else: labels = [ "Intellectual capital (diversity)", "Social capital (diversity)", "Social 
capital (power)", "Stable affiliation", "Academic age", "Productivity (co-authored)", "Productivity (solo-authored)", ] if not args.compact: labels = [f"\\textbf{{{label}}}" for label in labels] labels += topics samples = [ np.load(opj(args.input, f"samples_entered_{args.diversity}_{args.power}{fla}{args.output}.npz")), np.load(opj(args.input, f"samples_exited_{args.diversity}_{args.power}{fla}{args.output}.npz")) ] mu = [None, None] low = [None, None] up = [None, None] sig = [None, None] prob = [None, None] for i in range(2): mu[i] = np.array([samples[i][name].mean() for name in names] + [(samples[i]["beta_x"][:,j]*samples[i]["tau"]).mean() for j in range(x.shape[1])]) low[i] = np.array([np.quantile(samples[i][name], q=0.05/2) for name in names] + [np.quantile(samples[i]["beta_x"][:,j]*samples[i]["tau"], q=0.05/2) for j in range(x.shape[1])]) up[i] = np.array([np.quantile(samples[i][name], q=1-0.05/2) for name in names] + [np.quantile(samples[i]["beta_x"][:,j]*samples[i]["tau"], q=1-0.05/2) for j in range(x.shape[1])]) sig[i] = up[i]*low[i]>0 prob[i] = np.array([(samples[i][name]*np.sign(samples[i][name].mean())<0).mean() for name in names] + [((samples[i]["beta_x"][:,j]*np.sign(samples[i]["beta_x"][:,j].mean()))<0).mean() for j in range(x.shape[1])]) if args.compact: keep = (np.arange(len(sig[0]))0 else ">" for j, _mu in enumerate(mu[i])] for i in range(2)] labels = [label for i, label in enumerate(labels) if keep[i]] n_vars = len(labels) if args.compact: labels = [ '\n'.join(map(lambda x: f"\\textbf{{{x}}}", wrap(label, width=15))) if i < 4 else '\n'.join(wrap(label, width=15)) for i, label in enumerate(labels) ] print(labels) # effect of capital and controls fig, ax = plt.subplots(figsize=[4.8 if args.compact else 6.4, 0.52*(1+n_vars)]) colors = ['#377eb8', '#ff7f00'] legend = ["entered new research area", "exited research area"] if args.compact: ax.set_xlim(-0.9, 1.25) for j in range(2): dy = -0.125 if j else +0.125 ax.scatter(mu[j], 
np.arange(len(labels))[::-1]+dy, color=colors[j]) ax.errorbar(mu[j], np.arange(len(labels))[::-1]+dy, xerr=(mu[j]-low[j],up[j]-mu[j]), ls="none", capsize=4, elinewidth=1, color=colors[j], label=legend[j]) for i, p in enumerate(prob[j]): significant = p<0.05/2 if p>1e-4 and np.abs(p-0.5)>0.4 and significant: ax.text( -0.02 if mu[j][i]>0 else 0.02, np.arange(len(labels))[::-1][i]+dy, f"\\scriptsize $\\mu(\\beta)={mu[j][i]:.2g},P(\\beta{sign[j][i]}0)={p:.2g}$", ha="right" if mu[j][i]>0 else "left", va="center" ) elif p>1e-4 and np.abs(p-0.5)>0.4 and (not significant): ax.text( -0.02 if mu[j][i]>0 else 0.02, np.arange(len(labels))[::-1][i]+dy, f"\\scriptsize $P(\\beta{sign[j][i]}0)={p:.2g}$", ha="right" if mu[j][i]>0 else "left", va="center" ) elif significant: ax.text( -0.02 if mu[j][i]>0 else 0.02, np.arange(len(labels))[::-1][i]+dy, f"\\scriptsize $\\mu(\\beta)={mu[j][i]:.2g}$", ha="right" if mu[j][i]>0 else "left", va="center" ) low, high = ax.get_xlim() bound = max(abs(low), abs(high)) ax.set_xlim(-bound, bound) ax.set_yticks(np.arange(len(labels))[::-1], labels) ax.set_xlabel(f"Effect size (log odds ratio)") ax.axvline(0, color="black") if args.compact: ax.legend(loc='upper right', bbox_to_anchor=(1, 1.3)) else: ax.legend(loc='upper right', bbox_to_anchor=(1, 1.2)) fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}{'_compact' if args.compact else ''}{fla}{args.output}.eps"), bbox_inches="tight") fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}{'_compact' if args.compact else ''}{fla}{args.output}.pdf"), bbox_inches="tight") fig.savefig(opj(args.input, f"{args.metric}_score_effects_{args.diversity}_{args.power}{'_compact' if args.compact else ''}{fla}{args.output}.png"), bbox_inches="tight", dpi=300) table = df[["bai", "stable", f"{args.metric}_score", "intellectual_entropy", "social_entropy", "origin_label", "target_label"]].sort_values(f"{args.metric}_score", ascending=False) 
table.to_csv(opj(args.input, f"{args.metric}_scores.csv"))

# Turn author identifiers into LaTeX-friendly display names.
# (assumes identifiers look like "J.M.Maldacena.1" -- TODO confirm)
# `regex` is passed explicitly on every call: its default differs between
# pandas versions, so without it either the literal ".1" is treated as a
# pattern ("." matching any character) or the patterns below are treated as
# literals and never match.
display_name = table["bai"].str.replace(".1", "", regex=False)
# Leading initial: "J." -> "J.~"
display_name = display_name.str.replace(r"^([A-Z])\.", r"\1.~", regex=True)
# Second initial: ".~M." -> ".~M.~"
display_name = display_name.str.replace(r"\.\~([A-Z])\.", r".~\1.~", regex=True)
# Multi-letter name parts: trailing dot becomes a space.
display_name = display_name.str.replace(r"([a-zA-Z]{2,})\.", r"\1 ", regex=True)
table["bai"] = display_name

# Flag rows whose `stable` value is truthy with an asterisk.
table["bai"] = table.apply(
    lambda r: r["bai"] if not r["stable"] else f"{r['bai']} ($\\ast$)", axis=1
)

# Sentinel appended to the last column so each row terminator can be located
# in the generated LaTeX and followed by an \hline.
table["target_label"] += "EOL"


def _write_ranking_table(rows, caption, stem):
    """Render `rows` as a LaTeX table and write it to `<stem>_<metric>.tex`.

    `rows` is a slice of `table`, `caption` is the LaTeX caption, and `stem`
    ("top" or "low") is used for both the file name and the table label.
    """
    latex = rows.to_latex(
        columns=[
            "bai",
            f"{args.metric}_score",
            "intellectual_entropy",
            "social_entropy",
            "origin_label",
            "target_label",
        ],
        header=[
            "Physicist",
            "$c_a$",
            "$D(\\bm{I_a})$",
            "$D(\\bm{S_a})$",
            "Previous main area",
            "Current main area",
        ],
        index=False,
        multirow=True,
        multicolumn=True,
        column_format='p{0.15\\textwidth}|c|c|c|b{0.25\\textwidth}|b{0.25\\textwidth}',
        escape=False,
        float_format=lambda value: f"{value:.2f}",
        caption=caption,
        label=f"table:{stem}_{args.metric}",
        position="H",
    )
    # Swap the sentinel for a row terminator followed by a horizontal rule.
    latex = latex.replace('EOL \\\\\n', '\\\\ \\hline\n')
    with open(opj(args.input, f"{stem}_{args.metric}.tex"), "w+") as fp:
        fp.write(latex)


# `table` is already sorted by descending score, so head(20) is the top 20.
_write_ranking_table(
    table.head(20),
    "Physicists with the highest change scores $c_a$. $D(\\bm{I_a})$ and $D(\\bm{S_a})$ measure the diversity of intellectual and social capital. Numbers in parentheses indicate the share of attention dedicated to each research area during each time-period. Asterisks ($\\ast$) indicate physicists with a permanent position.",
    "top",
)
_write_ranking_table(
    table.sort_values(f"{args.metric}_score", ascending=True).head(20),
    "Physicists with the lowest change scores $c_a$. $D(\\bm{I_a})$ and $D(\\bm{S_a})$ measure the diversity of intellectual and social capital. Numbers in parentheses indicate the share of attention dedicated to each research area. Asterisks ($\\ast$) indicate physicists with a permanent position.",
    "low",
)