|
@@ -40,6 +40,48 @@ parser.add_argument("--model", default="", choices=["", "bare"])
|
|
|
parser.add_argument("--compact", action="store_true", default=False)
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
+def age():
+    """Return the academic age of every author, cached in ``age.csv``.
+
+    An article's age is 2015 minus the year given by the first four characters
+    of its ``date_created``; an author's age is the largest age among the
+    articles they are listed on (i.e. it is set by their earliest article).
+    The table is computed once, written to ``age.csv`` under ``args.input``,
+    and read back from that file on subsequent runs.
+
+    Returns:
+        pandas.DataFrame with the columns ``bai`` and ``age``.
+    """
+    reference_year = 2015
+    cache = opj(args.input, "age.csv")
+
+    if exists(cache):
+        # usecols discards the unnamed index column present in cache files that
+        # were written with the DataFrame index, so both branches match in shape.
+        return pd.read_csv(cache, usecols=["bai", "age"])
+
+    articles = pd.read_parquet(
+        "../semantics/inspire-harvest/database/articles.parquet",
+        columns=["article_id", "date_created"],
+    )
+    articles["article_id"] = articles["article_id"].astype(int)
+    # Rows whose date is shorter than four characters carry no usable year.
+    articles = articles[articles["date_created"].str.len() >= 4].copy()
+    articles["age"] = reference_year - articles["date_created"].str[:4].astype(int)
+
+    authors = pd.read_parquet(
+        "../semantics/inspire-harvest/database/articles_authors.parquet",
+        columns=["article_id", "bai"],
+    )
+    authors["article_id"] = authors["article_id"].astype(int)
+
+    author_age = articles[["article_id", "age"]].merge(authors, how="inner", on="article_id")
+    author_age = author_age.groupby("bai").agg(age=("age", "max")).reset_index()
+
+    # index=False keeps the file limited to the two columns returned here.
+    author_age.to_csv(cache, index=False)
+    return author_age
|
|
|
+
|
|
|
def institution_stability():
|
|
|
if exists(opj(args.input, "institutional_stability.csv")):
|
|
|
return pd.read_csv(opj(args.input, "institutional_stability.csv"), index_col="bai")
|
|
@@ -104,12 +146,6 @@ x = NR/NR.sum(axis=1)[:,np.newaxis]
|
|
|
y = NC/NC.sum(axis=1)[:,np.newaxis]
|
|
|
S_distrib = S/S.sum(axis=1)[:,np.newaxis]
|
|
|
|
|
|
-
|
|
|
-# R = np.array([
|
|
|
-# [((expertise[:,i]>expertise[:,i].mean())&(expertise[:,j]>expertise[:,j].mean())).mean()/((expertise[:,i]>expertise[:,i].mean())|(expertise[:,j]>expertise[:,j].mean())).mean() for j in range(len(topics))]
|
|
|
-# for i in range(len(topics))
|
|
|
-# ])
|
|
|
-
|
|
|
R = np.array([
|
|
|
[((expertise[:,i]>expertise[:,i].mean())&(expertise[:,j]>expertise[:,j].mean())).mean()/(expertise[:,i]>expertise[:,i].mean()).mean() for j in range(len(topics))]
|
|
|
for i in range(len(topics))
|
|
@@ -183,9 +219,7 @@ df["social_stirling"] = 1-np.einsum("ij,kij->k", R, social_expertise_matrix)
|
|
|
|
|
|
stability = institution_stability()
|
|
|
df = df.merge(stability, left_on="bai", right_index=True)
|
|
|
-
|
|
|
-age = pd.read_csv(opj(args.input, "outcomes.csv"))[["bai", "age"]].drop_duplicates()
|
|
|
-df = df.merge(age, left_on="bai", right_on="bai")
|
|
|
+df = df.merge(age(), left_on="bai", right_on="bai")
|
|
|
|
|
|
df["primary_research_area"] = x.argmax(axis=1)
|
|
|
|
|
@@ -228,7 +262,8 @@ fig.savefig(opj(args.input, "disruption_score_collider_physics.eps"), bbox_inche
|
|
|
|
|
|
if not exists(opj(args.input, f"samples_{args.metric}_{args.diversity}_{args.power}.npz")):
|
|
|
model = CmdStanModel(
|
|
|
- stan_file=f"code/{args.metric}.stan" if args.model=="" else f"code/{args.metric}_{args.model}_{args.power}.stan",
|
|
|
+ stan_file=f"code/{args.metric}.stan" if args.model==""
|
|
|
+ else f"code/{args.metric}_{args.model}_{args.power}.stan",
|
|
|
)
|
|
|
|
|
|
fit = model.sample(
|
|
@@ -254,13 +289,14 @@ labels = [
|
|
|
"Social capital (diversity)",
|
|
|
"Social capital (power)",
|
|
|
"Stable affiliation",
|
|
|
+ "Academic age",
|
|
|
]
|
|
|
labels = [f"\\textbf{{{label}}}" for label in labels]
|
|
|
|
|
|
labels += topics
|
|
|
|
|
|
names = [
|
|
|
- "beta_int_div", "beta_soc_div", "beta_soc_cap", "beta_stable"
|
|
|
+ "beta_int_div", "beta_soc_div", "beta_soc_cap", "beta_stable", "beta_age",
|
|
|
]
|
|
|
|
|
|
if args.metric not in ["entered", "exited"]:
|
|
@@ -359,6 +395,7 @@ else:
|
|
|
"Social capital (diversity)",
|
|
|
"Social capital (power)",
|
|
|
"Stable affiliation",
|
|
|
+ "Academic age",
|
|
|
]
|
|
|
|
|
|
if not args.compact:
|