123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
- import pandas as pd
- import numpy as np
- import networkx as nx
- import random
# Maps each HEP category label to the integer code returned by
# decision_function and stored in the per-article "category" column.
hep_cats = {
    "Theory-HEP": 0,
    "Phenomenology-HEP": 1,
    "Experiment-HEP": 2,
}


def hep_filter(categories: list):
    """Return the HEP categories present in *categories*.

    Args:
        categories: Category labels attached to one article. ``None`` is
            treated as "no categories".

    Returns:
        A list with every key of ``hep_cats`` that occurs in *categories*,
        without duplicates, in the order the keys are declared in
        ``hep_cats``.
    """
    if categories is None:
        return []
    present = set(categories)
    # Walk hep_cats rather than returning list(set & set): set iteration
    # order depends on string hash randomisation, which made the stored
    # category lists differ from one run to the next.
    return [cat for cat in hep_cats if cat in present]
# Load the article table, keeping only the columns used below.
articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[
    ["article_id", "categories", "date_created"]
]

# Reduce each article's category list to its HEP categories and drop the
# articles that have none.
articles["categories"] = articles.categories.map(hep_filter)
# .copy() makes the filtered frame independent of the original, so the column
# assignments below do not raise SettingWithCopyWarning.
articles = articles[articles.categories.map(len) > 0].copy()

# An article is cross-listed when it belongs to more than one HEP category.
articles["cross_list"] = articles.categories.map(len) > 1

# One 0/1 indicator column per HEP category, named after the category label.
for cat in hep_cats:
    articles[cat] = articles.categories.map(
        lambda cats, cat=cat: 1 if cat in cats else 0
    )
# Build, for every author identifier ("bai"), the summed per-category
# indicators of the HEP articles they signed.
authors_references = pd.read_parquet("inspire-harvest/database/articles_authors.parquet")
authors = authors_references.merge(
    articles[["article_id", *hep_cats]],
    how="inner",
    left_on="article_id",
    right_on="article_id",
)

# Output columns use "_" instead of "-" (e.g. "Theory_HEP"), each being the
# sum of the matching 0/1 indicator column.
author_totals = {}
for cat in hep_cats:
    author_totals[cat.replace("-", "_")] = (cat, "sum")
authors = authors.groupby("bai").agg(**author_totals)

# Keep only authors whose counts over all categories add up to at least 3.
has_enough_output = authors.sum(axis=1) >= 3
authors = authors[has_enough_output]

# The primary category is the column with the largest count, written back
# with the original "-" spelling of the label.
primary_category = authors.idxmax(axis=1)
primary_category = primary_category.str.replace("_", "-")
primary_category.to_csv("output/authors_primary_category.csv")
# Expand to one row per (article, author), then attach each author's
# per-category totals (the "Theory_HEP"-style columns of `authors`).
# Left joins keep articles whose authors are missing from `authors`; their
# totals are NaN and are skipped by the sum below.
articles = articles.merge(authors_references, how="left", left_on="article_id", right_on="article_id")
articles = articles.merge(authors, how="left", left_on="bai", right_on="bai")

# Named-aggregation spec: output column -> (input column, reducer).
d = {
    "categories": ("categories", "first"),
    "date_created": ("date_created", "first"),
}
# Sum the *author* totals (underscore columns). The previous spec summed the
# article's own 0/1 indicator columns ("Theory-HEP", ...), which only counted
# author rows and gave every listed category the same value, so the authors'
# publication history never influenced the result.
d.update({
    cat.replace("-", "_"): (cat.replace("-", "_"), "sum")
    for cat in hep_cats
})

# Collapse back to one row per article.
articles = articles.groupby(["article_id"]).agg(**d).reset_index()
def decision_function(row, category_index=None):
    """Pick the integer category code for one aggregated article row.

    Args:
        row: Mapping-like row (e.g. a ``pandas.Series``) with a
            ``"categories"`` sequence and one numeric entry per category,
            keyed by the category label with ``"-"`` replaced by ``"_"``.
        category_index: Mapping from category label to integer code. Defaults
            to the module-level ``hep_cats``.

    Returns:
        The code of the only listed category when the article has exactly
        one. Otherwise the position, in ``category_index`` order, of the
        category with the largest contribution, or ``-1`` when several
        categories share that largest value.
    """
    if category_index is None:
        category_index = hep_cats

    listed = row["categories"]
    if len(listed) == 1:
        return category_index[listed[0]]

    contribs = np.array([row[cat.replace("-", "_")] for cat in category_index])
    most_frequent = int(np.argmax(contribs))
    # Compare against the maximum *value*. The previous check compared the
    # contributions with the argmax *index*, which both missed real ties and
    # reported ties that did not exist.
    tie = np.count_nonzero(contribs == contribs[most_frequent]) > 1
    return -1 if tie else most_frequent
-
# Assign one category code per article; -1 marks an undecidable tie.
articles["category"] = articles.apply(decision_function, axis=1)
# .copy() so that adding the "year" column below does not write into a slice
# of the previous frame (SettingWithCopyWarning).
articles = articles[articles["category"] >= 0].copy()

# The year is the first four characters of date_created. Empty, missing or
# non-numeric dates become 0 and are removed by the range filter below,
# instead of crashing the integer conversion.
articles["year"] = (
    pd.to_numeric(articles["date_created"].str[:4], errors="coerce")
    .fillna(0)
    .astype(int)
)
articles = articles[(articles["year"] >= 1980) & (articles["year"] < 2020)]

# Attach the category and year of the cited article to every citation link,
# keeping only links whose cited article survived the filters above.
references = pd.read_parquet("inspire-harvest/database/articles_references.parquet")
references = references.merge(
    articles[["article_id", "category", "year"]],
    how="inner",
    left_on="cited",
    right_on="article_id",
)

# Join each citing article to its links; columns present on both sides get
# the "_cites" (citing article) and "_cited" (cited article) suffixes.
articles = articles.merge(
    references,
    how="inner",
    left_on="article_id",
    right_on="cites",
    suffixes=("_cites", "_cited"),
)
articles.to_parquet("output/cross_citations_crosslists.parquet")
|