import pandas as pd
import numpy as np
import networkx as nx
import random

# Map the three INSPIRE HEP categories to integer labels.
hep_cats = {
    "Theory-HEP": 0,
    "Phenomenology-HEP": 1,
    "Experiment-HEP": 2
}

def hep_filter(categories: list):
    # Keep only the HEP categories among an article's category tags.
    return list(set(categories) & set(hep_cats.keys()))

# Load articles and keep those assigned to exactly one HEP category.
articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "categories", "date_created"]]
articles["categories"] = articles.categories.map(hep_filter)
articles = articles[articles.categories.map(len) == 1]
articles["category"] = articles.categories.map(lambda x: hep_cats[x[0]])

# Parse the publication year and restrict to 1980-2019.
articles["year"] = articles["date_created"].str[:4].replace('', 0).astype(int)
articles = articles[(articles["year"] >= 1980) & (articles["year"] < 2020)]

# Attach the cited article's category and year to each citation link,
# then keep only links whose citing article is also in the sample.
references = pd.read_parquet("inspire-harvest/database/articles_references.parquet")
references = references.merge(articles[["article_id", "category", "year"]], how='inner', left_on="cited", right_on="article_id")
articles = articles.merge(references, how='inner', left_on='article_id', right_on='cites', suffixes=("_cites", "_cited"))

articles.to_parquet("output/cross_citations.parquet")
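
# --- Illustrative follow-up (not part of the original script) ---
# A minimal sketch of how the output could be inspected: since both merged
# frames share a "category" column, the suffixes above yield
# "category_cites" (citing article) and "category_cited" (cited article),
# which can be cross-tabulated into a 3x3 citation-count matrix.
cross = pd.read_parquet("output/cross_citations.parquet")

# Rows: category of the citing article; columns: category of the cited article.
counts = (
    cross.groupby(["category_cites", "category_cited"])
         .size()
         .unstack(fill_value=0)
)

# Translate the integer labels back to category names for readability.
labels = {v: k for k, v in hep_cats.items()}
print(counts.rename(index=labels, columns=labels))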