hep_citations.py 1.1 KB

import pandas as pd
import numpy as np
import networkx as nx
import random

# Map the three HEP categories to integer labels.
hep_cats = {
    "Theory-HEP": 0,
    "Phenomenology-HEP": 1,
    "Experiment-HEP": 2
}

def hep_filter(categories: list):
    # Keep only the categories that belong to the HEP set.
    return list(set(categories) & set(hep_cats.keys()))

# Load articles and keep those assigned to exactly one HEP category.
articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "categories", "date_created"]]
articles["categories"] = articles.categories.map(hep_filter)
articles = articles[articles.categories.map(len) == 1]
articles["category"] = articles.categories.map(lambda x: hep_cats[x[0]])

# Extract the publication year and restrict to 1980-2019.
articles["year"] = articles["date_created"].str[:4].replace('', 0).astype(int)
articles = articles[(articles["year"] >= 1980) & (articles["year"] < 2020)]

# Attach the cited article's category and year to each reference,
# then join back on the citing article to obtain (citing, cited) category pairs.
references = pd.read_parquet("inspire-harvest/database/articles_references.parquet")
references = references.merge(articles[["article_id", "category", "year"]], how='inner', left_on="cited", right_on="article_id")
articles = articles.merge(references, how='inner', left_on='article_id', right_on='cites', suffixes=("_cites", "_cited"))

articles.to_parquet("output/cross_citations.parquet")
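
After the final merge, each row describes one citation: the citing article's category ends up in category_cites and the cited article's in category_cited. As a minimal follow-up sketch (not part of the script itself, and assuming the output path used above), one could tabulate citation counts between categories like this:

import pandas as pd

# Hypothetical sanity check on the file written by hep_citations.py:
# count how often each HEP category cites each other category.
pairs = pd.read_parquet("output/cross_citations.parquet")
counts = (
    pairs.groupby(["category_cites", "category_cited"])
         .size()
         .unstack(fill_value=0)
)
print(counts)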