# hep_citations_with_crosslists.py
  1. import pandas as pd
  2. import numpy as np
  3. import networkx as nx
  4. import random
  5. hep_cats = {
  6. "Theory-HEP": 0,
  7. "Phenomenology-HEP": 1,
  8. "Experiment-HEP": 2
  9. }
  10. def hep_filter(categories: list):
  11. return list(set(categories)&set(hep_cats.keys()))
  12. articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "categories", "date_created"]]
  13. articles["categories"] = articles.categories.map(hep_filter)
  14. articles = articles[articles.categories.map(len)>0]
  15. articles["cross_list"] = articles.categories.map(len)>1
  16. for cat in hep_cats:
  17. articles[cat] = articles.categories.map(lambda cats: 1 if cat in cats else 0)
  18. authors_references = pd.read_parquet("inspire-harvest/database/articles_authors.parquet")
  19. authors = authors_references.merge(articles[["article_id"] + list(hep_cats.keys())], how="inner", left_on="article_id", right_on="article_id")
  20. authors = authors.groupby("bai").agg(**{
  21. cat.replace("-", "_"): (cat, "sum") for cat in hep_cats
  22. })
  23. authors = authors[authors.sum(axis=1)>=3]
  24. primary_category = authors.idxmax(axis=1).str.replace("_","-")
  25. primary_category.to_csv("output/authors_primary_category.csv")
  26. articles = articles.merge(authors_references, how="left", left_on="article_id", right_on="article_id")
  27. articles = articles.merge(authors, how="left", left_on="bai", right_on="bai")
  28. d = {
  29. "categories": ("categories", "first"),
  30. "date_created": ("date_created", "first")
  31. }
  32. d.update({
  33. cat.replace("-", "_"): (cat, "sum")
  34. for cat in hep_cats
  35. })
  36. articles = articles.groupby(["article_id"]).agg(**d).reset_index()
  37. def decision_function(row):
  38. if len(row["categories"])==1:
  39. print("ok")
  40. return hep_cats[row["categories"][0]]
  41. else:
  42. contribs = np.array([row[cat.replace("-", "_")] for cat in hep_cats])
  43. most_frequent = np.argmax(contribs)
  44. tie = np.count_nonzero((contribs == most_frequent).astype(int))>1
  45. print(most_frequent, tie)
  46. return most_frequent if not tie else -1
  47. articles["category"] = articles.apply(decision_function, axis=1)
  48. articles = articles[articles["category"]>=0]
  49. articles["year"] = articles["date_created"].str[:4].replace('', 0).astype(int)
  50. articles = articles[(articles["year"] >= 1980) & (articles["year"] < 2020)]
  51. references = pd.read_parquet("inspire-harvest/database/articles_references.parquet")
  52. references = references.merge(articles[["article_id", "category", "year"]], how='inner', left_on="cited", right_on="article_id")
  53. articles = articles.merge(references, how='inner', left_on='article_id', right_on='cites', suffixes = ("_cites", "_cited"))
  54. articles.to_parquet("output/cross_citations_crosslists.parquet")