# citation_matrix.py — builds and plots inter-category citation matrices
# (Experiment / Phenomenology / Theory) for HEP articles from INSPIRE data.
  1. import pandas as pd
  2. import numpy as np
  3. import networkx as nx
  4. import random
  5. import matplotlib
  6. from matplotlib import pyplot as plt
  7. import seaborn as sns
  8. matplotlib.use("pgf")
  9. matplotlib.rcParams.update(
  10. {
  11. "pgf.texsystem": "xelatex",
  12. "font.family": "serif",
  13. "font.serif": "Times New Roman",
  14. "text.usetex": True,
  15. "pgf.rcfonts": False,
  16. }
  17. )
  18. def is_hep(categories: str):
  19. return any(["-HEP" in x for x in categories])
  20. articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "categories", "date_created"]]
  21. articles["is_hep"] = articles.categories.map(is_hep)
  22. articles = articles[articles["is_hep"]]
  23. articles["th"] = articles.categories.map(lambda l: "Theory-HEP" in l)
  24. articles["exp"] = articles.categories.map(lambda l: "Experiment-HEP" in l)
  25. articles["ph"] = articles.categories.map(lambda l: "Phenomenology-HEP" in l)
  26. articles["year"] = articles["date_created"].str[:4].replace('', 0).astype(int)
  27. articles = articles[(articles["year"] >= 2001) & (articles["year"] < 2020)]
  28. references = pd.read_parquet("inspire-harvest/database/articles_references.parquet")
  29. references = references.merge(articles[["article_id", "th", "exp", "ph"]], how='inner', left_on="cited", right_on="article_id")
  30. articles = articles.merge(references, how='inner', left_on='article_id', right_on='cites', suffixes = ("_cites", "_cited"))
  31. selected_articles = articles#[articles["article_id_cited"].isin(random.sample(set(articles["article_id_cited"].unique()), 10000))]
  32. groups = ['exp', 'ph', 'th']
  33. friendly_groups = ["Experiment", "Phenomenology", "Theory"]
  34. indices = {groups[i]: i for i in range(len(groups))}
  35. cites_matrix = np.zeros((len(groups),len(groups)))
  36. cited_matrix = np.zeros((len(groups),len(groups)))
  37. counts_cites = np.zeros(len(groups))
  38. counts_cited = np.zeros(len(groups))
  39. print("Building citation matrix")
  40. for cited, cites in selected_articles.groupby("article_id_cited"):
  41. for c in cites.to_dict(orient="records"):
  42. w_cites = 1/(int(c["exp_cites"])+int(c["ph_cites"])+int(c["th_cites"]))
  43. w_cited = 1/(int(c["exp_cited"])+int(c["ph_cited"])+int(c["th_cited"]))
  44. for i in range(len(indices)):
  45. for j in range(len(indices)):
  46. if c[f"{groups[i]}_cites"] and c[f"{groups[j]}_cited"]:
  47. cites_matrix[i,j] += w_cited*w_cites
  48. cited_matrix[j,i] += w_cites*w_cited
  49. counts_cites[i] += w_cited*w_cites
  50. counts_cited[j] += w_cites*w_cited
  51. sns.heatmap((cites_matrix/counts_cites.reshape(-1,1)).transpose(), cmap="Reds", annot=True, fmt=".2f", xticklabels = friendly_groups, yticklabels = friendly_groups, vmin=0, vmax=1)
  52. plt.xlabel("Citing article's category")
  53. plt.ylabel("Cited article's category")
  54. plt.savefig("plots/cites_matrix.pgf")
  55. plt.savefig("plots/cites_matrix.pdf")
  56. plt.savefig("plots/cites_matrix.eps")
  57. plt.clf()
  58. sns.heatmap((cited_matrix/counts_cited.reshape(-1,1)).transpose(), cmap="Reds", annot=True, fmt=".2f", xticklabels = friendly_groups, yticklabels = friendly_groups, vmin=0, vmax=1)
  59. plt.xlabel("Cited article's category")
  60. plt.ylabel("Citing article's category")
  61. plt.savefig("plots/cited_matrix.pgf")
  62. plt.savefig("plots/cited_matrix.pdf")
  63. plt.savefig("plots/cited_matrix.eps")