trends_in_susy.py 1.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061
  1. import pandas as pd
  2. import numpy as np
  3. from matplotlib import pyplot as plt
  4. import seaborn as sns
  5. from sklearn.preprocessing import MultiLabelBinarizer
  6. import argparse
  7. selected_topics = [11,33,37,42,60,64]
  8. years = np.arange(1980,2020)
  9. n_years = len(years)
  10. def is_susy(s: str):
  11. return "supersymmetr" in s or "susy" in s
  12. parser = argparse.ArgumentParser("extracting correlations")
  13. parser.add_argument("articles")
  14. args = parser.parse_args()
  15. articles = pd.read_parquet("../inspire-harvest/database/articles.parquet")[["article_id", "date_created", "pacs_codes", "categories", "abstract"]]
  16. articles = articles[articles["abstract"].str.lower().map(is_susy) == True]
  17. articles["article_id"] = articles["article_id"].astype(int)
  18. articles["year"] = articles["date_created"].str[:4].replace('', 0).astype(float).fillna(0).astype(int)
  19. articles = articles[(articles["year"] >= years.min()) & (articles["year"] <= years.max())]
  20. topics = pd.read_parquet(args.articles)
  21. topics["article_id"] = topics["article_id"].astype(int)
  22. topics["topics"] = topics["probs"]
  23. topics.drop(columns = ["year"], inplace = True)
  24. topics = topics.merge(articles, how="inner", left_on = "article_id", right_on = "article_id")
  25. n_topics = len(topics["topics"].iloc[0])
  26. cumprobs = np.zeros((n_years, n_topics))
  27. counts = np.zeros(n_years)
  28. for year, _articles in topics.groupby("year"):
  29. for article in _articles.to_dict(orient = 'records'):
  30. for topic, prob in enumerate(article['probs']):
  31. cumprobs[year-years.min(),topic] += prob
  32. counts[year-years.min()] = len(_articles)
  33. for topic in selected_topics:
  34. plt.plot(
  35. years,
  36. cumprobs[:,topic]/counts,
  37. # linestyle=lines[topic//7],
  38. label=topic
  39. )
  40. plt.title("Relative magnitude of topics within abstracts mentioning supersymmetry")
  41. plt.ylabel("Probability of each topic throughout years\n($p(t|\\mathrm{year}$)")
  42. plt.xlim(1980, 2018)
  43. plt.legend(fontsize='x-small')
  44. plt.show()