trends_in_susy.py

import argparse

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

# Topics to plot and the year range covered by the analysis.
selected_topics = [11, 33, 37, 42, 60, 64]
years = np.arange(1980, 2020)
n_years = len(years)


def is_susy(s: str) -> bool:
    """True if a (lower-cased) abstract mentions supersymmetry or SUSY."""
    return "supersymmetr" in s or "susy" in s


parser = argparse.ArgumentParser(description="extracting correlations")
parser.add_argument("articles")  # per-article topic-probability parquet
args = parser.parse_args()

# Load article metadata and keep only abstracts mentioning supersymmetry.
articles = pd.read_parquet("../inspire-harvest/database/articles.parquet")[
    ["article_id", "date_created", "pacs_codes", "categories", "abstract"]
]
articles = articles[
    articles["abstract"].str.lower().map(is_susy, na_action="ignore") == True
]
articles["article_id"] = articles["article_id"].astype(int)

# Derive the publication year and restrict to the studied window.
articles["year"] = (
    articles["date_created"].str[:4].replace("", 0).astype(float).fillna(0).astype(int)
)
articles = articles[
    (articles["year"] >= years.min()) & (articles["year"] <= years.max())
]

# Attach the inferred topic probabilities to the matching articles.
topics = pd.read_parquet(args.articles)
topics["article_id"] = topics["article_id"].astype(int)
topics["topics"] = topics["probs"]
topics.drop(columns=["year"], inplace=True)
topics = topics.merge(articles, how="inner", on="article_id")

n_topics = len(topics["topics"].iloc[0])

# Accumulate topic probabilities and article counts per year.
cumprobs = np.zeros((n_years, n_topics))
counts = np.zeros(n_years)
for year, _articles in topics.groupby("year"):
    for article in _articles.to_dict(orient="records"):
        for topic, prob in enumerate(article["probs"]):
            cumprobs[year - years.min(), topic] += prob
    counts[year - years.min()] = len(_articles)

# Plot the yearly average probability of each selected topic.
for topic in selected_topics:
    plt.plot(
        years,
        cumprobs[:, topic] / counts,
        # linestyle=lines[topic // 7],
        label=topic,
    )

plt.title("Relative magnitude of topics within abstracts mentioning supersymmetry")
plt.ylabel("Probability of each topic throughout years\n($p(t|\\mathrm{year})$)")
plt.xlim(1980, 2018)
plt.legend(fontsize="x-small")
plt.show()
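
Usage sketch (the parquet path below is hypothetical): despite its name, the positional `articles` argument receives the per-article topic-probability parquet, while the INSPIRE metadata table is loaded from a hard-coded relative path, so the script should be run from a directory where `../inspire-harvest/database/articles.parquet` resolves.

    python trends_in_susy.py topic_probs.parquet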