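# Fit a linear trend to each topic's average yearly probability, flag topics
# whose trend is statistically significant, and plot the three strongest
# declines ("coldest") and rises ("hottest") over the selected year range.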
import argparse

import numpy as np
import pandas as pd
import matplotlib

# Select the pgf backend before pyplot is imported, so all text is rendered
# through XeLaTeX (this requires a working LaTeX installation).
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
from matplotlib import pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression
def pearsonr_ci(x, y, alpha=0.05):
    '''Calculate the Pearson correlation along with its confidence interval using scipy and numpy.

    Source: https://zhiyzuo.github.io/Pearson-Correlation-CI-in-Python/

    Parameters
    ----------
    x, y : iterable object such as a list or np.array
        Input for correlation calculation
    alpha : float
        Significance level. 0.05 by default

    Returns
    -------
    r : float
        Pearson's correlation coefficient
    pval : float
        The corresponding p value
    lo, hi : float
        The lower and upper bound of the confidence interval
    '''
    x = np.asarray(x)  # accept any iterable, as documented
    r, p = stats.pearsonr(x, y)
    # Fisher z-transform: arctanh(r) is approximately normal with SE = 1/sqrt(n - 3)
    r_z = np.arctanh(r)
    se = 1 / np.sqrt(x.size - 3)
    z = stats.norm.ppf(1 - alpha / 2)
    lo_z, hi_z = r_z - z * se, r_z + z * se
    # Map the bounds back onto the correlation scale
    lo, hi = np.tanh((lo_z, hi_z))
    return r, p, lo, hi
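# Note: with the default 2011-2019 window there are only 9 yearly points, so
# the Fisher-z standard error is 1/sqrt(9 - 3) ≈ 0.41 and only fairly strong
# correlations yield confidence intervals that exclude zero.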
def is_susy(s: str):
    """Return True if the (lower-cased) text mentions supersymmetry.

    The stem 'supersymmetr' matches both 'supersymmetry' and 'supersymmetric'.
    """
    return "supersymmetr" in s or "susy" in s
parser = argparse.ArgumentParser("extracting correlations")
parser.add_argument("--articles", default="output/hep-ct-75-0.1-0.001-130000-20/topics_0.parquet")
parser.add_argument("--since", help="since year", type=int, default=2011)
parser.add_argument("--until", help="until year", type=int, default=2019)
parser.add_argument("--domain", choices=["hep", "susy"], default="susy")
parser.add_argument("--descriptions", default="output/hep-ct-75-0.1-0.001-130000-20/descriptions.csv")
args = parser.parse_args()
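# Note: the default paths above point to one specific topic-model run
# (hep-ct-75-0.1-0.001-130000-20); override them to match your own outputs.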
years = np.arange(args.since, args.until + 1)
n_years = len(years)
articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "date_created", "pacs_codes", "categories", "abstract", "title"]]
if args.domain == "susy":
    # fillna("") guards against missing abstracts/titles, which would make is_susy() fail
    susy_abstract = articles["abstract"].fillna("").str.lower().map(is_susy)
    susy_title = articles["title"].fillna("").str.lower().map(is_susy)
    articles = articles[susy_abstract | susy_title]
articles["article_id"] = articles["article_id"].astype(int)
# Empty or missing creation dates become year 0 and are filtered out below
articles["year"] = articles["date_created"].str[:4].fillna("").replace("", 0).astype(int)
articles = articles[(articles["year"] >= years.min()) & (articles["year"] <= years.max())]
topics = pd.read_parquet(args.articles)
topics["article_id"] = topics["article_id"].astype(int)
topics["topics"] = topics["probs"]  # alias: per-article topic probability vector
topics.drop(columns=["year"], inplace=True)  # the year comes from the articles table
topics = topics.merge(articles, how="inner", on="article_id")
n_topics = len(topics["topics"].iloc[0])
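# n_topics is inferred from the first row's probability vector; every article
# is assumed to carry a vector of the same length.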
cumprobs = np.zeros((n_years, n_topics))
counts = np.zeros(n_years)
for year, _articles in topics.groupby("year"):
    for article in _articles.to_dict(orient="records"):
        for topic, prob in enumerate(article["probs"]):
            cumprobs[year - years.min(), topic] += prob
    counts[year - years.min()] = len(_articles)
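# cumprobs[i, k] now holds the summed probability of topic k over all articles
# published in years[i]; dividing by counts[i] gives the yearly average.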
fits = []
for topic in range(n_topics):
    y = cumprobs[:, topic] / counts  # average probability of this topic per year
    reg = LinearRegression().fit(years.reshape(-1, 1), y)
    r, p, lo_95, hi_95 = pearsonr_ci(years, y)
    r, p, lo_99, hi_99 = pearsonr_ci(years, y, alpha=0.01)
    fits.append({
        'topic': topic,
        # LinearRegression.score() already returns R^2, so it must not be squared again
        'r2': reg.score(years.reshape(-1, 1), y),
        'slope': reg.coef_[0],
        'lower_95': lo_95,
        'high_95': hi_95,
        'lower_99': lo_99,
        'high_99': hi_99
    })
fits = pd.DataFrame(fits)
fits = fits.merge(pd.read_csv(args.descriptions))
# A trend is significant when the confidence interval excludes zero,
# i.e. when both bounds have the same sign
fits['95_significant'] = fits['lower_95'] * fits['high_95'] > 0
fits['99_significant'] = fits['lower_99'] * fits['high_99'] > 0
fits.sort_values("slope", ascending=True, inplace=True)
fits.to_csv('output/fits.csv')
# Sorted by slope, the head holds the steepest declines and the tail the steepest rises
significant_dec = fits[fits["99_significant"]].head(3)
significant_inc = fits[fits["99_significant"]].tail(3)
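# Plot cold topics on the left panel and hot topics on the right; sharey=True
# keeps both panels on a common scale so they can be compared directly.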
fig, axes = plt.subplots(1, 2, sharey=True)
colors = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3']

ax = axes[0]
for n, topic in enumerate(significant_dec.to_dict(orient="records")):
    ax.plot(
        years,
        cumprobs[:, topic['topic']] / counts,
        color=colors[n]
    )
    ax.scatter(
        years,
        cumprobs[:, topic['topic']] / counts,
        label=topic['description'],
        color=colors[n]
    )
ax.set_ylabel("Average relative contribution of each topic per year ($\\bar{\\theta}_z$)")
ax.set_xlim(years.min(), years.max())
ax.legend(fontsize='x-small', loc="upper right")
ax = axes[1]
for n, topic in enumerate(significant_inc.to_dict(orient="records")):
    ax.plot(
        years,
        cumprobs[:, topic['topic']] / counts,
        color=colors[n]
    )
    ax.scatter(
        years,
        cumprobs[:, topic['topic']] / counts,
        label=topic['description'],
        color=colors[n]
    )
ax.set_xlim(years.min(), years.max())
ax.legend(fontsize='x-small', loc="upper right")
fig.suptitle(
    "Coldest topics (left) and hottest topics (right) – {}, {}-{}".format(
        "high-energy physics" if args.domain == "hep" else "supersymmetry",
        args.since,
        args.until
    )
)
# Export as .pgf for direct LaTeX inclusion, plus standalone .pdf and .eps copies
plt.savefig(f"plots/hot_cold_topics_hep_{args.since}_{args.until}_{args.domain}.pgf", bbox_inches="tight")
plt.savefig(f"plots/hot_cold_topics_hep_{args.since}_{args.until}_{args.domain}.pdf", bbox_inches="tight")
plt.savefig(f"plots/hot_cold_topics_hep_{args.since}_{args.until}_{args.domain}.eps", bbox_inches="tight")