123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116 |
"""Plot a 2-D t-SNE map of correlated-topic-model topics, coloured by category.

Steps performed by this script:

1. Load per-article topic probabilities and article categories, and compute
   the normalised pointwise mutual information (NPMI) between every topic and
   each of the three categories (Theory / Phenomenology / Experiment).  Each
   topic is assigned the category with which its NPMI is highest.
2. Read the topics in which the terms "supersymmetry" / "susy" are used; these
   are outlined in black and annotated with their description.
3. Embed the topics in 2-D with t-SNE, using ``1 - correlation`` between
   topics (from the trained ``tomotopy`` CT model) as a precomputed distance.
4. Rotate the embedding so that the horizontal axis follows the direction
   along which the category index varies, then draw the scatter plot and a
   per-category density of the horizontal coordinate.

Outputs: ``plots/topics_tsne.eps`` and ``plots/topics_tsne.png``.
"""
import textwrap

import matplotlib
import numpy as np
import pandas as pd
import tomotopy as tp

# Select the backend before pyplot (or anything importing it) is loaded.
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "xelatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)

from adjustText import adjust_text
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE

# ---------------------------------------------------------------------------
# Topic <-> category association (NPMI)
# ---------------------------------------------------------------------------
articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[
    ["article_id", "pacs_codes", "categories"]
]
articles["article_id"] = articles["article_id"].astype(int)

topics = pd.read_parquet("output/hep-ct-75-0.1-0.001-130000-20/topics_0.parquet")
topics["article_id"] = topics["article_id"].astype(int)
topics["topics"] = topics["probs"]
topics = topics.merge(articles, how="inner", on="article_id")

# Turn each article's category collection into three membership flags.
topics["categories"] = topics["categories"].map(
    lambda l: [x in l for x in ["Theory-HEP", "Phenomenology-HEP", "Experiment-HEP"]]
)

X = np.stack(topics["topics"].values)  # one row of topic probabilities per article
Y = np.stack(topics["categories"].values).astype(int)  # one row of 0/1 flags per article

# `num` is the product of the marginals p(topic) * p(category) and `den` is the
# joint p(topic, category), so that
#   log(num) / log(den) - 1 == log(p(t, c) / (p(t) p(c))) / (-log p(t, c)),
# i.e. the NPMI.
num = np.outer(X.sum(axis=0), Y.sum(axis=0)) / (X.shape[0] ** 2)
den = np.tensordot(X, Y, axes=([0], [0])) / X.shape[0]
npmi = np.log(num) / np.log(den) - 1
topic_main_category = npmi.argmax(axis=1).astype(int)

# ---------------------------------------------------------------------------
# Topics in which "supersymmetry" / "susy" are used
# ---------------------------------------------------------------------------
usages = pd.read_csv('output/supersymmetry_usages.csv')
usages = usages.groupby("term").agg(topic=("topic", lambda x: x.tolist()))
supersymmetry_topics = usages.loc["supersymmetry", "topic"]
# Append the "susy" topics that are not already listed for "supersymmetry".
susy_topics = supersymmetry_topics + [
    t for t in usages.loc["susy", "topic"] if t not in supersymmetry_topics
]

descriptions = pd.read_csv("output/hep-ct-75-0.1-0.001-130000-20/descriptions.csv")
# NOTE(review): this relies on the row labels of descriptions.csv being the
# topic ids -- confirm against the file that is actually written.
labels = descriptions.loc[susy_topics]["description"].tolist()

# Marker outline per topic: black for the highlighted topics, none otherwise.
is_susy_topic = np.zeros(len(topic_main_category), dtype=bool)
is_susy_topic[susy_topics] = True
edges = ["black" if flag else "none" for flag in is_susy_topic]

# ---------------------------------------------------------------------------
# 2-D embedding of the topics
# ---------------------------------------------------------------------------
mdl = tp.CTModel.load("output/hep-ct-75-0.1-0.001-130000-20/model")
correlations = mdl.get_correlations()

colors = ['#377eb8', '#ff7f00', '#4daf4a']
cats = ["Theory", "Phenomenology", "Experiment"]

# perplexity = 40 better identifies the different lumps
# init="random" is required with a precomputed metric: recent scikit-learn
# releases default to init="pca", which raises a ValueError in that case.
# Older releases already defaulted to "random".
tsne = TSNE(
    n_components=2,
    metric="precomputed",
    init="random",
    random_state=714,
    perplexity=40,
)
points = tsne.fit_transform(1 - correlations)

# Fit category index ~ (x, y) and rotate the embedding so that the horizontal
# axis is aligned (up to sign) with the gradient of that fit.
# NOTE(review): a zero second coefficient makes the ratio infinite/NaN.
reg = LinearRegression()
reg.fit(points, topic_main_category)
angle = np.arctan(reg.coef_[0] / reg.coef_[1]) - np.pi / 2
m = np.array(
    [
        [np.cos(angle), np.sin(angle)],
        [-np.sin(angle), np.cos(angle)],
    ]
)
points = points @ m

# ---------------------------------------------------------------------------
# Figure
# ---------------------------------------------------------------------------
fig, axes = plt.subplots(
    nrows=2,
    ncols=1,
    sharex=True,
    gridspec_kw={"height_ratios": [5, 1]},
)

for i, cat in enumerate(cats):
    in_category = topic_main_category == i
    axes[0].scatter(
        points[in_category, 0],
        points[in_category, 1],
        color=colors[i],
        label=cat,
        # One outline colour per plotted point, in the same order as the points.
        edgecolors=[edges[j] for j in np.flatnonzero(in_category)],
    )

texts = []
for i, topic in enumerate(susy_topics):
    texts.append(
        axes[0].annotate(
            labels[i],
            xy=(points[topic, 0], points[topic, 1]),
            xytext=(points[topic, 0], points[topic, 1] + 0.25),
            size="small",
        )
    )

# Nudge the annotations so that they do not overlap.
adjust_text(texts, ax=axes[0])

# Density of the horizontal coordinate, one curve per category.
sns.kdeplot(
    data=[points[topic_main_category == i, 0] for i in range(3)],
    ax=axes[1],
    legend=False,
)

plt.subplots_adjust(wspace=0, hspace=0)

# t-SNE coordinates have no intrinsic scale: hide the tick labels.
for ax in axes:
    ax.set_xticklabels([])
    ax.set_yticklabels([])

axes[0].set_ylabel("y-axis")
axes[1].set_xlabel("x-axis")
axes[1].set_ylabel("Density")
axes[0].legend()

fig.savefig("plots/topics_tsne.eps", bbox_inches="tight")
fig.savefig("plots/topics_tsne.png", bbox_inches="tight")
|