Lucas Gautheron vor 1 Jahr
Ursprung
Commit
fcb0ca2727
3 geänderte Dateien mit 10213 neuen und 0 gelöschten Zeilen
  1. 10103 0
      plots/topics_tsne.eps
  2. 1 0
      plots/topics_tsne.png
  3. 109 0
      plots/tsne.py

Datei-Diff unterdrückt, da er zu groß ist
+ 10103 - 0
plots/topics_tsne.eps


+ 1 - 0
plots/topics_tsne.png

@@ -0,0 +1 @@
+/annex/objects/MD5E-s31392--94ed66da50d92c8d6d14e8a238ef7f88.png

+ 109 - 0
plots/tsne.py

@@ -0,0 +1,109 @@
+import pandas as pd 
+import numpy as np 
+import tomotopy as tp 
+
+import matplotlib
+from matplotlib import pyplot as plt 
+matplotlib.use("pgf")
+matplotlib.rcParams.update(
+    {
+        "pgf.texsystem": "xelatex",
+        "font.family": "serif",
+        "font.serif": "Times New Roman",
+        "text.usetex": True,
+        "pgf.rcfonts": False,
+    }
+)
+from adjustText import adjust_text
+
+import textwrap
+
+from sklearn.manifold import TSNE
+
+articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["article_id", "pacs_codes", "categories"]]
+articles["article_id"] = articles["article_id"].astype(int)
+
+topics = pd.read_parquet("output/hep-ct-75-0.1-0.001-130000-20/topics_0.parquet")
+topics["article_id"] = topics["article_id"].astype(int)
+topics["topics"] = topics["probs"]
+topics = topics.merge(articles, how="inner", left_on = "article_id", right_on = "article_id")
+topics["categories"] = topics["categories"].map(
+    lambda l: (
+        [x in l for x in ["Theory-HEP", "Phenomenology-HEP", "Experiment-HEP"]]
+    )
+)
+
+X = np.stack(topics["topics"].values)
+Y = np.stack(topics["categories"].values).astype(int)
+
+cat_topic_mean = np.zeros((Y.shape[1], X.shape[1]))
+for i in range(3):
+    cat_topic_mean[i] = X[Y[:,i]==1,:].mean(axis=0)
+
+topic_main_category = cat_topic_mean.argmax(axis=0).astype(int)
+
+usages = pd.read_csv('output/supersymmetry_usages.csv')
+usages = usages.groupby("term").agg(topic=("topic", lambda x: x.tolist()))
+susy_topics = usages.loc["supersymmetry"]["topic"] + [t for t in usages.loc["susy"]["topic"] if t not in usages.loc["supersymmetry"]["topic"]]
+
+descriptions = pd.read_csv("output/hep-ct-75-0.1-0.001-130000-20/descriptions.csv")
+labels =  descriptions.loc[susy_topics]["description"].tolist()
+
+edges = np.array([False]*len(topic_main_category))
+edges[susy_topics]=True
+edges = ["black" if edge else "none" for edge in edges]
+
+mdl = tp.CTModel.load("output/hep-ct-75-0.1-0.001-130000-20/model")
+correlations = mdl.get_correlations()
+
+
+colors=['#377eb8', '#ff7f00', '#4daf4a']
+cats=["Theory", "Phenomenology", "Experiment"]
+
+tsne = TSNE(n_components=2, metric="precomputed", random_state=714, perplexity=40)
+points = tsne.fit_transform(1-correlations)
+
+from sklearn.linear_model import LinearRegression
+reg = LinearRegression()
+reg.fit(points, topic_main_category)
+angle = np.arctan(reg.coef_[0]/reg.coef_[1])-np.pi/2
+m = np.array([[np.cos(angle), np.sin(angle)], [-np.sin(angle),np.cos(angle)]])
+points=points@m
+
+fig, axes = plt.subplots(nrows=2,ncols=1,sharex=True,gridspec_kw={"height_ratios": [5, 1]},figsize=[6.4,5])
+
+for i, cat in enumerate(cats):
+    axes[0].scatter(
+        points[topic_main_category==i,0],
+        points[topic_main_category==i,1],
+        color=colors[i],
+        label=cat,
+        edgecolors=[edges[i[0]] for i in np.argwhere(topic_main_category==i) if i!=np.nan]
+    )
+
+texts = []
+
+for i,topic in enumerate(susy_topics):
+    texts.append(
+        axes[0].annotate(
+            labels[i],
+            xy=(points[topic,0],points[topic,1]),
+            xytext=(points[topic,0],points[topic,1]+0.25),
+            size="small"
+        )
+    )
+
+adjust_text(texts,ax=axes[0])
+
+import seaborn as sns
+sns.kdeplot(data=[points[topic_main_category==i,0] for i in range(3)],ax=axes[1],legend=False)
+
+plt.subplots_adjust(wspace=0, hspace=0)
+
+for i in range(2):
+    axes[i].set_xticklabels([])
+    axes[i].set_yticklabels([])
+
+fig.legend()
+plt.savefig(f"plots/topics_tsne.eps", bboxes_inches="tight")
+plt.savefig(f"plots/topics_tsne.png", bboxes_inches="tight")