# Source repository: lucasgautheron/trading_zones_material
import pandas as pd
import numpy as np 
import tomotopy as tp 

import matplotlib
from matplotlib import pyplot as plt 
# Select the pgf backend before any figure is created.
matplotlib.use("pgf")

# rcParams overrides for this script: XeLaTeX as the pgf TeX system, serif
# (Times New Roman) fonts, and TeX-rendered text.
_pgf_rc = {
    "pgf.texsystem": "xelatex",
    "font.family": "serif",
    "font.serif": "Times New Roman",
    "text.usetex": True,
    "pgf.rcfonts": False,
}
matplotlib.rcParams.update(_pgf_rc)
from adjustText import adjust_text

import textwrap

from sklearn.manifold import TSNE

# Article metadata restricted to the identifier and the two classification
# columns; the identifier is cast to int so it can be used as a merge key.
articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[
    ["article_id", "pacs_codes", "categories"]
].astype({"article_id": int})

# Per-article topic output of the model; "probs" is exposed under the name
# "topics", then the article metadata is attached (inner join on article_id).
topics = (
    pd.read_parquet("output/hep-ct-75-0.1-0.001-130000-20/topics_0.parquet")
    .assign(
        article_id=lambda frame: frame["article_id"].astype(int),
        topics=lambda frame: frame["probs"],
    )
    .merge(articles, how="inner", left_on="article_id", right_on="article_id")
)

# Replace each article's category collection by three membership flags, in the
# fixed order (Theory, Phenomenology, Experiment).
hep_categories = ["Theory-HEP", "Phenomenology-HEP", "Experiment-HEP"]
topics["categories"] = topics["categories"].map(
    lambda article_categories: [
        name in article_categories for name in hep_categories
    ]
)

# One row per article: X stacks the "topics" entries, Y the 0/1 category flags.
# (Presumably X holds per-article topic probabilities -- confirm against the
# model output.)
X = np.stack(topics["topics"].values)
Y = np.stack(topics["categories"].values).astype(int)

n_articles = X.shape[0]

# Product of the marginal topic and category frequencies, p(topic) * p(cat).
independent = np.outer(X.sum(axis=0), Y.sum(axis=0)) / (n_articles**2)
# Joint frequency p(topic, cat): contraction of X and Y over the article axis.
joint = np.tensordot(X, Y, axes=([0], [0])) / n_articles

# Normalised pointwise mutual information, written as
# log(p(topic) p(cat)) / log(p(topic, cat)) - 1.
npmi = np.log(independent) / np.log(joint) - 1

# For every topic, the index (0, 1 or 2) of the category with the highest NPMI.
topic_main_category = npmi.argmax(axis=1).astype(int)

# One row per term, holding the list of topic values recorded for that term.
usages = (
    pd.read_csv("output/supersymmetry_usages.csv")
    .groupby("term")
    .agg(topic=("topic", lambda values: values.tolist()))
)

# Topics listed for "supersymmetry", followed by the "susy" topics that are
# not already in that list.
supersymmetry_topics = usages.loc["supersymmetry"]["topic"]
susy_only_topics = [
    t for t in usages.loc["susy"]["topic"] if t not in supersymmetry_topics
]
susy_topics = supersymmetry_topics + susy_only_topics

# Descriptions of the selected topics, looked up by index label.
descriptions = pd.read_csv("output/hep-ct-75-0.1-0.001-130000-20/descriptions.csv")
labels = descriptions.loc[susy_topics, "description"].tolist()

# Marker edge colour per topic: "black" for the selected topics, "none" otherwise.
is_susy_topic = np.zeros(len(topic_main_category), dtype=bool)
is_susy_topic[susy_topics] = True
edges = np.where(is_susy_topic, "black", "none").tolist()

# Load the trained correlated topic model and read its topic correlations.
mdl = tp.CTModel.load("output/hep-ct-75-0.1-0.001-130000-20/model")
correlations = mdl.get_correlations()


# Plot colour and legend label for each category index (0, 1, 2).
colors = ['#377eb8', '#ff7f00', '#4daf4a']
cats = ["Theory", "Phenomenology", "Experiment"]

# perplexity = 40 better identifies the different lumps
# init="random" is set explicitly: with metric="precomputed" scikit-learn
# rejects the PCA initialisation, which is the default from version 1.2 on.
# "random" was the default in earlier releases.
tsne = TSNE(
    n_components=2,
    metric="precomputed",
    init="random",
    random_state=714,
    perplexity=40,
)
# 1 - correlation is passed to t-SNE as the precomputed dissimilarity matrix.
points = tsne.fit_transform(1 - correlations)

from sklearn.linear_model import LinearRegression
# Regress the main-category index on the 2-D embedding; the two coefficients
# give the direction along which the category index varies.
reg = LinearRegression()
reg.fit(points, topic_main_category)

# Rotation angle derived from the ratio of the two regression coefficients.
coef_x, coef_y = reg.coef_[0], reg.coef_[1]
angle = np.arctan(coef_x / coef_y) - np.pi / 2

# Apply the 2x2 rotation to every embedded point (row vectors, right-multiplied).
cos_a, sin_a = np.cos(angle), np.sin(angle)
rotation = np.array([[cos_a, sin_a], [-sin_a, cos_a]])
points = points @ rotation

# Two stacked panels sharing the x axis: the scatter plot (tall) on top of a
# short panel (height ratio 5:1).
fig, axes = plt.subplots(
    nrows=2, ncols=1, sharex=True, gridspec_kw={"height_ratios": [5, 1]}
)

# One scatter call per category so that each gets its own colour and legend
# entry. Edge colours are looked up per topic from `edges`.
for cat_index, cat in enumerate(cats):
    # Indices of the topics whose main category is this one. The previous
    # `!= np.nan` filter was always true and has been dropped.
    members = np.flatnonzero(topic_main_category == cat_index)
    axes[0].scatter(
        points[members, 0],
        points[members, 1],
        color=colors[cat_index],
        label=cat,
        edgecolors=[edges[topic] for topic in members],
    )

# Label each selected topic with its description, initially placed 0.25 above
# the corresponding point.
texts = [
    axes[0].annotate(
        description,
        xy=(x_pos, y_pos),
        xytext=(x_pos, y_pos + 0.25),
        size="small",
    )
    for description, (x_pos, y_pos) in zip(labels, points[susy_topics])
]

# Let adjustText reposition the labels on the scatter axes.
adjust_text(texts, ax=axes[0])

import seaborn as sns
# Kernel density estimate of the first embedding coordinate, one curve per
# main category, drawn in the lower panel.
x_by_category = [points[topic_main_category == k, 0] for k in range(3)]
sns.kdeplot(data=x_by_category, ax=axes[1], legend=False)

# Remove the spacing between the two panels.
plt.subplots_adjust(wspace=0, hspace=0)

# Hide tick labels on both panels.
for panel in axes:
    panel.set_xticklabels([])
    panel.set_yticklabels([])

# Upper panel: y label and category legend.
axes[0].set_ylabel("y-axis")
axes[0].legend()

# Lower panel: shared x label and density label.
axes[1].set_xlabel("x-axis")
axes[1].set_ylabel("Density")


# Write the figure to plots/ once per output format, with a tight bounding box.
for extension in ("eps", "png"):
    fig.savefig(f"plots/topics_tsne.{extension}", bbox_inches="tight")