# Repository: lucasgautheron/trading_zones_material

import textwrap

import numpy as np
import pandas as pd

# Name of the score being validated; it selects the input CSV below.
metric = "pmi"

# Score table (rows identified by a "description" column) and the PACS lookup
# table keyed by PACS code.
pmi = pd.read_csv(f"output/{metric}.csv")
pacs = pd.read_csv("inspire-harvest/database/pacs_codes.csv").set_index("code")

# Transposing puts the score table's column labels on the index so they can be
# inner-joined with the PACS codes; labels without a matching code are dropped.
# The join also appends the lookup table's columns, which become extra rows
# once the frame is transposed back.
pmi = pmi.set_index("description").transpose()
pmi = pmi.merge(pacs, how="inner", left_index=True, right_index=True).transpose()
pmi.index.name = "topic"
pmi = pmi.reset_index()

# Long format: one row per (topic, PACS code) pair.
pmi = pmi.melt(id_vars=["topic"], var_name="pacs", value_name="value")
# Discard the rows that originate from the lookup table's "description" column.
pmi = pmi.loc[pmi["topic"] != "description"]

# Attach the lookup table's columns (e.g. "description") to every pair.
pmi = pmi.merge(pacs, how="inner", left_on="pacs", right_index=True)

# Pairs ordered by topic, then from highest to lowest score within each topic.
ranked_by_topic = pmi.sort_values(["topic", "value"], ascending=[True, False])

# Ten highest-scoring PACS codes per topic, exported for inspection.
ranked_by_topic.groupby("topic").head(10).set_index("topic").to_csv(
    "output/validation.csv"
)

# Five highest-scoring PACS codes per topic, indexed by topic.
df1 = ranked_by_topic.groupby("topic").head(5).set_index("topic")

# Five highest-scoring topics per PACS code; rows are ordered by PACS
# description, then by decreasing score.
df2 = (
    pmi.sort_values(["description", "value"], ascending=[True, False])
    .groupby("pacs")
    .head(5)
    .set_index("pacs")
)
df2.to_csv("output/validation_by_pacs.csv")

def _topic_cell(text):
    """Return *text* wrapped at 15 characters inside a one-column tabular.

    The wrapped lines are joined with LaTeX row breaks so that a long topic
    label spans several lines within a single table cell.
    """
    wrapped = "\\\\ ".join(textwrap.wrap(text, width=15))
    return "\\begin{tabular}{l}" + wrapped + "\\end{tabular}"


# Work on a fresh frame with "topic" back as a regular column, rather than
# mutating the groupby-derived selection in place.
df1 = df1.reset_index()

# df1["description"] = df1["description"].apply(lambda s: "\\\\ ".join(textwrap.wrap(s, width=30)))
# df1["description"] = df1["description"].apply(lambda s: '\\begin{tabular}{@{}c@{}}' + s +'\\end{tabular}')

# to_latex() is called with escape=False below, so "&" (LaTeX's column
# separator) must be escaped by hand. regex=False keeps this a literal
# replacement regardless of the pandas version's default for `regex`.
df1["description"] = df1["description"].str.replace("&", "\\&", regex=False)

df1["topic"] = df1["topic"].apply(_topic_cell)

# Render scores with two decimals.
df1["value"] = df1["value"].apply("{:.2f}".format)

df1 = df1.rename(
    columns={
        "value": "pmi",
        "description": "PACS category",
    }
)

latex = df1.set_index(["topic", "PACS category"]).to_latex(
    columns=["pmi"],
    longtable=True,
    sparsify=True,
    multirow=True,
    multicolumn=True,
    position='H',
    column_format='p{0.25\\textwidth}|p{0.6\\textwidth}|p{0.15\\textwidth}',
    escape=False,
    caption="PACS categories most correlated to the topics derived with the unsupervised model. Correlation is measured as the mutual pointwise information (pmi).",
    label="table:full_topics_pacs_pmi"
)

# Write-only access is enough; a fixed encoding keeps the output identical
# across platforms even if a label contains non-ASCII characters.
with open("tables/topic_pacs_validation.tex", "w", encoding="utf-8") as fp:
    fp.write(latex)


# df2.reset_index(inplace = True)

# df2 = df2[df2["pacs"].isin(["11.30.Pb", "12.60.Jv", "14.80.Da", "14.80.Ly", "14.80.Nb", "04.65.+e"])]

# df2["topic"] = df2["topic"].str.replace("&", "\\&")

# df2["description"] = df2["description"].apply(lambda s: "\\\\ ".join(textwrap.wrap(s, width=15)))
# df2["description"] = df2["description"].apply(lambda s: '\\begin{tabular}{l}' + s +'\\end{tabular}')

# df2["value"] = df2["value"].apply(lambda x: f"{x:.2f}")

# df2.rename(columns = {
#     "value": "pmi",
#     "description": "Catégorie PACS"
# }, inplace = True)

# latex = df2.set_index(["Catégorie PACS", "topic"]).to_latex(
#     columns=["pmi"],
#     longtable=True,
#     sparsify=True,
#     multirow=True,
#     multicolumn=True,
#     position='H',
#     column_format='p{0.25\\textwidth}|p{0.6\\textwidth}|p{0.15\\textwidth}',
#     escape=False,
#     caption="Sujets les plus corrélés avec les catégories PACS supersymétriques. Le niveau de corrélation est estimé via l'information mutuelle ponctuelle (pmi).",
#     label="table:susy_pacs_pmi"
# ).replace("Continued on next page", "Suite page suivante")

# with open("analyses/susy_pacs_pmi.tex", "w+") as fp:
#     fp.write(latex)