# pacs_validation.py
  1. import pandas as pd
  2. import numpy as np
  3. metric = "pmi"
  4. pmi = pd.read_csv(f"output/{metric}.csv")
  5. pacs = pd.read_csv("inspire-harvest/database/pacs_codes.csv").set_index("code")
  6. pmi.set_index('description', inplace=True)
  7. pmi = pmi.transpose()
  8. pmi = pmi.merge(pacs, how="inner", left_index=True, right_index=True)
  9. pmi = pmi.transpose()
  10. pmi.index.name = "topic"
  11. pmi.reset_index(inplace=True)
  12. pmi = (pd.melt(pmi, id_vars=["topic"], value_name='value', var_name="pacs"))
  13. pmi = pmi[pmi["topic"] != "description"]
  14. pmi = pmi.merge(pacs, how="inner", left_on="pacs", right_index=True)
  15. df1 = pmi.sort_values(["topic", "value"], ascending=[True, False]).groupby("topic").head(10)
  16. df1.set_index("topic", inplace=True)
  17. df1.to_csv("output/validation.csv")
  18. df1 = pmi.sort_values(["topic", "value"], ascending=[True, False]).groupby("topic").head(5)
  19. df1.set_index("topic", inplace=True)
  20. df2 = pmi.sort_values(["description", "value"], ascending=[True, False]).groupby("pacs").head(5)
  21. df2.set_index("pacs", inplace=True)
  22. df2.to_csv("output/validation_by_pacs.csv")
  23. import textwrap
  24. df1.reset_index(inplace = True)
  25. # df1["description"] = df1["description"].apply(lambda s: "\\\\ ".join(textwrap.wrap(s, width=30)))
  26. # df1["description"] = df1["description"].apply(lambda s: '\\begin{tabular}{@{}c@{}}' + s +'\\end{tabular}')
  27. df1["description"] = df1["description"].str.replace("&", "\\&")
  28. df1["topic"] = df1["topic"].apply(lambda s: "\\\\ ".join(textwrap.wrap(s, width=15)))
  29. df1["topic"] = df1["topic"].apply(lambda s: '\\begin{tabular}{l}' + s +'\\end{tabular}')
  30. df1["value"] = df1["value"].apply(lambda x: f"{x:.2f}")
  31. df1.rename(columns = {
  32. "value": "pmi",
  33. "description": "PACS category"
  34. }, inplace = True)
  35. latex = df1.set_index(["topic", "PACS category"]).to_latex(
  36. columns=["pmi"],
  37. longtable=True,
  38. sparsify=True,
  39. multirow=True,
  40. multicolumn=True,
  41. position='H',
  42. column_format='p{0.25\\textwidth}|p{0.6\\textwidth}|p{0.15\\textwidth}',
  43. escape=False,
  44. caption="PACS categories most correlated to the topics derived with the unsupervised model. Correlation is measured as the mutual pointwise information (pmi).",
  45. label="table:full_topics_pacs_pmi"
  46. )
  47. with open("tables/topic_pacs_validation.tex", "w+") as fp:
  48. fp.write(latex)
  49. # df2.reset_index(inplace = True)
  50. # df2 = df2[df2["pacs"].isin(["11.30.Pb", "12.60.Jv", "14.80.Da", "14.80.Ly", "14.80.Nb", "04.65.+e"])]
  51. # df2["topic"] = df2["topic"].str.replace("&", "\\&")
  52. # df2["description"] = df2["description"].apply(lambda s: "\\\\ ".join(textwrap.wrap(s, width=15)))
  53. # df2["description"] = df2["description"].apply(lambda s: '\\begin{tabular}{l}' + s +'\\end{tabular}')
  54. # df2["value"] = df2["value"].apply(lambda x: f"{x:.2f}")
  55. # df2.rename(columns = {
  56. # "value": "pmi",
  57. # "description": "Catégorie PACS"
  58. # }, inplace = True)
  59. # latex = df2.set_index(["Catégorie PACS", "topic"]).to_latex(
  60. # columns=["pmi"],
  61. # longtable=True,
  62. # sparsify=True,
  63. # multirow=True,
  64. # multicolumn=True,
  65. # position='H',
  66. # column_format='p{0.25\\textwidth}|p{0.6\\textwidth}|p{0.15\\textwidth}',
  67. # escape=False,
  68. # caption="Sujets les plus corrélés avec les catégories PACS supersymétriques. Le niveau de corrélation est estimé via l'information mutuelle ponctuelle (pmi).",
  69. # label="table:susy_pacs_pmi"
  70. # ).replace("Continued on next page", "Suite page suivante")
  71. # with open("analyses/susy_pacs_pmi.tex", "w+") as fp:
  72. # fp.write(latex)