compare_corpora.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091
  1. import pandas as pd
  2. import numpy as np
  3. from os.path import join as opj
  4. corpora = {
  5. "output/etm_20_pretrained": "High-energy physics",
  6. "output/acl_2002_2022": "Computational Linguistics"
  7. }
  8. time_periods = {
  9. "output/etm_20_pretrained": ("2000--2009", "2015--2019"),
  10. "output/acl_2002_2022": ("2002--2011", "2012--2022")
  11. }
  12. author_disambiguation = {
  13. "output/etm_20_pretrained": "Yes",
  14. "output/acl_2002_2022": "No"
  15. }
  16. publication_requirement = {
  17. "output/etm_20_pretrained": 5,
  18. "output/acl_2002_2022": 3
  19. }
  20. df = []
  21. for corpus in corpora:
  22. N = len(pd.read_csv(opj(corpus, "aggregate.csv")))
  23. scores = np.load(opj(corpus, "scores.npy"))
  24. scores = scores.mean(axis=0)
  25. model = scores[1]
  26. baseline = scores[2]
  27. df.append({
  28. "Cohort": f"{corpora[corpus]}\\newline ($N={N}$)",
  29. "Model\\newline $\\mu(d_{{\\mathrm{{TV}}}}(\\bm{{y_{{a}}}}, \\bm{{y_{{a}}}}^{{\\text{{pred}}}}))$": f"\\textbf{{{model:.3f}}}" if model<baseline else f"{model:.3f}",
  30. "Baseline\\newline $\\mu(d_{{\\mathrm{{TV}}}}(\\bm{{y_{{a}}}}, \\bm{{x_{{a}}}}))$": f"\\textbf{{{baseline:.3f}}}" if baseline<model else f"{baseline:.3f}"
  31. })
  32. df = pd.DataFrame(df).set_index("Cohort").transpose()
  33. pd.set_option('display.max_colwidth', None)
  34. latex = df.to_latex(
  35. escape=False,
  36. multirow=True,
  37. column_format="|>{\\centering\\arraybackslash}m{4cm}|>{\\centering\\arraybackslash}m{5cm}|>{\\centering\\arraybackslash}m{5cm}|",
  38. caption="Performance of the actual model versus that of the baseline model, for i) the cohort of high-energy physicists and ii) a cohort from the ACL Anthology corpus of Computation Linguistics papers.",
  39. label="table:performance"
  40. )
  41. print(latex)
  42. with open(opj("output", "scores.tex"), "w+") as fp:
  43. fp.write(latex)
  44. df = []
  45. for corpus in corpora:
  46. N = len(pd.read_csv(opj(corpus, "aggregate.csv")))
  47. V = len(pd.read_csv(opj(corpus, "ngrams.csv")))
  48. D = len(pd.read_csv(opj(corpus, "articles.csv")))
  49. topics = pd.read_csv(opj(corpus, "topics.csv"))
  50. K = len(topics[~topics["label"].str.contains("Junk")])
  51. df.append({
  52. "Cohort": f"{corpora[corpus]}",
  53. "Cohort size ($N$)": f"${N:,}$".replace(",",r"\,"),
  54. "Number of abstracts ($D$)": f"${D:,}$".replace(",",r"\,"),
  55. "Vocabulary size ($V$)": f"${V:,}$".replace(",",r"\,"),
  56. "Valid topics ($K$)": K,
  57. "Initial time period": time_periods[corpus][0],
  58. "Late time period": time_periods[corpus][1],
  59. "Author disambiguation": author_disambiguation[corpus],
  60. "Minimum publications": f"${publication_requirement[corpus]}$"
  61. })
  62. df = pd.DataFrame(df).set_index("Cohort").transpose()
  63. latex = df.to_latex(
  64. escape=False,
  65. multirow=True,
  66. caption="Comparison of the high-energy physics corpus and the computational linguistics corpus.",
  67. label="table:corpora",
  68. column_format="|c|c|c|"
  69. )
  70. print(latex)
  71. with open(opj("output", "hep_vs_acl.tex"), "w+") as fp:
  72. fp.write(latex)