# predictive_power.py
  1. import pandas as pd
  2. import numpy as np
  3. from os.path import join as opj
  4. locations = [
  5. "output/etm_20_r",
  6. "output/etm_20_r",
  7. "output/etm_20_r",
  8. "output/etm_20_r",
  9. "output/etm_20_r",
  10. "output/etm_15_pretrained",
  11. "output/acl_2002_2022",
  12. ]
  13. corpora = [
  14. "High-energy physics",
  15. "High-energy physics",
  16. "High-energy physics",
  17. "High-energy physics",
  18. "High-energy physics",
  19. "High-energy physics (K=15)",
  20. "ACL Anthology",
  21. ]
  22. time_periods = [
  23. ("2000--2009", "2015--2019"),
  24. ("2000--2004", "2005--2009"),
  25. ("2005--2009", "2010--2014"),
  26. ("2010--2014", "2015--2019"),
  27. ("2000--2009", "2010--2019"),
  28. ("2000--2009", "2015--2019"),
  29. ("2002--2011", "2012--2022")
  30. ]
  31. aggregate_suffix = [
  32. "",
  33. "_0_1",
  34. "_1_2",
  35. "_2_3",
  36. "_0-1_2-3",
  37. "",
  38. ""
  39. ]
  40. score_suffix = [
  41. "_default",
  42. "_0_1",
  43. "_1_2",
  44. "_2_3",
  45. "_0-1_2-3",
  46. "_default",
  47. ""
  48. ]
  49. df = []
  50. for i, corpus in enumerate(corpora):
  51. N = len(pd.read_csv(opj(locations[i], f"aggregate{aggregate_suffix[i]}.csv")))
  52. scores = np.load(opj(locations[i], f"scores{score_suffix[i]}.npy"))
  53. scores = scores.mean(axis=0)
  54. model = scores[1]
  55. baseline = scores[2]
  56. df.append({
  57. "Cohort": f"{corpus} (from {time_periods[i][0]} to {time_periods[i][1]})",
  58. "Model\\newline $\\mu(d_{{\\mathrm{{TV}}}}(\\bm{{y_{{a}}}}, \\bm{{y_{{a}}}}^{{\\text{{pred}}}}))$": f"\\textbf{{{model:.3f}}}" if model<baseline else f"{model:.3f}",
  59. "Baseline\\newline $\\mu(d_{{\\mathrm{{TV}}}}(\\bm{{y_{{a}}}}, \\bm{{x_{{a}}}}))$": f"\\textbf{{{baseline:.3f}}}" if baseline<model else f"{baseline:.3f}"
  60. })
  61. # df = pd.DataFrame(df).set_index("Cohort").transpose()
  62. df = pd.DataFrame(df).set_index("Cohort")
  63. pd.set_option('display.max_colwidth', None)
  64. latex = df.to_latex(
  65. escape=False,
  66. multirow=True,
  67. column_format="|>{\\centering\\arraybackslash}m{4cm}|>{\\centering\\arraybackslash}m{5cm}|>{\\centering\\arraybackslash}m{5cm}|",
  68. caption="Performance of the actual model versus that of the baseline model, for i) the cohort of high-energy physicists and ii) a cohort from the ACL Anthology corpus of Computation Linguistics papers.",
  69. label="table:performance"
  70. )
  71. print(latex)
  72. with open(opj("output", "scores.tex"), "w+") as fp:
  73. fp.write(latex)