12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091 |
- import pandas as pd
- import numpy as np
- from os.path import join as opj
# Output directories of the two corpora; each lookup table below is keyed by
# these paths, and the table-building code further down iterates over
# ``corpora``.
_HEP_DIR = "output/etm_20_pretrained"
_ACL_DIR = "output/acl_2002_2022"

# Display name of the cohort stored in each corpus directory.
corpora = {
    _HEP_DIR: "High-energy physics",
    _ACL_DIR: "Computational Linguistics",
}

# (initial period, late period) reported for each corpus.
time_periods = {
    _HEP_DIR: ("2000--2009", "2015--2019"),
    _ACL_DIR: ("2002--2011", "2012--2022"),
}

# "Yes"/"No" flag reported in the "Author disambiguation" row.
author_disambiguation = {
    _HEP_DIR: "Yes",
    _ACL_DIR: "No",
}

# Value reported in the "Minimum publications" row.
publication_requirement = {
    _HEP_DIR: 5,
    _ACL_DIR: 3,
}
# ---------------------------------------------------------------------------
# Table "table:performance": mean score of the model vs. the baseline for
# every corpus, written to output/scores.tex (and echoed to stdout).
# ---------------------------------------------------------------------------


def _format_score(value, other):
    """Return *value* with three decimals, in bold when strictly below *other*."""
    text = f"{value:.3f}"
    return f"\\textbf{{{text}}}" if value < other else text


# NOTE: these headers are plain (non-f) strings, so the doubled braces are
# emitted literally into the LaTeX source, where they act as redundant groups.
_MODEL_HEADER = "Model\\newline $\\mu(d_{{\\mathrm{{TV}}}}(\\bm{{y_{{a}}}}, \\bm{{y_{{a}}}}^{{\\text{{pred}}}}))$"
_BASELINE_HEADER = "Baseline\\newline $\\mu(d_{{\\mathrm{{TV}}}}(\\bm{{y_{{a}}}}, \\bm{{x_{{a}}}}))$"

rows = []
for corpus, cohort_name in corpora.items():
    # Cohort size is the number of rows in the corpus' aggregate table.
    N = len(pd.read_csv(opj(corpus, "aggregate.csv")))

    # Average the stored scores over the first axis.
    # NOTE(review): entries 1 and 2 are taken to be the model and the
    # baseline respectively -- confirm against the code that writes scores.npy.
    scores = np.load(opj(corpus, "scores.npy")).mean(axis=0)
    model = scores[1]
    baseline = scores[2]

    # The strictly lower of the two scores is set in bold; ties bold neither.
    rows.append({
        "Cohort": f"{cohort_name}\\newline ($N={N}$)",
        _MODEL_HEADER: _format_score(model, baseline),
        _BASELINE_HEADER: _format_score(baseline, model),
    })

# One column per cohort, one row per method.
df = pd.DataFrame(rows).set_index("Cohort").transpose()

# Lift the column-width display limit before rendering.
# NOTE(review): presumably this keeps long cells from being shortened in the
# LaTeX output on some pandas versions -- confirm for the version in use.
pd.set_option('display.max_colwidth', None)

latex = df.to_latex(
    escape=False,  # cells and headers already contain LaTeX markup
    multirow=True,
    column_format="|>{\\centering\\arraybackslash}m{4cm}|>{\\centering\\arraybackslash}m{5cm}|>{\\centering\\arraybackslash}m{5cm}|",
    caption="Performance of the actual model versus that of the baseline model, for i) the cohort of high-energy physicists and ii) a cohort from the ACL Anthology corpus of Computational Linguistics papers.",
    label="table:performance"
)
print(latex)

# The file is only written, so plain "w" suffices; the encoding is pinned so
# the output does not depend on the platform default.
with open(opj("output", "scores.tex"), "w", encoding="utf-8") as fp:
    fp.write(latex)
# ---------------------------------------------------------------------------
# Table "table:corpora": side-by-side description of the corpora, written to
# output/hep_vs_acl.tex (and echoed to stdout).
# ---------------------------------------------------------------------------


def _row_count(corpus_dir, filename):
    """Return the number of data rows of the CSV *filename* inside *corpus_dir*."""
    return len(pd.read_csv(opj(corpus_dir, filename)))


def _thousands(count):
    """Format *count* in math mode with LaTeX thin spaces between thousands groups."""
    return f"${count:,}$".replace(",", r"\,")


rows = []
for corpus, cohort_name in corpora.items():
    N = _row_count(corpus, "aggregate.csv")
    V = _row_count(corpus, "ngrams.csv")
    D = _row_count(corpus, "articles.csv")

    # Valid topics are those whose label does not contain "Junk".
    # NOTE(review): a missing label would put NaN into this mask -- confirm
    # that every row of topics.csv carries a label.
    topics = pd.read_csv(opj(corpus, "topics.csv"))
    K = len(topics[~topics["label"].str.contains("Junk")])

    initial_period, late_period = time_periods[corpus]
    rows.append({
        "Cohort": f"{cohort_name}",
        "Cohort size ($N$)": _thousands(N),
        "Number of abstracts ($D$)": _thousands(D),
        "Vocabulary size ($V$)": _thousands(V),
        "Valid topics ($K$)": K,
        "Initial time period": initial_period,
        "Late time period": late_period,
        "Author disambiguation": author_disambiguation[corpus],
        "Minimum publications": f"${publication_requirement[corpus]}$",
    })

# One column per cohort, one row per property.
df = pd.DataFrame(rows).set_index("Cohort").transpose()

latex = df.to_latex(
    escape=False,  # cells already contain LaTeX markup
    multirow=True,
    caption="Comparison of the high-energy physics corpus and the computational linguistics corpus.",
    label="table:corpora",
    column_format="|c|c|c|"
)
print(latex)

# The file is only written, so plain "w" suffices; the encoding is pinned so
# the output does not depend on the platform default.
with open(opj("output", "hep_vs_acl.tex"), "w", encoding="utf-8") as fp:
    fp.write(latex)
|