# category_prediction.py
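# Predict HEP article categories (Experiment-HEP, Phenomenology-HEP, Theory-HEP)
# from INSPIRE abstracts: extract n-grams, build a TF-IDF bag-of-words
# representation, fit one logistic regression per category, and export the
# vocabulary that best discriminates phenomenology from theory and experiment.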

from AbstractSemantics.terms import TermExtractor
import pandas as pd
import numpy as np
from os.path import join as opj
from os.path import exists
import itertools
from functools import partial
from collections import defaultdict
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from matplotlib import pyplot as plt
import argparse
import yaml
import sys
if __name__ == '__main__':
    parser = argparse.ArgumentParser('CT Model')
    parser.add_argument('location', help='model directory')
    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
    parser.add_argument('--samples', type=int, default=50000)
    parser.add_argument('--constant-sampling', type=int, default=0)
    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
    parser.add_argument('--lemmatize-ngrams', default=False, action="store_true", help="lemmatize n-grams")
    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
    parser.add_argument('--limit-redundancy', default=False, action="store_true", help="limit redundancy")
    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
    parser.add_argument('--min-df', type=int, default=0, help='min_df')
    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
    parser.add_argument('--threads', type=int, default=4)
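    # NOTE: the arguments are hardcoded below for this particular run rather than
    # being read from sys.argv; the parsed configuration is saved to params.yml.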
    args = parser.parse_args(["output/category_prediction", "categories", "--values", "Experiment-HEP", "Phenomenology-HEP", "Theory-HEP", "--samples", "110000", "--nouns", "--lemmatize", "--lemmatize-ngrams", "--remove-latex", "--add-title", "--top-unithood", "1000", "--threads", "16"])

    with open(opj(args.location, "params.yml"), "w+") as fp:
        yaml.dump(args, fp)
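    # Load article metadata (title, abstract, identifier, creation date, categories)
    # from the inspire-harvest database and keep dated abstracts of at least 100 characters
    # published between 1980 and 2020.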
    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]

    if args.add_title:
        articles["abstract"] = articles["abstract"].str.cat(articles["title"])
        articles.drop(columns=["title"], inplace=True)

    if args.remove_latex:
        # Strip inline LaTeX material delimited by dollar signs
        articles['abstract'] = articles['abstract'].apply(lambda s: re.sub(r'\$[^>]+\$', '', s))

    articles = articles[articles["abstract"].map(len) >= 100]
    articles["abstract"] = articles["abstract"].str.lower()

    articles = articles[articles["date_created"].str.len() >= 4]
    articles["year"] = articles["date_created"].str[:4].astype(int) - 1980
    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
    articles["year_group"] = articles["year"] // 5
    if args.reuse_articles:
        used = pd.read_csv(opj(args.location, 'articles.csv'))
        articles = articles[articles["article_id"].isin(used["article_id"])]
    else:
        articles = articles[~articles["abstract"].isnull()]

        if args.constant_sampling > 0:
            articles = articles.groupby("year").head(args.constant_sampling)

        keep = pd.Series([False] * len(articles), index=articles.index)

        print("Applying filter...")
        if args.filter == 'keywords':
            for value in args.values:
                keep |= articles["abstract"].str.contains(value)
        elif args.filter == 'categories':
            for value in args.values:
                keep |= articles["categories"].apply(lambda l: value in l)

        articles = articles[keep == True]
        articles = articles.sample(frac=1).head(args.samples)
        articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))

    articles.reset_index(inplace=True)
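    # Extract candidate n-grams from the abstracts using the selected POS patterns
    # (nouns and/or adjectives), optionally lemmatizing tokens and n-grams.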
    print("Extracting n-grams...")

    extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)

    if args.nouns:
        extractor.add_patterns([["NN.*"]])

    if args.adjectives:
        extractor.add_patterns([["^JJ$"]])

    ngrams = extractor.ngrams(threads=args.threads, lemmatize=args.lemmatize, lemmatize_ngrams=args.lemmatize_ngrams)
    ngrams = map(lambda l: [" ".join(n) for n in l], ngrams)
    ngrams = list(ngrams)

    articles["ngrams"] = ngrams
    print("Deriving vocabulary...")

    if not args.reuse_stored_vocabulary:
        ngrams_occurrences = defaultdict(int)

        for ngrams in articles["ngrams"].tolist():
            _ngrams = set(ngrams)
            for ngram in _ngrams:
                ngrams_occurrences[ngram] += 1

        ngrams_occurrences = pd.DataFrame(
            {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
        )
        ngrams_occurrences["unithood"] = (
            np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
            * ngrams_occurrences["count"]
        )
        ngrams_occurrences["unithood"] /= len(articles)
        ngrams_occurrences.set_index("ngram", inplace=True)

        ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
        ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]

        top = ngrams_occurrences.sort_values("unithood", ascending=False).head(
            args.top_unithood
        )
        top.to_csv(opj(args.location, "ngrams.csv"))

    selected_ngrams = pd.read_csv(opj(args.location, 'ngrams.csv'))['ngram'].tolist()
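    # Build a bag-of-words count matrix over the selected vocabulary
    # and apply TF-IDF weighting.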
    vocabulary = {
        n: i
        for i, n in enumerate(selected_ngrams)
    }

    ngrams = articles["ngrams"].tolist()
    ngrams = [[ngram for ngram in _ngrams if ngram in vocabulary] for _ngrams in ngrams]

    bow = [[vocabulary[ngram] for ngram in _ngrams] for _ngrams in ngrams]
    bow = [[_ngrams.count(i) for i in range(len(selected_ngrams))] for _ngrams in bow]
    bow = np.array(bow)

    tfidf = TfidfTransformer()
    bow_tfidf = tfidf.fit_transform(bow).todense().tolist()
    articles["bow_tfidf"] = bow_tfidf
    cat_classifier = MultiLabelBinarizer(sparse_output=False)
    articles["categories"] = articles["categories"].map(lambda l: list(set(l) & {"Experiment-HEP", "Phenomenology-HEP", "Theory-HEP"}))
    cats = cat_classifier.fit_transform(articles["categories"]).tolist()
    articles["cats"] = cats

    training, validation = train_test_split(articles, train_size=100000/110000)

    from sklearn.linear_model import LogisticRegression
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score
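    # For each candidate vocabulary size, fit one binary logistic regression per category
    # on the top-ranked TF-IDF features and compare it with a most-frequent-class dummy
    # baseline; the F1 scores summed over the three categories are recorded as a
    # function of vocabulary size.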
    dummies = dict()
    fit = dict()
    scores = dict()
    f1 = dict()
    dummies_scores = dict()
    dummies_f1 = dict()

    score_vs_vocab_size = []

    for vocab in [50] + list(np.arange(125, 1000, 125)):
        score = 0
        for i in range(3):
            dummies[i] = DummyClassifier(strategy="most_frequent")
            dummies[i].fit(np.stack(training["bow_tfidf"].values), np.stack(training["cats"].values).astype(int)[:, i])

            fit[i] = LogisticRegression(random_state=0, max_iter=200).fit(np.stack(training["bow_tfidf"].values)[:, 0:vocab], np.stack(training["cats"].values).astype(int)[:, i])

            y_hat = np.stack(validation["cats"].values).astype(int)[:, i]

            scores[i] = fit[i].score(np.stack(validation["bow_tfidf"].values)[:, 0:vocab], y_hat)
            f1[i] = f1_score(y_hat, fit[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab]))
            score += f1[i]

            dummies_scores[i] = dummies[i].score(np.stack(validation["bow_tfidf"].values), y_hat)
            dummies_f1[i] = f1_score(y_hat, dummies[i].predict(np.stack(validation["bow_tfidf"].values)))

        print(vocab, score)

        score_vs_vocab_size.append({
            'vocab': vocab,
            'f1': score,
            "acc_0": scores[0],
            "acc_1": scores[1],
            "acc_2": scores[2],
            "baseline_acc_0": dummies_scores[0],
            "baseline_acc_1": dummies_scores[1],
            "baseline_acc_2": dummies_scores[2],
            "f1_0": f1[0],
            "f1_1": f1[1],
            "f1_2": f1[2],
        })

    score_vs_vocab_size = pd.DataFrame(score_vs_vocab_size)
    score_vs_vocab_size.to_csv(opj(args.location, "vocab_performance.csv"))
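    # Refit the per-category classifiers with a fixed vocabulary size of 500 terms,
    # record the regression coefficient of every term for every category, and report
    # accuracy per five-year period on the validation set.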
    vocab = 500

    scores = dict()
    f1 = dict()
    dummies_scores = dict()
    dummies_f1 = dict()

    frequency = np.stack(articles["cats"].values).sum(axis=0)

    results = []

    inv_vocabulary = {
        vocabulary[v]: v
        for v in vocabulary
    }

    for i in range(3):
        dummies[i] = DummyClassifier(strategy="most_frequent")
        dummies[i].fit(np.stack(training["bow_tfidf"].values), np.stack(training["cats"].values).astype(int)[:, i])

        fit[i] = LogisticRegression(random_state=0, max_iter=200).fit(np.stack(training["bow_tfidf"].values)[:, 0:vocab], np.stack(training["cats"].values).astype(int)[:, i])

        y_hat = np.stack(validation["cats"].values).astype(int)[:, i]

        scores[i] = fit[i].score(np.stack(validation["bow_tfidf"].values)[:, 0:vocab], y_hat)
        f1[i] = f1_score(y_hat, fit[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab]))
        dummies_scores[i] = dummies[i].score(np.stack(validation["bow_tfidf"].values), y_hat)
        dummies_f1[i] = f1_score(y_hat, dummies[i].predict(np.stack(validation["bow_tfidf"].values)))

        for j in range(vocab):
            results.append({
                'term': inv_vocabulary[j],
                'category': cat_classifier.inverse_transform(np.array([np.identity(3)[i, :]]))[0][0],
                'coef': fit[i].coef_[0, j],
                'rank': j
            })

        predictions = fit[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab])
        dummy_predictions = dummies[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab])

        validation[f"accurate_{i}"] = predictions == y_hat
        validation[f"dummy_accurate_{i}"] = dummy_predictions == y_hat
        validation[f"truth_{i}"] = y_hat

    validation.groupby("year_group").agg(
        accurate_0=("accurate_0", "mean"),
        accurate_1=("accurate_1", "mean"),
        accurate_2=("accurate_2", "mean"),
        dummy_accurate_0=("dummy_accurate_0", "mean"),
        dummy_accurate_1=("dummy_accurate_1", "mean"),
        dummy_accurate_2=("dummy_accurate_2", "mean"),
        truth_0=("truth_0", "sum"),
        truth_1=("truth_1", "sum"),
        truth_2=("truth_2", "sum"),
        count_0=("truth_0", "count"),
        count_1=("truth_1", "count"),
        count_2=("truth_2", "count"),
    ).to_csv(opj(args.location, "accuracy_per_period.csv"))
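    # Leave-one-period-out evaluation: for each five-year group, train on all other
    # periods and measure accuracy on the held-out period.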
    kfold = []
    for year_group, test in articles.groupby("year_group"):
        train = articles[articles["year_group"] != year_group]

        accurate = np.zeros(3)
        dummy_accurate = np.zeros(3)
        truth = np.zeros(3)
        count = np.zeros(3)

        for i in range(3):
            kfold_fit = LogisticRegression(random_state=0, max_iter=200).fit(np.stack(train["bow_tfidf"].values)[:, 0:vocab], np.stack(train["cats"].values).astype(int)[:, i])
            y_hat = np.stack(test["cats"].values).astype(int)[:, i]
            predictions = kfold_fit.predict(np.stack(test["bow_tfidf"].values)[:, 0:vocab])
            dummy_predictions = dummies[i].predict(np.stack(test["bow_tfidf"].values)[:, 0:vocab])

            accurate[i] = (predictions == y_hat).mean()
            dummy_accurate[i] = (dummy_predictions == y_hat).mean()
            truth[i] = y_hat.mean()
            count[i] = len(test)

        kfold.append({
            "year_group": year_group,
            "accurate_0": accurate[0],
            "accurate_1": accurate[1],
            "accurate_2": accurate[2],
            "dummy_accurate_0": dummy_accurate[0],
            "dummy_accurate_1": dummy_accurate[1],
            "dummy_accurate_2": dummy_accurate[2],
            "truth_0": truth[0],
            "truth_1": truth[1],
            "truth_2": truth[2],
            "count_0": count[0],
            "count_1": count[1],
            "count_2": count[2],
        })

    pd.DataFrame(kfold).to_csv(opj(args.location, "accuracy_per_period_kfold.csv"))
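    # Prune redundant terms: compute the normalized pointwise mutual information (NPMI)
    # between pairs of terms over the first 3000 documents (binary occurrences), and when
    # two strongly associated terms (NPMI >= 0.95) are substrings of one another, drop the
    # lower-ranked one.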
    results = pd.DataFrame(results)
    results["drop"] = False

    bow = (bow >= 1).astype(int)
    num = np.outer(bow[:3000, :vocab].sum(axis=0), bow[:3000, :vocab].sum(axis=0)) / (3000 ** 2)
    den = np.tensordot(bow[:3000, :vocab], bow[:3000, :vocab], axes=([0], [0])) / 3000
    npmi = np.log(num) / np.log(den) - 1

    x, y = np.where(npmi - np.identity(vocab) >= 0.95)

    for k, _ in enumerate(x):
        i = x[k]
        j = y[k]

        a = inv_vocabulary[i]
        b = inv_vocabulary[j]

        if not (a in b or b in a):
            continue

        if i > j:
            results.loc[results['rank'] == i, 'drop'] = True
        else:
            results.loc[results['rank'] == j, 'drop'] = True

    results = results[results["drop"] == False]
    # Keep only terms made of letters, hyphens and spaces
    results = results[results["term"].str.match(r"^[a-zA-Z\- ]*$")]

    results = results.pivot(index="term", columns="category", values="coef")
    results["ph_minus_th"] = results["Phenomenology-HEP"] - results["Theory-HEP"]
    results["ph_minus_exp"] = results["Phenomenology-HEP"] - results["Experiment-HEP"]
    results.sort_values("ph_minus_th").to_csv(opj(args.location, "results.csv"))
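    # Export LaTeX tables of the category-specific vocabulary
    # (rendered with the pgf backend and XeLaTeX).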
    import matplotlib
    from matplotlib import pyplot as plt
    matplotlib.use("pgf")
    matplotlib.rcParams.update(
        {
            "pgf.texsystem": "xelatex",
            "font.family": "serif",
            "font.serif": "Times New Roman",
            "text.usetex": True,
            "pgf.rcfonts": False,
        }
    )

    cats = {"exp": "Experiment-HEP", "th": "Theory-HEP"}
    cats_friendly = {"th": "Theory", "exp": "Experiment"}

    table = []
    for cat in cats:
        top = results[results["Phenomenology-HEP"] > 0].sort_values(f"ph_minus_{cat}", ascending=False).head(40).index.values
        bottom = results[results[cats[cat]] > 0].sort_values(f"ph_minus_{cat}", ascending=True).head(40).index.values

        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': "Vocabulary specific to phenomenology",
            'words': ", ".join(top)
        })
        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': "Vocabulary specific to theory",
            'words': ", ".join(bottom)
        })

    table = pd.DataFrame(table)
    table = table.pivot(index="Reference category", columns="Relation to phenomenology", values="words")

    with pd.option_context("display.max_colwidth", None):
        latex = table.to_latex(
            longtable=True,
            multirow=True,
            multicolumn=True,
            bold_rows=True,
            header=True,
            index_names=False,
            column_format='p{3cm}|p{5cm}|p{5cm}',
            caption="Vocabulary specific to each category. The left column lists expressions that discriminate experiment and theory from phenomenology. The right column lists expressions that are the most specific to phenomenology and foreign to experiment and theory.",
            label="table:specific_pheno_vocabulary"
        )

    with open("tables/specific_vocabulary.tex", "w+") as fp:
        fp.write(latex)

    for cat in ["th", "exp"]:
        table = []

        top = results[results["Phenomenology-HEP"] > 0].sort_values(f"ph_minus_{cat}", ascending=False).head(45).index.values
        bottom = results[results[cats[cat]] > 0].sort_values(f"ph_minus_{cat}", ascending=True).head(45).index.values

        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': "Vocabulary specific to phenomenology",
            'words': ", ".join(top)
        })
        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': f"Vocabulary specific to {cats_friendly[cat].lower()}",
            'words': ", ".join(bottom)
        })

        table = pd.DataFrame(table)
        table = table.pivot(index="Reference category", columns="Relation to phenomenology", values="words")

        caption = f"Vocabulary specific to phenomenology (left column) versus {cats_friendly[cat].lower()} (right column)."

        with pd.option_context("display.max_colwidth", None):
            latex = table.to_latex(
                longtable=True,
                multirow=True,
                multicolumn=True,
                bold_rows=True,
                header=True,
                index_names=False,
                index=False,
                column_format='p{7cm}|p{7cm}',
                caption=caption,
                label=f"table:specific_pheno_vocabulary_{cat}_ph",
                columns=["Vocabulary specific to phenomenology", f"Vocabulary specific to {cats_friendly[cat].lower()}"]
            )

        with open(f"tables/specific_vocabulary_{cat}_ph.tex", "w+") as fp:
            fp.write(latex)