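"""category_prediction.py

Predicts the Inspire HEP categories of an article (Experiment-HEP, Phenomenology-HEP,
Theory-HEP) from its abstract, using a TF-IDF bag of extracted n-grams and one logistic
regression per category, then exports the vocabulary most specific to each category as
LaTeX tables together with a vocabulary-size vs. accuracy plot.
"""
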
from AbstractSemantics.terms import TermExtractor
import pandas as pd
import numpy as np
from os.path import join as opj
from os.path import exists
import itertools
from functools import partial
from collections import defaultdict
import re

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

import multiprocessing as mp

from matplotlib import pyplot as plt

import argparse
import yaml
import sys
if __name__ == '__main__':
    parser = argparse.ArgumentParser('CT Model')
    parser.add_argument('location', help='model directory')
    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
    parser.add_argument('--samples', type=int, default=50000)
    parser.add_argument('--constant-sampling', type=int, default=0)
    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
    parser.add_argument('--lemmatize-ngrams', default=False, action="store_true", help="lemmatize n-grams")
    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
    parser.add_argument('--limit-redundancy', default=False, action="store_true", help="limit redundancy")
    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
    parser.add_argument('--min-df', type=int, default=0, help='min_df')
    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
    parser.add_argument('--threads', type=int, default=4)
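    # NOTE: parse_args below receives a hard-coded argument list, so the actual command
    # line is bypassed for this particular run.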
    args = parser.parse_args(["output/category_prediction", "categories", "--values", "Experiment-HEP", "Phenomenology-HEP", "Theory-HEP", "--samples", "60000", "--nouns", "--lemmatize", "--lemmatize-ngrams", "--remove-latex", "--add-title", "--top-unithood", "1000", "--threads", "4"])

    with open(opj(args.location, "params.yml"), "w+") as fp:
        yaml.dump(args, fp)
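
    # Load the article metadata harvested from Inspire, keeping only the fields used below.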
    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]

    if args.add_title:
        articles["abstract"] = articles["abstract"].str.cat(articles["title"])

    articles.drop(columns=["title"], inplace=True)
    if args.remove_latex:
        # Strip inline LaTeX between dollar signs (the dollar signs must be escaped in the regex).
        articles['abstract'] = articles['abstract'].apply(lambda s: re.sub(r'\$[^>]+\$', '', s))
    articles = articles[articles["abstract"].map(len) >= 100]
    articles["abstract"] = articles["abstract"].str.lower()

    articles = articles[articles["date_created"].str.len() >= 4]
    articles["year"] = articles["date_created"].str[:4].astype(int) - 1980
    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
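
    # Either reuse a previously stored article selection, or apply the category/keyword
    # filter, shuffle, and keep at most `args.samples` articles.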
    if args.reuse_articles:
        used = pd.read_csv(opj(args.location, 'articles.csv'))
        articles = articles[articles["article_id"].isin(used["article_id"])]
    else:
        articles = articles[~articles["abstract"].isnull()]

        if args.constant_sampling > 0:
            articles = articles.groupby("year").head(args.constant_sampling)

        keep = pd.Series([False] * len(articles), index=articles.index)

        print("Applying filter...")

        if args.filter == 'keywords':
            for value in args.values:
                keep |= articles["abstract"].str.contains(value)
        elif args.filter == 'categories':
            for value in args.values:
                keep |= articles["categories"].apply(lambda l: value in l)

        articles = articles[keep == True]
        articles = articles.sample(frac=1).head(args.samples)
        articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))

    articles.reset_index(inplace=True)
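
    # Extract candidate terms (n-grams) from the abstracts. TermExtractor comes from the
    # local AbstractSemantics package; the patterns below appear to be Penn Treebank POS
    # tags (NN* for nouns, JJ for adjectives).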
    print("Extracting n-grams...")

    extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)

    if args.nouns:
        extractor.add_patterns([["NN.*"]])

    if args.adjectives:
        extractor.add_patterns([["^JJ$"]])

    ngrams = extractor.ngrams(threads=args.threads, lemmatize=args.lemmatize, lemmatize_ngrams=args.lemmatize_ngrams)
    ngrams = map(lambda l: [" ".join(n) for n in l], ngrams)
    ngrams = list(ngrams)

    articles["ngrams"] = ngrams
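
    # Derive the vocabulary: count the number of abstracts in which each n-gram occurs,
    # score each n-gram by "unithood" = log(2 + number of spaces) * document count / number
    # of articles (which favors longer n-grams), and keep the top `args.top_unithood` entries.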
    print("Deriving vocabulary...")

    if not args.reuse_stored_vocabulary:
        ngrams_occurrences = defaultdict(int)

        for ngrams in articles["ngrams"].tolist():
            _ngrams = set(ngrams)
            for ngram in _ngrams:
                ngrams_occurrences[ngram] += 1

        ngrams_occurrences = pd.DataFrame(
            {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
        )
        ngrams_occurrences["unithood"] = (
            np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
            * ngrams_occurrences["count"]
        )
        ngrams_occurrences["unithood"] /= len(articles)
        ngrams_occurrences.set_index("ngram", inplace=True)

        ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
        ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]

        top = ngrams_occurrences.sort_values("unithood", ascending=False).head(
            args.top_unithood
        )
        top.to_csv(opj(args.location, "ngrams.csv"))
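
    # Build dense bag-of-words count vectors over the selected vocabulary, then reweight
    # them with TF-IDF. The membership tests against `selected_ngrams` (a list) and the
    # per-document counts are each O(V), which is acceptable for a vocabulary of ~1000.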
    selected_ngrams = pd.read_csv(opj(args.location, 'ngrams.csv'))['ngram'].tolist()
    vocabulary = {
        n: i
        for i, n in enumerate(selected_ngrams)
    }

    ngrams = articles["ngrams"].tolist()
    ngrams = [[ngram for ngram in _ngrams if ngram in selected_ngrams] for _ngrams in ngrams]

    ngrams_bow = [[vocabulary[ngram] for ngram in _ngrams] for _ngrams in ngrams]
    ngrams_bow = [[_ngrams.count(i) for i in range(len(selected_ngrams))] for _ngrams in ngrams_bow]

    tfidf = TfidfTransformer()
    bow_tfidf = tfidf.fit_transform(ngrams_bow).todense().tolist()
    articles["bow_tfidf"] = bow_tfidf
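
    # Restrict categories to the three HEP labels of interest and binarize them into a
    # 3-column indicator matrix (one column per category).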
    cat_classifier = MultiLabelBinarizer(sparse_output=False)
    articles["categories"] = articles["categories"].map(lambda l: list(set(l) & {"Experiment-HEP", "Phenomenology-HEP", "Theory-HEP"}))
    cats = cat_classifier.fit_transform(articles["categories"]).tolist()
    articles["cats"] = cats
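
    # Hold out 50,000 articles for training; the remaining ones are used for validation.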
    training, validation = train_test_split(articles, train_size=50000)

    from sklearn.linear_model import LogisticRegression
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score

    dummies = dict()
    fit = dict()
    scores = dict()
    f1 = dict()
    dummies_scores = dict()
    dummies_f1 = dict()

    score_vs_vocab_size = []
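
    # Sweep the vocabulary size: for each size, fit one binary logistic regression per
    # category on the first `vocab` TF-IDF features and compare its accuracy and F1 score
    # against a most-frequent-class dummy baseline on the validation set.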
    for vocab in [50] + list(np.arange(125, 1000 + 125, 125)):
        score = 0

        for i in range(3):
            dummies[i] = DummyClassifier(strategy="most_frequent")
            dummies[i].fit(np.stack(training["bow_tfidf"].values), np.stack(training["cats"].values).astype(int)[:, i])

            fit[i] = LogisticRegression(random_state=0, max_iter=200).fit(np.stack(training["bow_tfidf"].values)[:, 0:vocab], np.stack(training["cats"].values).astype(int)[:, i])

            y_true = np.stack(validation["cats"].values).astype(int)[:, i]

            scores[i] = fit[i].score(np.stack(validation["bow_tfidf"].values)[:, 0:vocab], y_true)
            f1[i] = f1_score(y_true, fit[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab]))
            score += f1[i]

            dummies_scores[i] = dummies[i].score(np.stack(validation["bow_tfidf"].values), y_true)
            dummies_f1[i] = f1_score(y_true, dummies[i].predict(np.stack(validation["bow_tfidf"].values)))

        print(vocab, score)

        score_vs_vocab_size.append({
            'vocab': vocab,
            'f1': score,
            "acc_0": scores[0],
            "acc_1": scores[1],
            "acc_2": scores[2],
            "baseline_acc_0": dummies_scores[0],
            "baseline_acc_1": dummies_scores[1],
            "baseline_acc_2": dummies_scores[2],
            "f1_0": f1[0],
            "f1_1": f1[1],
            "f1_2": f1[2],
        })

    score_vs_vocab_size = pd.DataFrame(score_vs_vocab_size)
    score_vs_vocab_size.to_csv(opj(args.location, "vocab_performance.csv"))
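
    # Refit the final models with a fixed vocabulary of 500 n-grams and record the
    # logistic-regression coefficient of each term for each category.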
    vocab = 500

    scores = dict()
    f1 = dict()
    dummies_scores = dict()
    dummies_f1 = dict()

    frequency = np.stack(articles["cats"].values).sum(axis=0)

    results = []

    inv_vocabulary = {
        vocabulary[v]: v
        for v in vocabulary
    }

    for i in range(3):
        dummies[i] = DummyClassifier(strategy="most_frequent")
        dummies[i].fit(np.stack(training["bow_tfidf"].values), np.stack(training["cats"].values).astype(int)[:, i])

        fit[i] = LogisticRegression(random_state=0, max_iter=200).fit(np.stack(training["bow_tfidf"].values)[:, 0:vocab], np.stack(training["cats"].values).astype(int)[:, i])

        y_true = np.stack(validation["cats"].values).astype(int)[:, i]

        scores[i] = fit[i].score(np.stack(validation["bow_tfidf"].values)[:, 0:vocab], y_true)
        f1[i] = f1_score(y_true, fit[i].predict(np.stack(validation["bow_tfidf"].values)[:, 0:vocab]))

        dummies_scores[i] = dummies[i].score(np.stack(validation["bow_tfidf"].values), y_true)
        dummies_f1[i] = f1_score(y_true, dummies[i].predict(np.stack(validation["bow_tfidf"].values)))

        for j in range(vocab):
            results.append({
                'term': inv_vocabulary[j],
                'category': cat_classifier.inverse_transform(np.array([np.identity(3)[i, :]]))[0][0],
                'coef': fit[i].coef_[0, j]
            })

    results = pd.DataFrame(results).pivot(index="term", columns="category", values="coef")
    results["ph_minus_th"] = results["Phenomenology-HEP"] - results["Theory-HEP"]
    results["ph_minus_exp"] = results["Phenomenology-HEP"] - results["Experiment-HEP"]
    results.sort_values("ph_minus_th").to_csv(opj(args.location, "results.csv"))
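
    # Plot validation accuracy per category as a function of vocabulary size, with the
    # most-frequent-class baselines shown as dashed lines. The pgf backend is used so the
    # figure can be embedded in a LaTeX document.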
    import matplotlib
    from matplotlib import pyplot as plt

    matplotlib.use("pgf")
    matplotlib.rcParams.update(
        {
            "pgf.texsystem": "xelatex",
            "font.family": "serif",
            "font.serif": "Times New Roman",
            "text.usetex": True,
            "pgf.rcfonts": False,
        }
    )

    colors = ['#377eb8', '#ff7f00', '#4daf4a']

    for i in range(3):
        plt.plot(score_vs_vocab_size["vocab"], score_vs_vocab_size[f"acc_{i}"], color=colors[i], label=["Experiment", "Phenomenology", "Theory"][i])
        plt.plot(score_vs_vocab_size["vocab"], [dummies_scores[i]] * len(score_vs_vocab_size["vocab"]), color=colors[i], ls="--")

    plt.xlim(0, 500)
    plt.title("Predicting an article's categories from its abstract")
    plt.xlabel("Vocabulary size ($V$)")
    plt.ylabel("Accuracy")
    plt.legend()
    plt.savefig("plots/categories_bow_prediction.pdf")
    plt.savefig("plots/categories_bow_prediction.pgf")
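
    # Build a LaTeX table of the vocabulary that is most specific to phenomenology versus
    # each reference category (experiment, theory), based on the coefficient differences
    # computed above.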
    cats = {"exp": "Experiment-HEP", "th": "Theory-HEP"}
    cats_friendly = {"th": "Theory", "exp": "Experiment"}

    table = []

    for cat in cats:
        top = results[results["Phenomenology-HEP"] > 0].sort_values(f"ph_minus_{cat}", ascending=False).head(40).index.values
        bottom = results[results[cats[cat]] > 0].sort_values(f"ph_minus_{cat}", ascending=True).head(40).index.values

        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': "Vocabulary specific to phenomenology",
            'words': ", ".join(top)
        })
        table.append({
            'Reference category': cats_friendly[cat],
            'Relation to phenomenology': "Vocabulary specific to theory",
            'words': ", ".join(bottom)
        })

    table = pd.DataFrame(table)
    table = table.pivot(index="Reference category", columns="Relation to phenomenology", values="words")

    with pd.option_context("display.max_colwidth", None):
        latex = table.to_latex(
            longtable=True,
            multirow=True,
            multicolumn=True,
            bold_rows=True,
            header=True,
            index_names=False,
            column_format='p{3cm}|p{5cm}|p{5cm}',
            caption="Vocabulary specific to each category. The left column lists expressions that discriminate experiment and theory from phenomenology. The right column lists expressions that are the most specific to phenomenology and foreign to experiment and theory.",
            label="table:specific_pheno_vocabulary"
        )

    with open("tables/specific_vocabulary.tex", "w+") as fp:
        fp.write(latex)
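
    # Second table: phenomenology versus theory only, without the reference-category index.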
    table = []

    cat = "th"
    top = results[results["Phenomenology-HEP"] > 0].sort_values(f"ph_minus_{cat}", ascending=False).head(40).index.values
    bottom = results[results[cats[cat]] > 0].sort_values(f"ph_minus_{cat}", ascending=True).head(40).index.values

    table.append({
        'Reference category': cats_friendly[cat],
        'Relation to phenomenology': "Vocabulary specific to phenomenology",
        'words': ", ".join(top)
    })
    table.append({
        'Reference category': cats_friendly[cat],
        'Relation to phenomenology': "Vocabulary specific to theory",
        'words': ", ".join(bottom)
    })

    table = pd.DataFrame(table)
    table = table.pivot(index="Reference category", columns="Relation to phenomenology", values="words")

    with pd.option_context("display.max_colwidth", None):
        latex = table.to_latex(
            longtable=True,
            multirow=True,
            multicolumn=True,
            bold_rows=True,
            header=True,
            index_names=False,
            index=False,
            column_format='p{7cm}|p{7cm}',
            caption="Vocabulary specific to phenomenology (left column) versus theory (right column).",
            label="table:specific_pheno_vocabulary_th_ph"
        )

    with open("tables/specific_vocabulary_th_ph.tex", "w+") as fp:
        fp.write(latex)