category_prediction_longitudinal.py

from AbstractSemantics.terms import TermExtractor
import pandas as pd
import numpy as np
from os.path import join as opj
from os.path import exists
import itertools
from functools import partial
from collections import defaultdict
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from matplotlib import pyplot as plt
import argparse
import yaml
import sys
if __name__ == '__main__':
    parser = argparse.ArgumentParser('CT Model')
    parser.add_argument('location', help='model directory')
    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
    parser.add_argument('--samples', type=int, default=50000)
    parser.add_argument('--constant-sampling', type=int, default=0)
    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
    parser.add_argument('--lemmatize-ngrams', default=False, action="store_true", help="lemmatize n-grams")
    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
    parser.add_argument('--limit-redundancy', default=False, action="store_true", help="limit redundancy")
    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
    parser.add_argument('--min-df', type=int, default=0, help='min_df')
    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
    parser.add_argument('--threads', type=int, default=4)

    # Note: the arguments are hard-coded below (likely exported from a notebook),
    # which records the configuration of this run; replace the list with
    # parser.parse_args() to drive the script from the command line instead.
    args = parser.parse_args([
        "output/category_prediction_longitudinal", "categories",
        "--values", "Phenomenology-HEP", "Theory-HEP",
        "--samples", "400000",
        "--nouns", "--lemmatize", "--lemmatize-ngrams", "--remove-latex", "--add-title",
        "--top-unithood", "1000",
        "--threads", "16",
    ])
    # Persist the run parameters alongside the outputs.
    with open(opj(args.location, "params.yml"), "w+") as fp:
        yaml.dump(args, fp)
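    # The corpus comes from an INSPIRE harvest stored as parquet; only the fields
    # needed downstream (title, abstract, identifier, creation date, categories)
    # are loaded.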
    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]

    if args.add_title:
        articles["abstract"] = articles["abstract"].str.cat(articles["title"])
    articles.drop(columns=["title"], inplace=True)

    if args.remove_latex:
        # Strip inline LaTeX delimited by dollar signs (the dollars must be escaped in the regex).
        articles['abstract'] = articles['abstract'].apply(lambda s: re.sub(r'\$[^$]+\$', '', s))

    # Keep only articles with a reasonably long abstract and a usable creation date.
    articles = articles[articles["abstract"].map(len) >= 100]
    articles["abstract"] = articles["abstract"].str.lower()
    articles = articles[articles["date_created"].str.len() >= 4]

    # Years are counted from 1980 and binned into five-year groups (1980-2020).
    articles["year"] = articles["date_created"].str[:4].astype(int) - 1980
    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
    articles["year_group"] = articles["year"] // 5
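    # Article selection: either reuse a previously saved selection, or filter by
    # category/keyword, shuffle, and cap the sample at --samples articles.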
    if args.reuse_articles:
        used = pd.read_csv(opj(args.location, 'articles.csv'))
        articles = articles[articles["article_id"].isin(used["article_id"])]
    else:
        articles = articles[~articles["abstract"].isnull()]

        if args.constant_sampling > 0:
            articles = articles.groupby("year").head(args.constant_sampling)

        keep = pd.Series([False] * len(articles), index=articles.index)

        print("Applying filter...")
        if args.filter == 'keywords':
            for value in args.values:
                keep |= articles["abstract"].str.contains(value)
        elif args.filter == 'categories':
            for value in args.values:
                keep |= articles["categories"].apply(lambda l: value in l)
        else:
            # 'no-filter': keep every article.
            keep[:] = True

        articles = articles[keep]
        articles = articles.sample(frac=1).head(args.samples)
        articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))

    articles.reset_index(inplace=True)
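    # Candidate terms are extracted with AbstractSemantics' TermExtractor. Assuming
    # it behaves like a POS-pattern term extractor, extractor.ngrams() is expected
    # to return, for each abstract, a list of token tuples, which are joined into
    # space-separated strings below.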
    print("Extracting n-grams...")

    extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)

    # POS patterns: nouns and (optionally) adjectives are allowed inside candidate terms.
    if args.nouns:
        extractor.add_patterns([["NN.*"]])
    if args.adjectives:
        extractor.add_patterns([["^JJ$"]])

    ngrams = extractor.ngrams(threads=args.threads, lemmatize=args.lemmatize, lemmatize_ngrams=args.lemmatize_ngrams)
    ngrams = map(lambda l: [" ".join(n) for n in l], ngrams)
    ngrams = list(ngrams)

    articles["ngrams"] = ngrams

    print("n_articles:", len(articles))
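    # Vocabulary derivation: rank n-grams by a "unithood" score (document frequency
    # weighted by a log bonus for multi-word phrases) and keep the --top-unithood best.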
    print("Deriving vocabulary...")

    if not args.reuse_stored_vocabulary:
        # Document frequency of each n-gram (each n-gram is counted at most once per article).
        ngrams_occurrences = defaultdict(int)
        for ngrams in articles["ngrams"].tolist():
            _ngrams = set(ngrams)
            for ngram in _ngrams:
                ngrams_occurrences[ngram] += 1

        ngrams_occurrences = pd.DataFrame(
            {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
        )

        # Unithood score: document frequency times log(2 + number of spaces in the n-gram),
        # normalized by the corpus size.
        ngrams_occurrences["unithood"] = (
            np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
            * ngrams_occurrences["count"]
        )
        ngrams_occurrences["unithood"] /= len(articles)
        ngrams_occurrences.set_index("ngram", inplace=True)

        # Discard single-character terms.
        ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
        ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]

        top = ngrams_occurrences.sort_values("unithood", ascending=False).head(args.top_unithood)
        top.to_csv(opj(args.location, "ngrams.csv"))
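    # Vector representations: a binarized bag-of-words over the selected vocabulary,
    # re-weighted with TF-IDF, plus a multi-hot encoding of the two target categories.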
    selected_ngrams = pd.read_csv(opj(args.location, 'ngrams.csv'))['ngram'].tolist()
    vocabulary = {n: i for i, n in enumerate(selected_ngrams)}
    inv_vocabulary = {i: n for n, i in vocabulary.items()}

    # Bag-of-words representation restricted to the selected vocabulary.
    ngrams = articles["ngrams"].tolist()
    ngrams = [[ngram for ngram in _ngrams if ngram in vocabulary] for _ngrams in ngrams]
    bow = [[vocabulary[ngram] for ngram in _ngrams] for _ngrams in ngrams]
    bow = [[_ngrams.count(i) for i in range(len(selected_ngrams))] for _ngrams in bow]
    bow = np.array(bow)
    bow = (bow > 0) * 1  # binarize: destroy frequency information

    tfidf = TfidfTransformer()
    bow_tfidf = tfidf.fit_transform(bow).todense().tolist()
    articles["bow_tfidf"] = bow_tfidf

    # Binary indicator matrix for the two target categories.
    cat_classifier = MultiLabelBinarizer(sparse_output=False)
    articles["categories"] = articles["categories"].map(lambda l: list(set(l) & {"Phenomenology-HEP", "Theory-HEP"}))
    cats = cat_classifier.fit_transform(articles["categories"]).tolist()
    articles["cats"] = cats
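    # Longitudinal category prediction. Only the 500 highest-unithood terms are used
    # as features. MultiLabelBinarizer sorts its classes alphabetically, so (assuming
    # both categories occur in the sample) column 0 should correspond to
    # Phenomenology-HEP and column 1 to Theory-HEP; the category name is recovered
    # via inverse_transform below in any case.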
    vocab = 500  # number of top-ranked terms actually fed to the classifiers

    from sklearn.linear_model import LogisticRegression
    from sklearn.dummy import DummyClassifier
    from sklearn.metrics import f1_score

    results = []

    # For each five-year group, fit one logistic regression per category on the
    # articles from all *other* periods and record each term's coefficient.
    for year_group, _ in articles.groupby("year_group"):
        train = articles[articles["year_group"] != year_group]
        for i in range(2):
            fit = LogisticRegression(random_state=0, max_iter=200).fit(
                np.stack(train["bow_tfidf"].values)[:, 0:vocab],
                np.stack(train["cats"].values).astype(int)[:, i],
            )
            for j in range(vocab):
                results.append({
                    'year_group': year_group,
                    'term': inv_vocabulary[j],
                    'category': cat_classifier.inverse_transform(np.array([np.identity(2)[i, :]]))[0][0],
                    'coef': fit.coef_[0, j],
                    'rank': j,
                })
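    # De-duplication of nearly redundant terms: when two terms almost always co-occur
    # (normalized PMI >= 0.95) and one is a substring of the other (e.g. a bigram and
    # a unigram it contains), only the higher-ranked of the two is kept.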
    results = pd.DataFrame(results)
    results["drop"] = False

    # Pairwise normalized PMI between terms, estimated on the first 3000 articles:
    # num = p(x) p(y), den = p(x, y), and npmi = log(p(x) p(y)) / log(p(x, y)) - 1,
    # which equals log(p(x, y) / (p(x) p(y))) / (-log p(x, y)), the standard NPMI.
    bow = (bow >= 1).astype(int)
    num = np.outer(bow[:3000, :vocab].sum(axis=0), bow[:3000, :vocab].sum(axis=0)) / (3000 ** 2)
    den = np.tensordot(bow[:3000, :vocab], bow[:3000, :vocab], axes=([0], [0])) / 3000
    npmi = np.log(num) / np.log(den) - 1

    # Pairs of distinct terms that almost always co-occur.
    x, y = np.where(npmi - np.identity(vocab) >= 0.95)

    for k, _ in enumerate(x):
        i = x[k]
        j = y[k]

        a = inv_vocabulary[i]
        b = inv_vocabulary[j]

        # Only drop a term if one of the pair is a substring of the other;
        # the lower-ranked (larger index) term is the one flagged.
        if not (a in b or b in a):
            continue

        if i > j:
            results.loc[results['rank'] == i, 'drop'] = True
        else:
            results.loc[results['rank'] == j, 'drop'] = True

    results = results[results["drop"] == False]
    # Keep only terms made of letters, hyphens and spaces.
    results = results[results["term"].str.match(r"^[a-zA-Z\- ]*$")]
    results.sort_values(["year_group", "rank"]).to_csv(opj(args.location, "results.csv"))
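    # results.csv holds one row per (year_group, term, category) with the fitted
    # coefficient. A sketch of downstream use (assuming the output path above):
    #
    #   results = pd.read_csv("output/category_prediction_longitudinal/results.csv")
    #   top_theory = (results[results["category"] == "Theory-HEP"]
    #                 .sort_values("coef", ascending=False)
    #                 .groupby("year_group").head(10))
    #
    # would list, for each five-year period, the ten terms most predictive of Theory-HEP.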