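# Predicts an article's HEP categories from the n-grams of its abstract:
# selects and filters Inspire articles, extracts candidate terms, builds a
# binary bag-of-words over the top-unithood n-grams, then fits one logistic
# regression per 5-year period and per category and exports the coefficients.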
from AbstractSemantics.terms import TermExtractor
import pandas as pd
import numpy as np
from os.path import join as opj
from collections import defaultdict
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
import argparse
import yaml
if __name__ == '__main__':
    parser = argparse.ArgumentParser('CT Model')
    parser.add_argument('location', help='model directory')
    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
    parser.add_argument('--samples', type=int, default=50000)
    parser.add_argument('--constant-sampling', type=int, default=0)
    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
    parser.add_argument('--lemmatize-ngrams', default=False, action="store_true", help="lemmatize n-grams")
    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
    parser.add_argument('--limit-redundancy', default=False, action="store_true", help="limit redundancy")
    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
    parser.add_argument('--min-df', type=int, default=0, help='minimum document frequency')
    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
    parser.add_argument('--threads', type=int, default=4)
    # Read arguments from the command line; the values below were previously
    # hard-coded here as parse_args([...]), e.g.:
    #   <script> output/category_prediction_longitudinal categories \
    #       --values Phenomenology-HEP Theory-HEP --samples 400000 --nouns \
    #       --lemmatize --lemmatize-ngrams --remove-latex --add-title \
    #       --top-unithood 1000 --threads 16
    args = parser.parse_args()
    with open(opj(args.location, "params.yml"), "w+") as fp:
        yaml.dump(vars(args), fp)  # dump a plain dict rather than the Namespace object
    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]
    # drop articles without an abstract before any string operation on it
    articles = articles[~articles["abstract"].isnull()]
    if args.add_title:
        articles["abstract"] = articles["abstract"].str.cat(articles["title"], sep=" ")  # separator avoids gluing words together
    articles.drop(columns=["title"], inplace=True)
    if args.remove_latex:
        # strip inline math between $...$; the original pattern '$[^>]+$' anchored
        # at the end of the string and never matched anything
        articles["abstract"] = articles["abstract"].apply(lambda s: re.sub(r"\$[^$]+\$", "", s))
- articles = articles[articles["abstract"].map(len)>=100]
- articles["abstract"] = articles["abstract"].str.lower()
- articles = articles[articles["date_created"].str.len() >= 4]
- articles["year"] = articles["date_created"].str[:4].astype(int)-1980
- articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
- articles["year_group"] = articles["year"]//5
    if args.reuse_articles:
        used = pd.read_csv(opj(args.location, "articles.csv"))
        articles = articles[articles["article_id"].isin(used["article_id"])]
    else:
        # fresh selection: sample, filter and store the article list
        if args.constant_sampling > 0:
            articles = articles.groupby("year").head(args.constant_sampling)
        print("Applying filter...")
        keep = pd.Series([False] * len(articles), index=articles.index)
        if args.filter == 'keywords':
            for value in args.values:
                keep |= articles["abstract"].str.contains(value, regex=False)
        elif args.filter == 'categories':
            for value in args.values:
                keep |= articles["categories"].apply(lambda l: value in l)
        else:
            # 'no-filter': keep everything (the original left `keep` all False
            # here, which would have discarded every article)
            keep[:] = True
        articles = articles[keep]

        articles = articles.sample(frac=1).head(args.samples)
        articles[["article_id"]].to_csv(opj(args.location, "articles.csv"))
    articles.reset_index(inplace=True)

    print("Extracting n-grams...")
    extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)
    if args.nouns:
        extractor.add_patterns([["NN.*"]])
    if args.adjectives:
        extractor.add_patterns([["^JJ$"]])
    ngrams = extractor.ngrams(
        threads=args.threads,
        lemmatize=args.lemmatize,
        lemmatize_ngrams=args.lemmatize_ngrams,
    )
    ngrams = [[" ".join(n) for n in l] for l in ngrams]
    articles["ngrams"] = ngrams
- print("n_articles:", len(articles))
- print("Deriving vocabulary...")
    if not args.reuse_stored_vocabulary:
        ngrams_occurrences = defaultdict(int)
        for _ngrams in articles["ngrams"]:
            # count each n-gram at most once per article (document frequency)
            for ngram in set(_ngrams):
                ngrams_occurrences[ngram] += 1
        ngrams_occurrences = pd.DataFrame(
            {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
        )
- ngrams_occurrences["unithood"] = (
- np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
- * ngrams_occurrences["count"]
- )
- ngrams_occurrences["unithood"] /= len(articles)
        ngrams_occurrences.set_index("ngram", inplace=True)
        ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
        ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]
        top = ngrams_occurrences.sort_values("unithood", ascending=False).head(args.top_unithood)
        top.to_csv(opj(args.location, "ngrams.csv"))

    selected_ngrams = pd.read_csv(opj(args.location, "ngrams.csv"))["ngram"].tolist()
    vocabulary = {n: i for i, n in enumerate(selected_ngrams)}
    inv_vocabulary = {i: n for n, i in vocabulary.items()}

    # Binary bag-of-words matrix over the selected vocabulary; frequency
    # information is deliberately discarded, as in the original code.
    bow = np.zeros((len(articles), len(selected_ngrams)), dtype=int)
    for row, _ngrams in enumerate(articles["ngrams"]):
        for ngram in _ngrams:
            if ngram in vocabulary:  # dict lookup instead of scanning the full list
                bow[row, vocabulary[ngram]] = 1
    tfidf = TfidfTransformer()
    bow_tfidf = tfidf.fit_transform(bow).todense().tolist()
    articles["bow_tfidf"] = bow_tfidf
    cat_classifier = MultiLabelBinarizer(sparse_output=False)
    # restrict labels to the filter values (originally hard-coded to
    # {"Phenomenology-HEP", "Theory-HEP"}); assumes the 'categories' filter
    articles["categories"] = articles["categories"].map(lambda l: list(set(l) & set(args.values)))
    articles["cats"] = cat_classifier.fit_transform(articles["categories"]).tolist()
    vocab = 500  # restrict the models to the 500 top-unithood n-grams

    results = []
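    # Leave-one-period-out: for each 5-year group, fit one logistic regression
    # per category on all *other* periods, and record every term's coefficient.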
    for year_group, _ in articles.groupby("year_group"):
        train = articles[articles["year_group"] != year_group]

        for i, category in enumerate(cat_classifier.classes_):
            fit = LogisticRegression(random_state=0, max_iter=200).fit(
                np.stack(train["bow_tfidf"].values)[:, 0:vocab],
                np.stack(train["cats"].values).astype(int)[:, i],
            )
            for j in range(vocab):
                results.append({
                    'year_group': year_group,
                    'term': inv_vocabulary[j],
                    'category': category,
                    'coef': fit.coef_[0, j],
                    'rank': j
                })
    results = pd.DataFrame(results)
    results["drop"] = False
- bow = (bow>=1).astype(int)
- num = np.outer(bow[:3000,:vocab].sum(axis=0),bow[:3000,:vocab].sum(axis=0))/(3000**2)
- den = np.tensordot(bow[:3000,:vocab], bow[:3000,:vocab], axes=([0],[0]))/3000
- npmi = np.log(num)/np.log(den)-1
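    # log(p(x)p(y)) / log(p(x,y)) - 1 == log(p(x,y) / (p(x)p(y))) / (-log p(x,y)),
    # i.e. normalized pointwise mutual information, which reaches 1 when the two
    # terms always co-occur.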
    # Among near-perfectly collocated pairs (NPMI >= 0.95) where one term is a
    # substring of the other, drop the lower-ranked (higher-index) duplicate.
    x, y = np.where(npmi - np.identity(vocab) >= 0.95)
    for i, j in zip(x, y):
        a = inv_vocabulary[i]
        b = inv_vocabulary[j]
        if not (a in b or b in a):
            continue
        results.loc[results['rank'] == max(i, j), 'drop'] = True
    results = results[results["drop"] == False]
- results = results[results["term"].str.match("^[a-zA-Z--- ]*$")]
- results.sort_values(["year_group", "rank"]).to_csv(opj(args.location, "results.csv"))
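    # The run leaves three artifacts in args.location: articles.csv (selected
    # article ids), ngrams.csv (vocabulary with unithood scores) and results.csv
    # (per-period, per-category coefficient of each term).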