"""Longitudinal category-prediction model over INSPIRE-HEP abstracts.

Pipeline: select articles -> extract n-grams -> derive a unithood-ranked
vocabulary -> build a binary bag-of-words + TF-IDF -> fit one logistic
regression per category with leave-one-year-group-out, and export the
per-term coefficients to ``results.csv``.
"""

from AbstractSemantics.terms import TermExtractor
import pandas as pd
import numpy as np
from os.path import join as opj
from os.path import exists
import itertools
from functools import partial
from collections import defaultdict
import re
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
import multiprocessing as mp
from matplotlib import pyplot as plt
import argparse
import yaml
import sys

if __name__ == '__main__':
    parser = argparse.ArgumentParser('CT Model')
    parser.add_argument('location', help='model directory')
    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
    parser.add_argument('--samples', type=int, default=50000)
    parser.add_argument('--constant-sampling', type=int, default=0)
    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
    parser.add_argument('--lemmatize', default=False, action="store_true", help="stemmer")
    parser.add_argument('--lemmatize-ngrams', default=False, action="store_true", help="stemmer")
    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
    parser.add_argument('--limit-redundancy', default=False, action="store_true", help="limit redundancy")
    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
    parser.add_argument('--min-df', type=int, default=0, help='min_df')
    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
    parser.add_argument('--threads', type=int, default=4)
    # NOTE(review): arguments are pinned to this fixed configuration and the
    # real command line is IGNORED (likely a notebook-export leftover).
    # Switch to parser.parse_args() to restore normal CLI behaviour.
    args = parser.parse_args([
        "output/category_prediction_longitudinal",
        "categories",
        "--values", "Phenomenology-HEP", "Theory-HEP",
        "--samples", "400000",
        "--nouns",
        "--lemmatize",
        "--lemmatize-ngrams",
        "--remove-latex",
        "--add-title",
        "--top-unithood", "1000",
        "--threads", "16",
    ])

    # Persist the run parameters as a plain mapping. (The original dumped the
    # Namespace object itself, which yaml serializes as an opaque
    # !!python/object tag that cannot be safe-loaded back.)
    with open(opj(args.location, "params.yml"), "w+") as fp:
        yaml.dump(vars(args), fp)

    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[
        ["title", "abstract", "article_id", "date_created", "categories"]
    ]

    if args.add_title:
        # Fold the title into the abstract text so n-grams are extracted from both.
        articles["abstract"] = articles["abstract"].str.cat(articles["title"])
    articles.drop(columns=["title"], inplace=True)

    if args.remove_latex:
        # BUGFIX: the original pattern '$[^>]+$' used unescaped '$' (the
        # end-of-string anchor) and therefore never stripped anything; escape
        # the dollar signs to actually remove inline $...$ math.
        articles['abstract'] = articles['abstract'].apply(lambda s: re.sub(r'\$[^$]+\$', '', s))

    # Keep only usable abstracts and normalize case.
    articles = articles[articles["abstract"].map(len) >= 100]
    articles["abstract"] = articles["abstract"].str.lower()

    # Year bucketing: 1980..2020 mapped to 0..40, grouped in 5-year bins.
    articles = articles[articles["date_created"].str.len() >= 4]
    articles["year"] = articles["date_created"].str[:4].astype(int) - 1980
    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
    articles["year_group"] = articles["year"] // 5

    if args.reuse_articles:
        # Restore a previously stored article selection.
        used = pd.read_csv(opj(args.location, 'articles.csv'))
        articles = articles[articles["article_id"].isin(used["article_id"])]
    else:
        articles = articles[~articles["abstract"].isnull()]
        if args.constant_sampling > 0:
            # At most N articles per year, for a flat temporal distribution.
            articles = articles.groupby("year").head(args.constant_sampling)

    keep = pd.Series([False] * len(articles), index=articles.index)
    print("Applying filter...")
    if args.filter == 'keywords':
        for value in args.values:
            # NOTE(review): str.contains treats the value as a regex — fine
            # for plain keywords, surprising if a value contains regex chars.
            keep |= articles["abstract"].str.contains(value)
    elif args.filter == 'categories':
        for value in args.values:
            keep |= articles["categories"].apply(lambda l: value in l)
    articles = articles[keep]

    # Shuffle, then cap the sample size and store the selection.
    articles = articles.sample(frac=1).head(args.samples)
    articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))
    articles.reset_index(inplace=True)

    print("Extracting n-grams...")
    extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)
    if args.nouns:
        extractor.add_patterns([["NN.*"]])
    if args.adjectives:
        extractor.add_patterns([["^JJ$"]])

    ngrams = extractor.ngrams(
        threads=args.threads,
        lemmatize=args.lemmatize,
        lemmatize_ngrams=args.lemmatize_ngrams,
    )
    # Each extracted n-gram is a token tuple; join into a single string.
    articles["ngrams"] = [[" ".join(tokens) for tokens in doc] for doc in ngrams]

    print("n_articles:", len(articles))

    print("Deriving vocabulary...")
    if not args.reuse_stored_vocabulary:
        # Document frequency per n-gram (set() so each article counts once).
        ngrams_occurrences = defaultdict(int)
        for doc_ngrams in articles["ngrams"].tolist():
            for ngram in set(doc_ngrams):
                ngrams_occurrences[ngram] += 1

        ngrams_occurrences = pd.DataFrame(
            {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
        )
        # Unithood score: document frequency, weighted up for longer n-grams.
        ngrams_occurrences["unithood"] = (
            np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
            * ngrams_occurrences["count"]
        )
        ngrams_occurrences["unithood"] /= len(articles)
        ngrams_occurrences.set_index("ngram", inplace=True)

        # Drop single-character terms.
        ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
        ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]

        top = ngrams_occurrences.sort_values("unithood", ascending=False).head(
            args.top_unithood
        )
        top.to_csv(opj(args.location, "ngrams.csv"))

    selected_ngrams = pd.read_csv(opj(args.location, 'ngrams.csv'))['ngram'].tolist()
    vocabulary = {n: i for i, n in enumerate(selected_ngrams)}
    inv_vocabulary = {i: n for n, i in vocabulary.items()}

    # Binary presence matrix (articles x vocabulary). PERF: the original
    # filtered with `ngram in selected_ngrams` (an O(|vocab|) list scan per
    # n-gram) and then counted with list.count per vocabulary index; a dict
    # lookup + direct assignment is equivalent because the counts were
    # binarized with (bow > 0) before TF-IDF anyway.
    bow = np.zeros((len(articles), len(selected_ngrams)), dtype=int)
    for row, doc_ngrams in enumerate(articles["ngrams"].tolist()):
        for ngram in doc_ngrams:
            col = vocabulary.get(ngram)
            if col is not None:
                bow[row, col] = 1  # destroy freq information

    tfidf = TfidfTransformer()
    bow_tfidf = tfidf.fit_transform(bow).todense().tolist()
    articles["bow_tfidf"] = bow_tfidf

    # Binary indicator per category (restricted to the two HEP categories).
    cat_classifier = MultiLabelBinarizer(sparse_output=False)
    articles["categories"] = articles["categories"].map(
        lambda l: list(set(l) & {"Phenomenology-HEP", "Theory-HEP"})
    )
    articles["cats"] = cat_classifier.fit_transform(articles["categories"]).tolist()

    vocab = 500  # number of top-ranked n-grams actually fed to the classifier

    from sklearn.linear_model import LogisticRegression

    results = []
    for year_group, _ in articles.groupby("year_group"):
        # Leave-one-group-out: train on every OTHER 5-year group.
        train = articles[articles["year_group"] != year_group]
        X = np.stack(train["bow_tfidf"].values)[:, 0:vocab]
        Y = np.stack(train["cats"].values).astype(int)
        for i in range(2):
            fit = LogisticRegression(random_state=0, max_iter=200).fit(X, Y[:, i])
            category = cat_classifier.inverse_transform(np.array([np.identity(2)[i, :]]))[0][0]
            for j in range(vocab):
                results.append({
                    'year_group': year_group,
                    'term': inv_vocabulary[j],
                    'category': category,
                    'coef': fit.coef_[0, j],
                    'rank': j,
                })

    results = pd.DataFrame(results)
    results["drop"] = False

    # Redundancy removal: estimate NPMI on the first 3000 articles and, for
    # any near-perfectly co-occurring pair where one term is a substring of
    # the other (e.g. "gauge" / "gauge theory"), drop the higher-indexed one.
    bow = (bow >= 1).astype(int)  # already binary; kept as a cheap safeguard
    sub = bow[:3000, :vocab]
    marginals = sub.sum(axis=0) / 3000
    num = np.outer(marginals, marginals)                    # p(a) * p(b)
    den = np.tensordot(sub, sub, axes=([0], [0])) / 3000    # p(a, b)
    # NPMI identity: log(p(a)p(b)) / log(p(a,b)) - 1.
    npmi = np.log(num) / np.log(den) - 1

    x, y = np.where(npmi - np.identity(vocab) >= 0.95)
    for i, j in zip(x, y):
        a = inv_vocabulary[i]
        b = inv_vocabulary[j]
        if not (a in b or b in a):
            continue
        # Drop the higher-indexed (lower-unithood) member of the pair.
        results.loc[results['rank'] == max(i, j), 'drop'] = True

    results = results[results["drop"] == False]
    # Keep only purely alphabetic terms (spaces/dashes allowed).
    results = results[results["term"].str.match("^[a-zA-Z--- ]*$")]
    results.sort_values(["year_group", "rank"]).to_csv(opj(args.location, "results.csv"))