
longitudinal fitscripts

Lucas Gautheron 1 year ago
parent
commit
3a3ed001bd
1 changed file with 219 additions and 0 deletions

+ 219 - 0
analyses/category_prediction_longitudinal.py

@@ -0,0 +1,219 @@
+from AbstractSemantics.terms import TermExtractor
+import pandas as pd
+import numpy as np
+from os.path import join as opj
+from os.path import exists
+
+import itertools
+from functools import partial
+from collections import defaultdict
+
+import re
+
+from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.model_selection import train_test_split
+
+import multiprocessing as mp
+
+from matplotlib import pyplot as plt
+
+import argparse
+import yaml
+import sys
+
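+# Longitudinal category prediction: for each 5-year period, fit a logistic regression
+# predicting arXiv categories (e.g. Phenomenology-HEP vs Theory-HEP) from abstract n-grams,
+# and store the per-term coefficients for each period.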
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser('CT Model')
+    parser.add_argument('location', help='model directory')
+    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
+    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
+    parser.add_argument('--samples', type=int, default=50000)
+    parser.add_argument('--constant-sampling', type=int, default=0)
+    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
+    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
+    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
+    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
+    parser.add_argument('--lemmatize-ngrams', default=False, action="store_true", help="lemmatize n-grams")
+    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
+    parser.add_argument('--limit-redundancy', default=False, action="store_true", help="limit redundancy")
+    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
+    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
+    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
+    parser.add_argument('--min-df', type=int, default=0, help='min_df')
+    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
+    parser.add_argument('--threads', type=int, default=4)
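+    # NOTE: the command-line arguments are overridden by this hardcoded configuration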
+    args = parser.parse_args(["output/category_prediction_longitudinal", "categories", "--values", "Phenomenology-HEP", "Theory-HEP", "--samples", "400000", "--nouns", "--lemmatize", "--lemmatize-ngrams", "--remove-latex", "--add-title", "--top-unithood", "1000", "--threads", "16"])
+
+    with open(opj(args.location, "params.yml"), "w+") as fp:
+        yaml.dump(args, fp)
+
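+    # load article metadata (title, abstract, identifier, creation date, categories) from the INSPIRE harvest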
+    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]
+
+    if args.add_title:
+        articles["abstract"] = articles["abstract"].str.cat(articles["title"])
+
+    articles.drop(columns = ["title"], inplace=True)
+
+    if args.remove_latex:
+        # strip inline LaTeX math ($...$) from the abstracts
+        articles['abstract'] = articles['abstract'].apply(lambda s: re.sub(r'\$[^$]+\$', '', s))
+
+    articles = articles[articles["abstract"].map(len)>=100]
+    articles["abstract"] = articles["abstract"].str.lower()
+
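+    # keep articles dated 1980-2020 and bin them into 5-year groups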
+    articles = articles[articles["date_created"].str.len() >= 4]
+    articles["year"] = articles["date_created"].str[:4].astype(int)-1980
+    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
+    articles["year_group"] = articles["year"]//5
+
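+    # either reuse a previously stored article selection, or filter and sample a fresh one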
+    if args.reuse_articles:
+        used = pd.read_csv(opj(args.location, 'articles.csv'))
+        articles = articles[articles["article_id"].isin(used["article_id"])]
+    else:
+        articles = articles[~articles["abstract"].isnull()]
+
+        if args.constant_sampling > 0:
+            articles = articles.groupby("year").head(args.constant_sampling)
+
+        keep = pd.Series([False]*len(articles), index=articles.index)
+
+        print("Applying filter...")
+        if args.filter == 'keywords':
+            for value in args.values:
+                keep |= articles["abstract"].str.contains(value)
+        elif args.filter == 'categories':
+            for value in args.values:
+                keep |= articles["categories"].apply(lambda l: value in l)
+
+        articles = articles[keep]
+        articles = articles.sample(frac=1).head(args.samples)
+        articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))
+
+    articles.reset_index(inplace = True)
+
+    print("Extracting n-grams...")
+    extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)
+
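+    # restrict term extraction to the requested part-of-speech patterns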
+    if args.nouns:
+        extractor.add_patterns([["NN.*"]])
+
+    if args.adjectives:
+        extractor.add_patterns([["^JJ$"]])
+
+    ngrams = extractor.ngrams(threads=args.threads, lemmatize=args.lemmatize, lemmatize_ngrams=args.lemmatize_ngrams)
+    ngrams = map(lambda l: [" ".join(n) for n in l], ngrams)
+    ngrams = list(ngrams)
+
+    articles["ngrams"] = ngrams
+
+    print("n_articles:", len(articles))
+
+    print("Deriving vocabulary...")
+    if not args.reuse_stored_vocabulary:
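+        # document frequency: number of articles in which each n-gram occurs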
+        ngrams_occurrences = defaultdict(int)
+
+        for ngrams in articles["ngrams"].tolist():
+            _ngrams = set(ngrams)
+            for ngram in _ngrams:
+                ngrams_occurrences[ngram] += 1
+
+        ngrams_occurrences = pd.DataFrame(
+            {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
+        )
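+        # unithood: document frequency weighted by log(2 + number of spaces),
+        # so that longer n-grams score higher; normalised by the number of articles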
+        ngrams_occurrences["unithood"] = (
+            np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
+            * ngrams_occurrences["count"]
+        )
+        ngrams_occurrences["unithood"] /= len(articles)
+        ngrams_occurrences.set_index("ngram", inplace=True)
+
+        ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
+        ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]
+
+        top = ngrams_occurrences.sort_values("unithood", ascending=False).head(
+            args.top_unithood
+        )
+
+        top.to_csv(opj(args.location, "ngrams.csv"))
+
+    
+    selected_ngrams = pd.read_csv(opj(args.location, 'ngrams.csv'))['ngram'].tolist()
+
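+    # map each selected n-gram to a column index, and back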
+    vocabulary = {
+        n: i
+        for i, n in enumerate(selected_ngrams)
+    }
+
+    inv_vocabulary = {
+        vocabulary[v]: v
+        for v in vocabulary
+    }
+
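+    # bag-of-words counts over the selected vocabulary, then binarised (presence/absence)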
+    ngrams = articles["ngrams"].tolist()
+    ngrams = [[ngram for ngram in _ngrams if ngram in selected_ngrams] for _ngrams in ngrams]
+
+    bow = [[vocabulary[ngram] for ngram in _ngrams] for _ngrams in ngrams]
+    bow = [[_ngrams.count(i) for i in range(len(selected_ngrams))] for _ngrams in bow]
+    bow = np.array(bow)
+    bow = (bow>0)*1 # destroy freq information
+
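+    # TF-IDF weighting of the binary bag-of-words matrix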
+    tfidf = TfidfTransformer()
+    bow_tfidf = tfidf.fit_transform(bow).todense().tolist()
+    articles["bow_tfidf"] = bow_tfidf
+
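+    # multi-hot encoding of the target categories; an article may belong to both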
+    cat_classifier = MultiLabelBinarizer(sparse_output=False)
+    articles["categories"] = articles["categories"].map(lambda l: list(set(l)&{"Phenomenology-HEP", "Theory-HEP"}))
+    cats = cat_classifier.fit_transform(articles["categories"]).tolist()
+    articles["cats"] = cats
+
+    vocab = 500 # number of top-ranked terms used as features
+
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.dummy import DummyClassifier
+    from sklearn.metrics import f1_score
+
+    results = []
+
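+    # for each 5-year group, fit one logistic regression per category on all *other* groups
+    # and record the coefficient of each of the first `vocab` terms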
+    for year_group, _ in articles.groupby("year_group"):
+        train = articles[articles["year_group"] != year_group]
+        
+        for i in range(2):
+            fit = LogisticRegression(random_state=0,max_iter=200).fit(np.stack(train["bow_tfidf"].values)[:,0:vocab], np.stack(train["cats"].values).astype(int)[:,i])
+
+            for j in range(vocab):
+                results.append({
+                    'year_group': year_group,
+                    'term': inv_vocabulary[j],
+                    'category': cat_classifier.inverse_transform(np.array([np.identity(2)[i,:]]))[0][0],
+                    'coef': fit.coef_[0,j],
+                    'rank': j
+                })
+
+
+    results = pd.DataFrame(results)
+    results["drop"] = False
+
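+    # normalised PMI between terms, estimated on the first 3000 articles:
+    # npmi(x, y) = log(p(x)p(y)) / log(p(x, y)) - 1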
+    bow = (bow>=1).astype(int)
+    num = np.outer(bow[:3000,:vocab].sum(axis=0),bow[:3000,:vocab].sum(axis=0))/(3000**2)
+    den = np.tensordot(bow[:3000,:vocab], bow[:3000,:vocab], axes=([0],[0]))/3000
+    npmi = np.log(num)/np.log(den)-1
+
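+    # among pairs of terms that almost always co-occur (npmi >= 0.95) and where one term
+    # is a substring of the other, flag the one with the larger rank index for removal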
+    x, y = np.where(npmi-np.identity(vocab)>=0.95)
+    for k,_ in enumerate(x):
+        i = x[k]
+        j = y[k]
+
+        a = inv_vocabulary[i]
+        b = inv_vocabulary[j]
+
+        if (not (a in b or b in a)):
+            continue
+
+        if i > j:
+            results.loc[results['rank'] == i, 'drop'] = True
+        else:
+            results.loc[results['rank'] == j, 'drop'] = True
+
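+    # discard flagged terms, keep only alphabetic terms (hyphens and spaces allowed),
+    # and store the per-period coefficients sorted by year group and rank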
+    results = results[results["drop"]==False]
+    results = results[results["term"].str.match(r"^[a-zA-Z\- ]*$")]
+    results.sort_values(["year_group", "rank"]).to_csv(opj(args.location, "results.csv"))