
longitudinal fitscripts

Lucas Gautheron 1 year ago
parent
commit
3a3ed001bd
1 changed file with 219 additions and 0 deletions

+ 219 - 0
analyses/category_prediction_longitudinal.py

@@ -0,0 +1,219 @@
+from AbstractSemantics.terms import TermExtractor
+import pandas as pd
+import numpy as np
+from os.path import join as opj
+from os.path import exists
+
+import itertools
+from functools import partial
+from collections import defaultdict
+
+import re
+
+from sklearn.preprocessing import MultiLabelBinarizer
+from sklearn.feature_extraction.text import TfidfTransformer
+from sklearn.model_selection import train_test_split
+
+import multiprocessing as mp
+
+from matplotlib import pyplot as plt
+
+import argparse
+import yaml
+import sys
+
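+# Longitudinal category prediction: for each 5-year period, fit a logistic regression
+# predicting arXiv categories (e.g. Phenomenology-HEP vs Theory-HEP) from abstract n-grams,
+# and store the per-term coefficients for each period.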
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser('CT Model')
+    parser.add_argument('location', help='model directory')
+    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
+    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
+    parser.add_argument('--samples', type=int, default=50000)
+    parser.add_argument('--constant-sampling', type=int, default=0)
+    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
+    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
+    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
+    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
+    parser.add_argument('--lemmatize-ngrams', default=False, action="store_true", help="lemmatize n-grams")
+    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
+    parser.add_argument('--limit-redundancy', default=False, action="store_true", help="limit redundancy")
+    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
+    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
+    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
+    parser.add_argument('--min-df', type=int, default=0, help='min_df')
+    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
+    parser.add_argument('--threads', type=int, default=4)
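+    # NOTE: the command-line arguments are overridden by this hardcoded configuration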
+    args = parser.parse_args(["output/category_prediction_longitudinal", "categories", "--values", "Phenomenology-HEP", "Theory-HEP", "--samples", "400000", "--nouns", "--lemmatize", "--lemmatize-ngrams", "--remove-latex", "--add-title", "--top-unithood", "1000", "--threads", "16"])
+
+    with open(opj(args.location, "params.yml"), "w+") as fp:
+        yaml.dump(args, fp)
+
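+    # load article metadata (title, abstract, identifier, creation date, categories) from the INSPIRE harvest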
+    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]
+
+    if args.add_title:
+        articles["abstract"] = articles["abstract"].str.cat(articles["title"])
+
+    articles.drop(columns = ["title"], inplace=True)
+
+    if args.remove_latex:
+        # strip inline LaTeX math ($...$) from the abstracts
+        articles['abstract'] = articles['abstract'].apply(lambda s: re.sub(r'\$[^$]+\$', '', s))
+
+    articles = articles[articles["abstract"].map(len)>=100]
+    articles["abstract"] = articles["abstract"].str.lower()
+
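+    # keep articles dated 1980-2020 and bin them into 5-year groups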
+    articles = articles[articles["date_created"].str.len() >= 4]
+    articles["year"] = articles["date_created"].str[:4].astype(int)-1980
+    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
+    articles["year_group"] = articles["year"]//5
+
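+    # either reuse a previously stored article selection, or filter and sample a fresh one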
+    if args.reuse_articles:
+        used = pd.read_csv(opj(args.location, 'articles.csv'))
+        articles = articles[articles["article_id"].isin(used["article_id"])]
+    else:
+        articles = articles[~articles["abstract"].isnull()]
+
+        if args.constant_sampling > 0:
+            articles = articles.groupby("year").head(args.constant_sampling)
+
+        keep = pd.Series([False]*len(articles), index=articles.index)
+
+        print("Applying filter...")
+        if args.filter == 'keywords':
+            for value in args.values:
+                keep |= articles["abstract"].str.contains(value)
+        elif args.filter == 'categories':
+            for value in args.values:
+                keep |= articles["categories"].apply(lambda l: value in l)
+
+        articles = articles[keep]
+        articles = articles.sample(frac=1).head(args.samples)
+        articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))
+
+    articles.reset_index(inplace = True)
+
+    print("Extracting n-grams...")
+    extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)
+
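+    # restrict term extraction to the requested part-of-speech patterns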
+    if args.nouns:
+        extractor.add_patterns([["NN.*"]])
+
+    if args.adjectives:
+        extractor.add_patterns([["^JJ$"]])
+
+    ngrams = extractor.ngrams(threads=args.threads, lemmatize=args.lemmatize, lemmatize_ngrams=args.lemmatize_ngrams)
+    ngrams = map(lambda l: [" ".join(n) for n in l], ngrams)
+    ngrams = list(ngrams)
+
+    articles["ngrams"] = ngrams
+
+    print("n_articles:", len(articles))
+
+    print("Deriving vocabulary...")
+    if not args.reuse_stored_vocabulary:
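+        # document frequency: number of articles in which each n-gram occurs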
+        ngrams_occurrences = defaultdict(int)
+
+        for ngrams in articles["ngrams"].tolist():
+            _ngrams = set(ngrams)
+            for ngram in _ngrams:
+                ngrams_occurrences[ngram] += 1
+
+        ngrams_occurrences = pd.DataFrame(
+            {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
+        )
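+        # unithood: document frequency weighted by log(2 + number of spaces),
+        # so that longer n-grams score higher; normalised by the number of articles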
+        ngrams_occurrences["unithood"] = (
+            np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
+            * ngrams_occurrences["count"]
+        )
+        ngrams_occurrences["unithood"] /= len(articles)
+        ngrams_occurrences.set_index("ngram", inplace=True)
+
+        ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
+        ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]
+
+        top = ngrams_occurrences.sort_values("unithood", ascending=False).head(
+            args.top_unithood
+        )
+
+        top.to_csv(opj(args.location, "ngrams.csv"))
+
+    
+    selected_ngrams = pd.read_csv(opj(args.location, 'ngrams.csv'))['ngram'].tolist()
+
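+    # map each selected n-gram to a column index, and back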
+    vocabulary = {
+        n: i
+        for i, n in enumerate(selected_ngrams)
+    }
+
+    inv_vocabulary = {
+        vocabulary[v]: v
+        for v in vocabulary
+    }
+
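+    # bag-of-words counts over the selected vocabulary, then binarised (presence/absence)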
+    ngrams = articles["ngrams"].tolist()
+    ngrams = [[ngram for ngram in _ngrams if ngram in selected_ngrams] for _ngrams in ngrams]
+
+    bow = [[vocabulary[ngram] for ngram in _ngrams] for _ngrams in ngrams]
+    bow = [[_ngrams.count(i) for i in range(len(selected_ngrams))] for _ngrams in bow]
+    bow = np.array(bow)
+    bow = (bow>0)*1 # destroy freq information
+
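+    # TF-IDF weighting of the binary bag-of-words matrix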
+    tfidf = TfidfTransformer()
+    bow_tfidf = tfidf.fit_transform(bow).todense().tolist()
+    articles["bow_tfidf"] = bow_tfidf
+
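+    # multi-hot encoding of the target categories; an article may belong to both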
+    cat_classifier = MultiLabelBinarizer(sparse_output=False)
+    articles["categories"] = articles["categories"].map(lambda l: list(set(l)&{"Phenomenology-HEP", "Theory-HEP"}))
+    cats = cat_classifier.fit_transform(articles["categories"]).tolist()
+    articles["cats"] = cats
+
+    vocab = 500 # number of top-ranked terms used as features
+
+    from sklearn.linear_model import LogisticRegression
+    from sklearn.dummy import DummyClassifier
+    from sklearn.metrics import f1_score
+
+    results = []
+
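+    # for each 5-year group, fit one logistic regression per category on all *other* groups
+    # and record the coefficient of each of the first `vocab` terms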
+    for year_group, _ in articles.groupby("year_group"):
+        train = articles[articles["year_group"] != year_group]
+        
+        for i in range(2):
+            fit = LogisticRegression(random_state=0,max_iter=200).fit(np.stack(train["bow_tfidf"].values)[:,0:vocab], np.stack(train["cats"].values).astype(int)[:,i])
+
+            for j in range(vocab):
+                results.append({
+                    'year_group': year_group,
+                    'term': inv_vocabulary[j],
+                    'category': cat_classifier.inverse_transform(np.array([np.identity(2)[i,:]]))[0][0],
+                    'coef': fit.coef_[0,j],
+                    'rank': j
+                })
+
+
+    results = pd.DataFrame(results)
+    results["drop"] = False
+
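+    # normalised PMI between terms, estimated on the first 3000 articles:
+    # npmi(x, y) = log(p(x)p(y)) / log(p(x, y)) - 1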
+    bow = (bow>=1).astype(int)
+    num = np.outer(bow[:3000,:vocab].sum(axis=0),bow[:3000,:vocab].sum(axis=0))/(3000**2)
+    den = np.tensordot(bow[:3000,:vocab], bow[:3000,:vocab], axes=([0],[0]))/3000
+    npmi = np.log(num)/np.log(den)-1
+
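+    # among pairs of terms that almost always co-occur (npmi >= 0.95) and where one term
+    # is a substring of the other, flag the one with the larger rank index for removal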
+    x, y = np.where(npmi-np.identity(vocab)>=0.95)
+    for k,_ in enumerate(x):
+        i = x[k]
+        j = y[k]
+
+        a = inv_vocabulary[i]
+        b = inv_vocabulary[j]
+
+        if (not (a in b or b in a)):
+            continue
+
+        if i > j:
+            results.loc[results['rank'] == i, 'drop'] = True
+        else:
+            results.loc[results['rank'] == j, 'drop'] = True
+
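+    # discard flagged terms, keep only alphabetic terms (hyphens and spaces allowed),
+    # and store the per-period coefficients sorted by year group and rank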
+    results = results[results["drop"]==False]
+    results = results[results["term"].str.match(r"^[a-zA-Z\- ]*$")]
+    results.sort_values(["year_group", "rank"]).to_csv(opj(args.location, "results.csv"))