@@ -0,0 +1,144 @@
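+# Train one word2vec embedding model per INSPIRE category on article
+# abstracts: select articles, tokenize abstracts into sentences, then fit
+# and save a gensim model for each category in --values.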
+from AbstractSemantics.terms import TermExtractor
+from AbstractSemantics.embeddings import GensimWord2Vec
+
+import pandas as pd
+
+from os.path import join as opj
+
+import re
+import argparse
+import yaml
+
+from gensim.models.callbacks import CallbackAny2Vec
+
+class MonitorCallback(CallbackAny2Vec):
+    """Print the per-epoch loss and the nearest neighbors of a few probe words."""
+
+    def __init__(self, test_words):
+        self._test_words = test_words
+        self.epoch = 0
+
+    def on_epoch_end(self, model):
+        # gensim reports a cumulative training loss, so print the per-epoch delta
+        loss = model.get_latest_training_loss()
+        if self.epoch == 0:
+            print('Loss after epoch {}: {}'.format(self.epoch, loss))
+        else:
+            print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))
+
+        self.epoch += 1
+        self.loss_previous_step = loss
+
+        # show how the neighborhoods of the probe words evolve during training
+        for word in self._test_words:
+            print(f"{word}: {model.wv.most_similar(word)}")
+
+if __name__ == '__main__':
+
+    parser = argparse.ArgumentParser('CT Model')
+    parser.add_argument('location', help='model directory')
+    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
+    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
+    parser.add_argument('--samples', type=int, default=50000)
+    parser.add_argument('--dimensions', type=int, default=64)
+    parser.add_argument('--constant-sampling', type=int, default=0)
+    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
+    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
+    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
+    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
+    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove latex")
+    parser.add_argument('--add-title', default=False, action="store_true", help="include title")
+    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
+    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
+    parser.add_argument('--min-df', type=int, default=0, help='min_df')
+    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
+    parser.add_argument('--threads', type=int, default=4)
+    # read the arguments from the command line rather than a hardcoded list, e.g.:
+    #   output/embeddings categories --values Phenomenology-HEP Theory-HEP --samples 150000 --threads 4
+    args = parser.parse_args()
+
+    # save the run parameters; dump a plain dict, as yaml cannot round-trip
+    # an argparse.Namespace
+    with open(opj(args.location, "params.yml"), "w+") as fp:
+        yaml.dump(vars(args), fp)
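+    # (params.yml can be reloaded later with yaml.safe_load to reproduce a run)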
+
+    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]
+    # drop articles without an abstract before any string processing
+    articles = articles[~articles["abstract"].isnull()]
+
+    if args.add_title:
+        articles["abstract"] = articles["abstract"].str.cat(articles["title"])
+
+    articles.drop(columns=["title"], inplace=True)
+
+    if args.remove_latex:
+        # strip inline math ($...$) from the abstracts
+        articles['abstract'] = articles['abstract'].apply(lambda s: re.sub(r'\$[^$]+\$', '', s))
+
+    # keep only reasonably long abstracts, lowercased
+    articles = articles[articles["abstract"].map(len) >= 100]
+    articles["abstract"] = articles["abstract"].str.lower()
+
+    # keep articles from 1980-2020 and bin them into 5-year groups
+    articles = articles[articles["date_created"].str.len() >= 4]
+    articles["year"] = articles["date_created"].str[:4].astype(int) - 1980
+    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
+    articles["year_group"] = articles["year"] // 5
+
+    if args.reuse_articles:
+        used = pd.read_csv(opj(args.location, 'articles.csv'))
+        articles = articles[articles["article_id"].isin(used["article_id"])]
+
+    if args.constant_sampling > 0:
+        # cap the number of articles kept per year
+        articles = articles.groupby("year").head(args.constant_sampling)
+
+    keep = pd.Series([False] * len(articles), index=articles.index)
+
+    print("Applying filter...")
+    if args.filter == 'keywords':
+        for value in args.values:
+            keep |= articles["abstract"].str.contains(value)
+    elif args.filter == 'categories':
+        for value in args.values:
+            keep |= articles["categories"].apply(lambda l: value in l)
+    else:
+        # 'no-filter': keep everything (otherwise the selection would be empty)
+        keep[:] = True
+
+    # shuffle, keep at most `samples` articles, and save the selection for --reuse-articles
+    articles = articles[keep]
+    articles = articles.sample(frac=1).head(args.samples)
+    articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))
+
+    articles.reset_index(inplace=True)
+
+ print("Extracting n-grams...")
|
|
|
+ extractor = TermExtractor(articles["abstract"].tolist())
|
|
|
+ sentences = extractor.tokens(threads=args.threads, lemmatize=True, split_sentences=True)
|
|
|
+
|
|
|
+ print(len(sentences))
|
|
|
+ print(sentences[0])
|
|
|
+ print(sentences[0][0])
|
|
|
+
|
|
|
+ articles["sentences"] = sentences
|
|
|
+
|
|
|
+    # train one embedding model per category
+    for category in args.values:
+        _articles = articles[articles.categories.map(lambda l: category in l)]
+        corpus = [sentence for sentences in _articles["sentences"].tolist() for sentence in sentences]
+
+        print(category, len(corpus))
+
+        emb = GensimWord2Vec(corpus)
+        model = emb.model(
+            vector_size=args.dimensions,
+            window=10,
+            workers=args.threads,
+            compute_loss=True,
+            epochs=50,
+            callbacks=[MonitorCallback(["quark", "gluino", "renormalization"])]
+        )
+        # model.build_vocab(corpus)
+        # train for 30 additional epochs in a single call; three separate
+        # 10-epoch calls would restart the learning-rate decay each time
+        model.train(corpus, epochs=30, total_examples=model.corpus_count)
+        model.save(opj(args.location, f"{category}.mdl"))
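+
+    # A saved model can be reloaded for inspection later. Minimal sketch,
+    # assuming emb.model() returns a standard gensim Word2Vec instance:
+    #
+    #   from gensim.models import Word2Vec
+    #   model = Word2Vec.load(opj(args.location, "Theory-HEP.mdl"))
+    #   print(model.wv.most_similar("quark"))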