from AbstractSemantics.terms import TermExtractor
from AbstractSemantics.embeddings import GensimWord2Vec
from gensim.models import KeyedVectors
import nltk
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from os.path import join as opj
from os.path import exists
import itertools
from functools import partial
from collections import defaultdict
import re
import multiprocessing as mp
# from matplotlib import pyplot as plt
import argparse
import yaml
import sys
import pickle

from gensim.models.callbacks import CallbackAny2Vec

# Last observed training loss; updated by MonitorCallback after each epoch so
# that the dimension-scan loop below can record it.
model_loss = 0


class MonitorCallback(CallbackAny2Vec):
    """Log the word2vec training loss and the nearest neighbours of a few probe words after each epoch."""

    def __init__(self, test_words):
        self._test_words = test_words
        self.epoch = 0

    def on_epoch_end(self, model):
        loss = model.get_latest_training_loss()
        model.running_training_loss = 0.0  # reset so the next epoch reports its own loss
        print("Loss after epoch {}: {}".format(self.epoch, loss))
        global model_loss
        model_loss = loss
        self.epoch += 1
        for word in self._test_words:
            # Print the nearest neighbours of each probe word to track how the vectors evolve.
            if word in model.wv:  # guard against probe words pruned by min_count
                print(f"{word}: {model.wv.most_similar(word)}")


def filter_ngrams(l, wl):
    # Keep only the n-grams that belong to the whitelist wl.
    return [ngram for ngram in l if ngram in wl]


def construct_bow(l, n):
    # Bag-of-words representation of a token list: unique items and their counts.
    items = list(set(l))
    return np.array(items), np.array([l.count(i) for i in items])


def ngram_inclusion(i, js):
    # For each candidate n-gram j, check whether i is a token-aligned
    # ("_"-separated) sub-n-gram of j that is exactly one token shorter;
    # unigrams must additionally be the first or last token of j.
    return [
        j.find(i) >= 0  # fast substring pre-check
        and bool(re.search(rf"(^|_){re.escape(i)}($|_)", j))
        and (j.count("_") == i.count("_") + 1)
        and (
            (i.count("_") >= 1)
            or bool(re.search(rf"(^|_){re.escape(i)}$", j))
            or bool(re.search(rf"^{re.escape(i)}($|_)", j))
        )
        for j in js
    ]


# Illustration (made-up terms, not taken from the data):
#   ngram_inclusion("gauge_theory", ["gauge_theory_anomaly", "supergravity"]) -> [True, False]
#   ngram_inclusion("black", ["black_hole", "blackbody"]) -> [True, False]


if __name__ == "__main__":
    parser = argparse.ArgumentParser("CT Model")
    parser.add_argument("location", help="model directory")
    parser.add_argument(
        "filter", choices=["categories", "keywords", "no-filter"], help="filter type"
    )
    parser.add_argument("--values", nargs="+", default=[], help="filter allowed values")
    parser.add_argument("--dataset", default="inspire-harvest/database")

    # sample size
    parser.add_argument("--samples", type=int, default=50000)
    parser.add_argument("--constant-sampling", type=int, default=0)

    # text pre-processing
    parser.add_argument(
        "--add-title", default=False, action="store_true", help="include title"
    )
    parser.add_argument(
        "--remove-latex", default=False, action="store_true", help="remove latex"
    )
    parser.add_argument(
        "--lemmatize", default=False, action="store_true", help="lemmatize"
    )
    parser.add_argument(
        "--limit-redundancy",
        default=False,
        action="store_true",
        help="limit redundancy",
    )
    parser.add_argument("--blacklist", default=None, help="blacklist")

    # embeddings
    parser.add_argument("--dimensions", type=int, default=50)
    parser.add_argument("--pre-trained-embeddings", default=False, action="store_true")
    parser.add_argument("--use-saved-embeddings", default=False, action="store_true")

    # topic model parameters
    parser.add_argument("--topics", type=int, default=25)
    parser.add_argument("--min-df", type=float, default=0.001)
    parser.add_argument("--max-df", type=float, default=0.15)

    parser.add_argument("--threads", type=int, default=4)

    # Note: the argument list is hard-coded below and overrides anything passed
    # on the command line.
    args = parser.parse_args(
        [
            "output/etm_20_r",
            "categories",
            "--values",
            "Theory-HEP",
            "Phenomenology-HEP",
            "--dataset",
            "../inspire-harvest/database",
            "--constant-sampling",
            "30000",
            "--samples",
            "300000",
            "--threads",
            "24",
            "--add-title",
            "--remove-latex",
            "--dimensions",
            "50",
            "--topics",
            "25",
            "--min-df",
            "0.00075",
            "--lemmatize",
            "--pre-trained-embeddings",
            # "--limit-redundancy",
            "--use-saved-embeddings",
            # "--blacklist",
            # "output/medialab/blacklist",
        ]
    )

    # with open(opj(args.location, "params.yml"), "w+") as fp:
    #     yaml.dump(args, fp)

    # Load the articles and keep only the fields used below.
    articles = pd.read_parquet(opj(args.dataset, "articles.parquet"))[
        ["title", "abstract", "article_id", "date_created", "categories"]
    ]

    if args.add_title:
        articles["abstract"] = articles["abstract"].str.cat(articles["title"], sep=". ")

    articles.drop(columns=["title"], inplace=True)

    if args.remove_latex:
        # Strip inline math ($...$), LaTeX commands (\command), and any remaining
        # characters other than alphanumerics, hyphens, spaces, and periods.
        articles["abstract"] = articles["abstract"].apply(
            lambda s: re.sub(r"\$[^$]+\$", "", s)
        )
        articles["abstract"] = articles["abstract"].apply(
            lambda s: re.sub(r"\b\\\w+", "", s)
        )
        articles["abstract"] = articles["abstract"].apply(
            lambda s: re.sub(r"[^0-9a-zA-Z\- .]+", "", s)
        )
        # articles["abstract"] = articles["abstract"].str.replace("-", " ")  # NEW

    # Drop very short abstracts and lowercase the text.
    articles = articles[articles["abstract"].map(len) >= 100]
    articles["abstract"] = articles["abstract"].str.lower()

    # Derive the publication year (stored as an offset from 2000) and 5-year groups.
    articles = articles[articles["date_created"].str.len() >= 4]
    if "year" not in articles.columns:
        articles["year"] = articles["date_created"].str[:4].astype(int) - 2000
        articles = articles[(articles["year"] >= 0) & (articles["year"] <= 20)]
    else:
        articles["year"] = articles["year"].astype(int)
        articles = articles[articles["year"] >= 2000]
    articles["year_group"] = articles["year"] // 5

    keep = pd.Series([False] * len(articles), index=articles.index)

    print("Applying filter...")
    if args.filter == "keywords":
        for value in args.values:
            keep |= articles["abstract"].str.contains(value)
    elif args.filter == "categories":
        for value in args.values:
            keep |= articles["categories"].apply(lambda l: value in l)
    elif args.filter == "no-filter":
        keep |= True

    # Shuffle, optionally cap the number of articles per year, then subsample.
    articles = articles[keep].sample(frac=1)

    if args.constant_sampling > 0:
        articles = articles.groupby("year").head(args.constant_sampling)

    articles = articles.sample(frac=1).head(args.samples)
    articles.reset_index(inplace=True)

    print(articles)

    print("Extracting n-grams...")
    extractor = TermExtractor(
        articles["abstract"].tolist(),
        # limit_redundancy=args.limit_redundancy,
        patterns=[
            ["JJ.*"],
            ["NN.*"],
            ["JJ.*", "NN.*"],
            ["JJ.*", "NN.*", "NN.*"],
            # ["JJ.*", "NN", "CC", "NN.*"],
            # ["JJ.*", "NN.*", "JJ.*", "NN.*"],
            # ["RB.*", "JJ.*", "NN.*", "NN.*"],
        ],
    )
    ngrams = extractor.ngrams(
        threads=args.threads,
        lemmatize=args.lemmatize,
        lemmatize_ngrams=args.lemmatize,
        split_sentences=True,
    )

    del extractor
    del articles["abstract"]

    # Join each n-gram's tokens with "_" and normalise hyphens and stray dots.
    ngrams = map(
        lambda l: [
            [
                ("_".join(n))
                .strip()
                .replace("-", "_")
                .replace(".._", "")
                .replace("_..", "")
                for n in sent
            ]
            for sent in l
        ],
        ngrams,
    )
    ngrams = list(ngrams)

    print("Pre-training embeddings...")
    emb = GensimWord2Vec(
        [sentence for sentences in ngrams for sentence in sentences]
    )

    # Scan embedding dimensions, training several models per dimension, and
    # record the final training loss of each run.
    losses = []
    for dim in [10, 25, 50, 75, 100, 150, 200]:
        for attempt in np.arange(5):
            model = emb.create_model(
                vector_size=dim,
                window=5,
                workers=args.threads,
                compute_loss=True,
                epochs=20,
                min_count=30,
                sg=1,
                callbacks=[
                    MonitorCallback(
                        [
                            "black_hole",
                            "supersymmetry",
                        ]
                    )
                ],
            )
            print(model_loss)
            losses.append({"dim": dim, "loss": model_loss})

    print(losses)

    losses = pd.DataFrame(losses)
    losses.to_csv(opj(args.location, "word2vec_losses.csv"))
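
    # Optional follow-up (not part of the original script): persist the word
    # vectors of the last trained model so that later runs (e.g. with
    # "--use-saved-embeddings") can reload them instead of retraining. This
    # sketch assumes emb.create_model returns the underlying gensim Word2Vec
    # model; the file name "embeddings.kv" is also an assumption made here.
    if hasattr(model, "wv"):
        model.wv.save(opj(args.location, "embeddings.kv"))
        # Reload later with:
        # KeyedVectors.load(opj(args.location, "embeddings.kv"))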