"""Train an embedded topic model (ETM) on article abstracts: filter and
pre-process the corpus, extract n-grams, optionally pre-train word2vec
embeddings, build a bag-of-words dataset, and fit the topic model."""

from AbstractSemantics.terms import TermExtractor
from AbstractSemantics.embeddings import GensimWord2Vec
from gensim.models import KeyedVectors
import nltk
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from os.path import join as opj
from os.path import exists
import itertools
from functools import partial
from collections import defaultdict
import re
import multiprocessing as mp

# from matplotlib import pyplot as plt

import argparse
import yaml
import sys
import pickle

from gensim.models.callbacks import CallbackAny2Vec


class MonitorCallback(CallbackAny2Vec):
    """Print the per-epoch word2vec loss and the nearest neighbours of a few probe words."""

    def __init__(self, test_words):
        self._test_words = test_words
        self.epoch = 0

    def on_epoch_end(self, model):
        # gensim reports a cumulative loss, so print the per-epoch delta
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print("Loss after epoch {}: {}".format(self.epoch, loss))
        else:
            print(
                "Loss after epoch {}: {}".format(
                    self.epoch, loss - self.loss_previous_step
                )
            )
        self.epoch += 1
        self.loss_previous_step = loss

        # track how the neighbourhoods of the probe words evolve during training
        for word in self._test_words:
            if word in model.wv:  # probe words may have been pruned by min_count
                print(f"{word}: {model.wv.most_similar(word)}")


def filter_ngrams(l, wl):
    """Keep only the n-grams that belong to the whitelist `wl`."""
    return [ngram for ngram in l if ngram in wl]


def construct_bow(l, n):
    """Turn a list of token ids into (unique ids, counts); `n` (vocabulary size) is unused."""
    items = list(set(l))
    return np.array(items), np.array([l.count(i) for i in items])


def ngram_inclusion(i, js):
    """For each candidate in `js`, check whether `i` occurs in it as a whole
    sub-n-gram with exactly one extra token."""
    return [
        j.find(i) >= 0  # matching
        and bool(re.search(f"(^|_){re.escape(i)}($|_)", j))
        and (j.count("_") == i.count("_") + 1)
        and (
            (i.count("_") >= 1)
            or bool(re.search(f"(^|_){re.escape(i)}$", j))
            or bool(re.search(f"^{re.escape(i)}($|_)", j))
        )
        for j in js
    ]


if __name__ == "__main__":
    parser = argparse.ArgumentParser("CT Model")
    parser.add_argument("location", help="model directory")
    parser.add_argument(
        "filter", choices=["categories", "keywords", "no-filter"], help="filter type"
    )
    parser.add_argument("--values", nargs="+", default=[], help="filter allowed values")
    parser.add_argument("--dataset", default="inspire-harvest/database")

    # sample size
    parser.add_argument("--samples", type=int, default=50000)
    parser.add_argument("--constant-sampling", type=int, default=0)

    # text pre-processing
    parser.add_argument(
        "--add-title", default=False, action="store_true", help="include title"
    )
    parser.add_argument(
        "--remove-latex", default=False, action="store_true", help="remove latex"
    )
    parser.add_argument(
        "--lemmatize", default=False, action="store_true", help="lemmatize"
    )
    parser.add_argument(
        "--limit-redundancy",
        default=False,
        action="store_true",
        help="limit redundancy",
    )
    parser.add_argument("--blacklist", default=None, help="blacklist")

    # embeddings
    parser.add_argument("--dimensions", type=int, default=50)
    parser.add_argument("--pre-trained-embeddings", default=False, action="store_true")
    parser.add_argument("--use-saved-embeddings", default=False, action="store_true")

    # topic model parameters
    parser.add_argument("--topics", type=int, default=25)
    parser.add_argument("--min-df", type=float, default=0.001)
    parser.add_argument("--max-df", type=float, default=0.15)
    parser.add_argument("--threads", type=int, default=4)

    # hard-coded arguments for the ACL 2002-2022 run; command-line arguments are ignored
    args = parser.parse_args(
        [
            "output/acl_2002_2022",
            "no-filter",
            "--dataset",
            "../acl",
            "--constant-sampling",
            "12000",
            "--samples",
            "300000",
            "--threads",
            "30",
            "--add-title",
            "--remove-latex",
            "--dimensions",
            "50",
            "--topics",
            "20",
            "--min-df",
            "0.00075",
            "--lemmatize",
            "--pre-trained-embeddings",
            # "--limit-redundancy"
            "--use-saved-embeddings",
            # "--blacklist",
            # "output/medialab/blacklist",
        ]
    )

    with open(opj(args.location, "params.yml"), "w+") as fp:
        yaml.dump(args, fp)

    # load the articles and keep only the fields used below
    articles = pd.read_parquet(
        opj(args.dataset,
"articles.parquet") )[["title", "abstract", "article_id", "date_created", "categories"]] if args.add_title: articles["abstract"] = articles["abstract"].str.cat(articles["title"], sep=". ") articles.drop(columns=["title"], inplace=True) if args.remove_latex: articles["abstract"] = articles["abstract"].apply( lambda s: re.sub("$[^>]+$", "", s) ) articles["abstract"] = articles["abstract"].apply( lambda s: re.sub(r"\b\\\w+", "", s) ) articles["abstract"] = articles["abstract"].apply( lambda s: re.sub("[^0-9a-zA-Z--- -\.]+", "", s) ) # articles["abstract"] = articles["abstract"].str.replace("-", " ") # NEW articles = articles[articles["abstract"].map(len) >= 100] articles["abstract"] = articles["abstract"].str.lower() articles = articles[articles["date_created"].str.len() >= 4] if "year" not in articles.columns: articles["year"] = articles["date_created"].str[:4].astype(int) - 2000 articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)] else: articles["year"] = articles["year"].astype(int) articles = articles[articles["year"]>=2002] articles["year_group"] = articles["year"] // 5 keep = pd.Series([False] * len(articles), index=articles.index) print("Applying filter...") if args.filter == "keywords": for value in args.values: keep |= articles["abstract"].str.contains(value) elif args.filter == "categories": for value in args.values: keep |= articles["categories"].apply(lambda l: value in l) elif args.filter == "no-filter": keep |= True articles = articles[keep == True].sample(frac=1) if args.constant_sampling > 0: articles = articles.groupby("year").head(args.constant_sampling) articles = articles.sample(frac=1).head(args.samples) articles.reset_index(inplace=True) articles[["article_id"]].to_csv(opj(args.location, "articles.csv")) print(articles) print("Extracting n-grams...") extractor = TermExtractor( articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy, patterns=[ ["JJ.*"], ["NN.*"], ["JJ.*", "NN.*"], ["JJ.*", "NN.*", "NN.*"], # ["JJ.*", "NN", "CC", "NN.*"], # ["JJ.*", "NN.*", "JJ.*", "NN.*"], # ["RB.*", "JJ.*", "NN.*", "NN.*"], ], ) ngrams = extractor.ngrams( threads=args.threads, lemmatize=args.lemmatize, lemmatize_ngrams=args.lemmatize, split_sentences=args.pre_trained_embeddings and not args.use_saved_embeddings, ) del extractor del articles["abstract"] if args.pre_trained_embeddings and not args.use_saved_embeddings: ngrams = map( lambda l: [ [ ("_".join(n)) .strip() .replace("-", "_") .replace(".._", "") .replace("_..", "") for n in sent ] for sent in l ], ngrams, ) ngrams = list(ngrams) print("Pre-training embeddings...") emb = GensimWord2Vec( [sentence for sentences in ngrams for sentence in sentences] ) model = emb.create_model( vector_size=args.dimensions, window=5, workers=args.threads, compute_loss=True, # epochs=90, # min_count=30, epochs=80, min_count=15, sg=1, callbacks=[ MonitorCallback( [ "transformer", "embedding", "syntax", "grammar" ] ) ], ) model.wv.save_word2vec_format(opj(args.location, "embeddings.bin"), binary=True) del model ngrams = [ list(itertools.chain.from_iterable(article_sentences)) for article_sentences in ngrams ] else: ngrams = map( lambda l: [ "_".join(n) .strip() .replace("-", "_") .replace(".._", "") .replace("_..", "") for n in l ], ngrams, ) ngrams = list(ngrams) print("Deriving vocabulary...") voc = defaultdict(int) for article_ngrams in ngrams: _ngrams = set(article_ngrams) for ngram in _ngrams: voc[ngram] += 1 voc = pd.DataFrame({"ngram": voc.keys(), "count": voc.values()}) voc["df"] = voc["count"] / len(articles) 
voc.set_index("ngram", inplace=True) if args.min_df < 1: voc = voc[voc["df"] >= args.min_df] else: voc = voc[voc["count"] >= args.min_df] if args.max_df < 1: voc = voc[voc["df"] <= args.max_df] else: voc = voc[voc["count"] <= args.max_df] voc["len"] = voc.index.map(len) voc = voc[voc["len"] >= 2] stop_words = nltk.corpus.stopwords.words("english") voc = voc[~voc.index.isin(stop_words)] if args.blacklist is not None: print("Filtering black-listed keywords...") blacklist = pd.read_csv(args.blacklist)["ngram"].tolist() voc = voc[ voc.index.map(lambda s: not any([ngram in s for ngram in blacklist])) == True ] print("Filtering completed.") voc = voc.sort_values("df", ascending=False) voc.to_csv(opj(args.location, "ngrams.csv")) voc = pd.read_csv(opj(args.location, "ngrams.csv"), keep_default_na=False)[ "ngram" ].tolist() vocabulary = {n: i for i, n in enumerate(voc)} print("Filtering n-grams...") with mp.Pool(processes=args.threads) as pool: ngrams = pool.map(partial(filter_ngrams, wl=voc), ngrams) print("Constructing bag-of-words...") bow = [[vocabulary[ngram] for ngram in _ngrams] for _ngrams in ngrams] # if args.limit_redundancy: # print("Building 'within' matrix...") # with mp.Pool(processes=args.threads) as pool: # within = pool.map(partial(ngram_inclusion, js=voc), voc) # within = np.array(within).astype(int) # print("Removing double-counting...") # bow = csr_matrix(bow) # double_counting = bow.dot(csr_matrix(within.T)) # bow = bow - double_counting # print(double_counting.sum(), "redundant keywords removed") # del double_counting # bow = bow.todense() # print((bow <= -1).sum(), "keywords had negative counts after removal") del ngrams with mp.Pool(processes=args.threads) as pool: bow = pool.map(partial(construct_bow, n=len(voc)), bow) keep = [i for i in range(len(bow)) if len(bow[i][0]) > 0] articles = articles.iloc[keep] articles[["article_id"]].to_csv(opj(args.location, "articles.csv")) bow = [bow[i] for i in keep] dataset = { "tokens": [bow[i][0] for i in range(len(bow))], "counts": [bow[i][1] for i in range(len(bow))], "article_id": articles["article_id"], } del bow with open(opj(args.location, "dataset.pickle"), "wb") as handle: pickle.dump(dataset, handle, protocol=pickle.HIGHEST_PROTOCOL) print("Training...") from embedded_topic_model.models.etm import ETM etm_instance = ETM( voc, num_topics=args.topics, rho_size=args.dimensions, emb_size=args.dimensions, epochs=25, debug_mode=True, train_embeddings=not args.pre_trained_embeddings, model_path=opj(args.location, "model"), embeddings=opj(args.location, "embeddings.bin") if args.pre_trained_embeddings else None, use_c_format_w2vec=True, ) etm_instance.fit(dataset) with open(opj(args.location, "etm_instance.pickle"), "wb") as handle: pickle.dump(etm_instance, handle, protocol=pickle.HIGHEST_PROTOCOL) topics = etm_instance.get_topics(20) print(topics) topic_coherence = etm_instance.get_topic_coherence() print(topic_coherence) topic_diversity = etm_instance.get_topic_diversity() print(topic_diversity)