from AbstractSemantics.terms import TermExtractor
from AbstractSemantics.embeddings import GensimWord2Vec
from gensim.models import KeyedVectors
import nltk
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from os.path import join as opj
from os.path import exists
import itertools
from functools import partial
from collections import defaultdict
import re
import multiprocessing as mp

# from matplotlib import pyplot as plt

import argparse
import yaml
import sys
import pickle

from gensim.models.callbacks import CallbackAny2Vec
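
# Final training loss of the most recent Word2Vec run; MonitorCallback updates it
# at the end of every epoch.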
model_loss = 0


class MonitorCallback(CallbackAny2Vec):
    def __init__(self, test_words):
        self._test_words = test_words
        self.epoch = 0

    def on_epoch_end(self, model):
        # gensim accumulates the loss across epochs; read it, then reset the
        # counter so the next epoch reports only its own loss
        loss = model.get_latest_training_loss()
        model.running_training_loss = 0.0
        print("Loss after epoch {}: {}".format(self.epoch, loss))
        global model_loss
        model_loss = loss
        self.epoch += 1
        for word in self._test_words:  # track how the nearest neighbours evolve
            print(f"{word}: {model.wv.most_similar(word)}")


def filter_ngrams(l, wl):
    # keep only the n-grams that appear in the whitelist `wl`
    return [ngram for ngram in l if ngram in wl]


def construct_bow(l, n):
    # bag-of-words representation: unique n-grams of `l` and their counts
    items = list(set(l))
    return np.array(items), np.array([l.count(i) for i in items])


def ngram_inclusion(i, js):
    # for each candidate `j`, check whether `i` occurs in `j` as a whole token
    # and whether `j` has exactly one more token than `i`
    return [
        j.find(i) >= 0  # substring match
        and bool(re.search(rf"(^|_){re.escape(i)}($|_)", j))
        and (j.count("_") == i.count("_") + 1)
        and (
            (i.count("_") >= 1)
            or bool(re.search(rf"(^|_){re.escape(i)}$", j))
            or bool(re.search(rf"^{re.escape(i)}($|_)", j))
        )
        for j in js
    ]
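
# Illustrative example (values chosen only for illustration):
#   ngram_inclusion("gauge", ["gauge_theory", "gauge", "lattice_gauge_theory"])
# returns [True, False, False]: only "gauge_theory" contains "gauge" as a whole
# token while having exactly one extra token.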


if __name__ == "__main__":
    parser = argparse.ArgumentParser("CT Model")
    parser.add_argument("location", help="model directory")
    parser.add_argument(
        "filter", choices=["categories", "keywords", "no-filter"], help="filter type"
    )
    parser.add_argument("--values", nargs="+", default=[], help="filter allowed values")
    parser.add_argument("--dataset", default="inspire-harvest/database")
    # sample size
    parser.add_argument("--samples", type=int, default=50000)
    parser.add_argument("--constant-sampling", type=int, default=0)
    # text pre-processing
    parser.add_argument(
        "--add-title", default=False, action="store_true", help="include title"
    )
    parser.add_argument(
        "--remove-latex", default=False, action="store_true", help="remove latex"
    )
    parser.add_argument(
        "--lemmatize", default=False, action="store_true", help="lemmatize"
    )
    parser.add_argument(
        "--limit-redundancy",
        default=False,
        action="store_true",
        help="limit redundancy",
    )
    parser.add_argument("--blacklist", default=None, help="blacklist")
    # embeddings
    parser.add_argument("--dimensions", type=int, default=50)
    parser.add_argument("--pre-trained-embeddings", default=False, action="store_true")
    parser.add_argument("--use-saved-embeddings", default=False, action="store_true")
    # topic model parameters
    parser.add_argument("--topics", type=int, default=25)
    parser.add_argument("--min-df", type=float, default=0.001)
    parser.add_argument("--max-df", type=float, default=0.15)
    parser.add_argument("--threads", type=int, default=4)
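
    # NOTE: the command line is bypassed here; parse_args receives a hard-coded
    # argument list, so edit the list below (rather than the CLI) to change runs.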
    args = parser.parse_args(
        [
            "output/etm_20_r",
            "categories",
            "--values",
            "Theory-HEP",
            "Phenomenology-HEP",
            "--dataset",
            "../inspire-harvest/database",
            "--constant-sampling",
            "30000",
            "--samples",
            "300000",
            "--threads",
            "24",
            "--add-title",
            "--remove-latex",
            "--dimensions",
            "50",
            "--topics",
            "25",
            "--min-df",
            "0.00075",
            "--lemmatize",
            "--pre-trained-embeddings",
            # "--limit-redundancy",
            "--use-saved-embeddings",
            # "--blacklist",
            # "output/medialab/blacklist",
        ]
    )

    # with open(opj(args.location, "params.yml"), "w+") as fp:
    #     yaml.dump(args, fp)

    articles = pd.read_parquet(
        opj(args.dataset, "articles.parquet")
    )[["title", "abstract", "article_id", "date_created", "categories"]]

    if args.add_title:
        articles["abstract"] = articles["abstract"].str.cat(articles["title"], sep=". ")

    articles.drop(columns=["title"], inplace=True)

    if args.remove_latex:
        # strip inline math delimited by dollar signs
        articles["abstract"] = articles["abstract"].apply(
            lambda s: re.sub(r"\$[^$]+\$", "", s)
        )
        # strip remaining LaTeX commands such as \alpha or \cite
        articles["abstract"] = articles["abstract"].apply(
            lambda s: re.sub(r"\b\\\w+", "", s)
        )
        # keep only alphanumerics, hyphens, spaces and periods
        articles["abstract"] = articles["abstract"].apply(
            lambda s: re.sub(r"[^0-9a-zA-Z\- .]+", "", s)
        )

    # articles["abstract"] = articles["abstract"].str.replace("-", " ")  # NEW

    articles = articles[articles["abstract"].map(len) >= 100]
    articles["abstract"] = articles["abstract"].str.lower()

    articles = articles[articles["date_created"].str.len() >= 4]
    if "year" not in articles.columns:
        # "year" becomes an offset from 2000 (0 = 2000, 20 = 2020)
        articles["year"] = articles["date_created"].str[:4].astype(int) - 2000
        articles = articles[(articles["year"] >= 0) & (articles["year"] <= 20)]
    else:
        # here "year" keeps its calendar value (e.g. 2005)
        articles["year"] = articles["year"].astype(int)
        articles = articles[articles["year"] >= 2000]

    articles["year_group"] = articles["year"] // 5

    keep = pd.Series([False] * len(articles), index=articles.index)

    print("Applying filter...")
    if args.filter == "keywords":
        for value in args.values:
            keep |= articles["abstract"].str.contains(value)
    elif args.filter == "categories":
        for value in args.values:
            keep |= articles["categories"].apply(lambda l: value in l)
    elif args.filter == "no-filter":
        keep |= True

    articles = articles[keep].sample(frac=1)

    if args.constant_sampling > 0:
        articles = articles.groupby("year").head(args.constant_sampling)

    articles = articles.sample(frac=1).head(args.samples)
    articles.reset_index(inplace=True)

    print(articles)

    print("Extracting n-grams...")

    extractor = TermExtractor(
        articles["abstract"].tolist(),
        # limit_redundancy=args.limit_redundancy,
        patterns=[
            ["JJ.*"],
            ["NN.*"],
            ["JJ.*", "NN.*"],
            ["JJ.*", "NN.*", "NN.*"],
            # ["JJ.*", "NN", "CC", "NN.*"],
            # ["JJ.*", "NN.*", "JJ.*", "NN.*"],
            # ["RB.*", "JJ.*", "NN.*", "NN.*"],
        ],
    )
    ngrams = extractor.ngrams(
        threads=args.threads,
        lemmatize=args.lemmatize,
        lemmatize_ngrams=args.lemmatize,
        split_sentences=True,
    )
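
    # `ngrams` holds, for each abstract, a list of sentences, each sentence being
    # a list of n-gram token tuples (hence the "_".join below).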

    del extractor
    del articles["abstract"]

    ngrams = map(
        lambda l: [
            [
                ("_".join(n)).strip()
                .replace("-", "_")
                .replace(".._", "")
                .replace("_..", "")
                for n in sent
            ]
            for sent in l
        ],
        ngrams,
    )
    ngrams = list(ngrams)
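
    # Sweep over several embedding dimensions, training five Word2Vec models per
    # dimension and recording the final training loss of each run.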

    print("Pre-training embeddings...")

    emb = GensimWord2Vec(
        [sentence for sentences in ngrams for sentence in sentences]
    )

    losses = []
    for dim in [10, 25, 50, 75, 100, 150, 200]:
        for attempt in range(5):
            model = emb.create_model(
                vector_size=dim,
                window=5,
                workers=args.threads,
                compute_loss=True,
                epochs=20,
                min_count=30,
                sg=1,
                callbacks=[
                    MonitorCallback(
                        [
                            "black_hole",
                            "supersymmetry",
                        ]
                    )
                ],
            )
            print(model_loss)
            losses.append({"dim": dim, "loss": model_loss})
            print(losses)

    losses = pd.DataFrame(losses)
    losses.to_csv(opj(args.location, "word2vec_losses.csv"))
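
    # The CSV has one row per (dimension, attempt); to compare dimensions, the
    # runs can be aggregated afterwards, e.g. losses.groupby("dim")["loss"].mean().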