from AbstractSemantics.terms import TermExtractor
from AbstractSemantics.embeddings import GensimWord2Vec

import pandas as pd
import numpy as np

from os.path import join as opj
from os.path import exists

import itertools
from functools import partial
from collections import defaultdict
import re

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split

import multiprocessing as mp
from matplotlib import pyplot as plt

import argparse
import yaml
import sys

from gensim.models.callbacks import CallbackAny2Vec


class MonitorCallback(CallbackAny2Vec):
    """Print the training loss and a few nearest-neighbour probes after each epoch."""

    def __init__(self, test_words):
        self._test_words = test_words
        self.epoch = 0

    def on_epoch_end(self, model):
        # gensim reports the loss accumulated since train() was called, so the
        # per-epoch value is the difference with the previous reading.
        loss = model.get_latest_training_loss()
        if self.epoch == 0:
            print('Loss after epoch {}: {}'.format(self.epoch, loss))
        else:
            print('Loss after epoch {}: {}'.format(self.epoch, loss - self.loss_previous_step))

        self.epoch += 1
        self.loss_previous_step = loss

        # Show how the nearest neighbours of the probe words evolve.
        for word in self._test_words:
            print(f"{word}: {model.wv.most_similar(word)}")


if __name__ == '__main__':
    parser = argparse.ArgumentParser('CT Model')
    parser.add_argument('location', help='model directory')
    parser.add_argument('filter', choices=['categories', 'keywords', 'no-filter'], help='filter type')
    parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
    parser.add_argument('--samples', type=int, default=50000)
    parser.add_argument('--dimensions', type=int, default=64)
    parser.add_argument('--constant-sampling', type=int, default=0, help='maximum number of articles per year (0 = no cap)')
    parser.add_argument('--reuse-articles', default=False, action="store_true", help="reuse article selection")
    parser.add_argument('--nouns', default=False, action="store_true", help="include nouns")
    parser.add_argument('--adjectives', default=False, action="store_true", help="include adjectives")
    parser.add_argument('--lemmatize', default=False, action="store_true", help="lemmatize tokens")
    parser.add_argument('--remove-latex', default=False, action="store_true", help="remove inline LaTeX")
    parser.add_argument('--add-title', default=False, action="store_true", help="append title to abstract")
    parser.add_argument('--top-unithood', type=int, default=20000, help='top unithood filter')
    parser.add_argument('--min-token-length', type=int, default=0, help='minimum token length')
    parser.add_argument('--min-df', type=int, default=0, help='min_df')
    parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
    parser.add_argument('--threads', type=int, default=4)

    # Arguments are pinned for this particular run; call parser.parse_args()
    # without a list to read them from the command line instead.
    args = parser.parse_args([
        "output/embeddings", "categories",
        "--values", "Phenomenology-HEP", "Theory-HEP",
        "--samples", "150000",
        "--threads", "4",
    ])
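    # The pinned arguments above correspond to the following invocation:
    #
    #     python <this script> output/embeddings categories \
    #         --values Phenomenology-HEP Theory-HEP --samples 150000 --threads 4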

    with open(opj(args.location, "params.yml"), "w+") as fp:
        # Dump a plain dict rather than the argparse.Namespace object so that
        # params.yml contains only simple YAML types.
        yaml.dump(vars(args), fp)
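    # The parameters of a run can then be read back later with, for instance:
    #
    #     with open(opj(args.location, "params.yml")) as fp:
    #         params = yaml.safe_load(fp)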

    articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]

    if args.add_title:
        articles["abstract"] = articles["abstract"].str.cat(articles["title"])

    articles.drop(columns=["title"], inplace=True)

    if args.remove_latex:
        # Strip inline LaTeX, i.e. anything enclosed between two dollar signs.
        articles["abstract"] = articles["abstract"].apply(lambda s: re.sub(r"\$[^$]+\$", "", s))

    # Keep only articles with a reasonably long abstract and lower-case the text.
    articles = articles[articles["abstract"].str.len() >= 100]
    articles["abstract"] = articles["abstract"].str.lower()

    # Derive the publication year from the record creation date, keep 1980-2020,
    # and bucket it into 5-year groups.
    articles = articles[articles["date_created"].str.len() >= 4]
    articles["year"] = articles["date_created"].str[:4].astype(int) - 1980
    articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
    articles["year_group"] = articles["year"] // 5

    if args.reuse_articles:
        # Reuse the article selection saved by a previous run in this directory.
        used = pd.read_csv(opj(args.location, "articles.csv"))
        articles = articles[articles["article_id"].isin(used["article_id"])]
    else:
        articles = articles[~articles["abstract"].isnull()]

        if args.constant_sampling > 0:
            articles = articles.groupby("year").head(args.constant_sampling)

        # Select articles matching the requested filter; with 'no-filter' every
        # article is kept.
        keep = pd.Series([args.filter == "no-filter"] * len(articles), index=articles.index)

        print("Applying filter...")
        if args.filter == "keywords":
            for value in args.values:
                keep |= articles["abstract"].str.contains(value)
        elif args.filter == "categories":
            for value in args.values:
                keep |= articles["categories"].apply(lambda l: value in l)

        articles = articles[keep]
        articles = articles.sample(frac=1).head(args.samples)

        # Save the selection so that --reuse-articles can pick it up later.
        articles[["article_id"]].to_csv(opj(args.location, "articles.csv"))

    articles.reset_index(inplace=True)
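    # Optional sanity check (not part of the original pipeline): inspect how the
    # selected articles are spread over time before training, e.g.
    #
    #     print(articles.groupby("year_group")["article_id"].count())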
- print("Extracting n-grams...")
- extractor = TermExtractor(articles["abstract"].tolist())
- sentences = extractor.tokens(threads=args.threads, lemmatize=True, split_sentences=True)
- print(len(sentences))
- print(sentences[0])
- print(sentences[0][0])
- articles["sentences"] = sentences

    # Train one embedding model per category listed in --values.
    for category in args.values:
        _articles = articles[articles.categories.map(lambda l: category in l)]
        corpus = [sentence for sentences in _articles["sentences"].tolist() for sentence in sentences]
        print(category, len(corpus))

        emb = GensimWord2Vec(corpus)
        model = emb.model(
            vector_size=args.dimensions,
            window=10,
            workers=args.threads,
            compute_loss=True,
            epochs=50,
            callbacks=[MonitorCallback(["quark", "gluino", "renormalization"])],
        )
        # model.build_vocab(corpus)

        # Three additional passes of 10 epochs each over the category corpus.
        for _ in range(3):
            model.train(corpus, epochs=10, total_examples=model.corpus_count)

        model.save(opj(args.location, f"{category}.mdl"))
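    # The saved embeddings can be inspected afterwards. This sketch assumes that
    # emb.model(...) returns a standard gensim Word2Vec, which the train()/save()
    # calls above suggest; adjust the category and path to your run:
    #
    #     from gensim.models import Word2Vec
    #     w2v = Word2Vec.load(opj(args.location, "Theory-HEP.mdl"))
    #     print(w2v.wv.most_similar("renormalization", topn=10))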