# Train, tune and apply a tomotopy correlated topic model (CTModel) on article abstracts.
import argparse
import itertools
import multiprocessing as mp
import os
import re
import sys
from collections import defaultdict
from functools import partial
from os.path import exists
from os.path import join as opj

import numpy as np
import pandas as pd
import tomotopy as tp
import tqdm
import yaml
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

from AbstractSemantics.terms import TermExtractor
# Command-line interface. Options are declared in the same order as they
# appear in the --help output; groups of look-alike options are declared
# from small tables to keep the listing compact.
parser = argparse.ArgumentParser('CT Model')

# Positional arguments: output directory and the kind of article filter.
parser.add_argument('location', help='model directory')
parser.add_argument(
    'filter',
    choices=['categories', 'keywords', 'no-filter'],
    help='filter type',
)

# Article selection.
parser.add_argument('--values', nargs='+', default=[], help='filter allowed values')
parser.add_argument('--samples', type=int, default=100000)
parser.add_argument('--constant-sampling', type=int, default=0)

# Boolean switches controlling article reuse and text pre-processing.
for _flag, _description in (
    ('--reuse-articles', "reuse article selection"),
    ('--nouns', "include nouns"),
    ('--adjectives', "include adjectives"),
    ('--lemmatize', "stemmer"),
    ('--remove-latex', "remove latex"),
    ('--limit-redundancy', "limit redundancy"),
    ('--add-title', "include title"),
):
    parser.add_argument(_flag, default=False, action="store_true", help=_description)

# Integer thresholds used while building the vocabulary.
for _flag, _default, _description in (
    ('--top-unithood', 20000, 'top unithood filter'),
    ('--min-token-length', 0, 'minimum token length'),
    ('--min-df', 0, 'min_df'),
):
    parser.add_argument(_flag, type=int, default=_default, help=_description)
# parser.add_argument('--top-termhood', type=int, default=15000, help='top termhood filter')

# Model persistence and hyperparameter search.
parser.add_argument('--reload-model', default=False, action="store_true", help="reload saved model")
parser.add_argument('--reuse-stored-vocabulary', default=False, action='store_true')
parser.add_argument(
    '--compute-best-params',
    action='store_true',
    help='optimize hyperparameters (maximzing C_v)',
    required=False,
)
parser.add_argument(
    '--reuse-best-params',
    action='store_true',
    help='re-use optimal hyperparameters',
    required=False,
)

# Model hyperparameters and parallelism.
parser.add_argument('--topics', type=int, default=8, help='topics')
parser.add_argument('--alpha', default=0.1, type=float, help='LDA alpha prior')
parser.add_argument('--eta', default=0.01, type=float, help='LDA beta(eta) prior')
parser.add_argument('--threads', type=int, default=4)

# Parsed at module level: the pipeline below reads `args` directly.
args = parser.parse_args()
- if __name__ == "__main__":
- with open(opj(args.location, "params.yml"), "w+") as fp:
- yaml.dump(args, fp)
- articles = pd.read_parquet("inspire-harvest/database/articles.parquet")[["title", "abstract", "article_id", "date_created", "categories"]]
- if args.add_title:
- articles["abstract"] = articles["abstract"].str.cat(articles["title"])
- articles.drop(columns = ["title"], inplace=True)
- if args.remove_latex:
- articles['abstract'] = articles['abstract'].apply(lambda s: re.sub('$[^>]+$', '', s))
- articles = articles[articles["abstract"].map(len)>=100]
- articles["abstract"] = articles["abstract"].str.lower()
- articles = articles[articles["date_created"].str.len() >= 4]
- articles["year"] = articles["date_created"].str[:4].astype(int)-1980
- articles = articles[(articles["year"] >= 0) & (articles["year"] <= 40)]
- if args.reuse_articles:
- used = pd.read_csv(opj(args.location, 'articles.csv'))
- articles = articles[articles["article_id"].isin(used["article_id"])]
- else:
- articles = articles[~articles["abstract"].isnull()]
- if args.constant_sampling > 0:
- articles = articles.groupby("year").head(args.constant_sampling)
- keep = pd.Series([False]*len(articles), index=articles.index)
- print("Applying filter...")
- if args.filter == 'keywords':
- for value in args.values:
- keep |= articles["abstract"].str.contains(value)
- elif args.filter == 'categories':
- for value in args.values:
- keep |= articles["categories"].apply(lambda l: value in l)
- articles = articles[keep==True]
- articles = articles.sample(frac=1).head(args.samples)
- articles[["article_id"]].to_csv(opj(args.location, 'articles.csv'))
- articles.reset_index(inplace = True)
- print("Extracting n-grams...")
- extractor = TermExtractor(articles["abstract"].tolist(), limit_redundancy=args.limit_redundancy)
- if args.nouns:
- extractor.add_patterns([["NN.*"]])
- if args.adjectives:
- extractor.add_patterns([["^JJ$"]])
- ngrams = extractor.ngrams(threads=args.threads,lemmatize=args.lemmatize)
- ngrams = map(lambda l: [" ".join(n) for n in l], ngrams)
- ngrams = list(ngrams)
- articles["ngrams"] = ngrams
- print("Deriving vocabulary...")
- if not args.reuse_stored_vocabulary:
- ngrams_occurrences = defaultdict(int)
- ngrams_cooccurrences = defaultdict(int)
- termhood = defaultdict(int)
- for ngrams in articles["ngrams"].tolist():
- _ngrams = set(ngrams)
- for ngram in _ngrams:
- ngrams_occurrences[ngram] += 1
- ngrams_occurrences = pd.DataFrame(
- {"ngram": ngrams_occurrences.keys(), "count": ngrams_occurrences.values()}
- )
- ngrams_occurrences["unithood"] = (
- np.log(2 + ngrams_occurrences["ngram"].str.count(" "))
- * ngrams_occurrences["count"]
- )
- ngrams_occurrences["unithood"] /= len(articles)
- ngrams_occurrences.set_index("ngram", inplace=True)
- ngrams_occurrences["len"] = ngrams_occurrences.index.map(len)
- ngrams_occurrences = ngrams_occurrences[ngrams_occurrences["len"] > 1]
- top_unithood = ngrams_occurrences.sort_values("unithood", ascending=False).head(
- args.top_unithood
- )
- top = top_unithood
- top.to_csv(opj(args.location, "ngrams.csv"))
- selected_ngrams = set(pd.read_csv(opj(args.location, 'ngrams.csv'))['ngram'].tolist())
- ngrams = articles["ngrams"].tolist()
- ngrams = [[ngram for ngram in _ngrams if ngram in selected_ngrams] for _ngrams in ngrams]
- training_ngrams, validation_ngrams = train_test_split(ngrams, train_size=0.9)
- print("Creating tomotopy copora...")
- training_corpus = tp.utils.Corpus()
- for doc in training_ngrams:
- training_corpus.add_doc(words=doc)
- validation_corpus = tp.utils.Corpus()
- for doc in validation_ngrams:
- validation_corpus.add_doc(words=doc)
- if args.compute_best_params:
- topics = list(range(25, 100, 25)) + list(range(100, 200, 50))
- alphas = np.logspace(-2, 0, 3, True)
- etas = np.logspace(-3, -1, 3, True)
- model_results = {
- 'topics': [],
- 'alphas': [],
- 'etas': [],
- 'u_mass': [],
- 'c_uci': [],
- 'c_npmi': [],
- 'c_v': [],
- 'train_ll_per_word': [],
- 'validation_ll': [],
- 'documents': [],
- 'words': [],
- 'perplexity': [],
- 'train_perplexity': []
- }
- try:
- done = pd.read_csv(opj(args.location, 'lda_tuning_results.csv'))
- model_results = done.to_dict(orient="list")
- print(model_results)
- except Exception as e:
- print(e)
- done = None
- with tqdm.tqdm(total=len(topics)*len(alphas)*len(etas)) as pbar:
- for k in topics:
- for alpha in alphas:
- # alpha = alpha*10/k
- for eta in etas:
- print(k, alpha, eta)
- is_done = done is not None and len(done[(done["topics"] == k) & (done["alphas"] == alpha) & (done["etas"] == eta)]) > 0
- if is_done:
- print("already done")
- continue
- try:
- mdl = tp.CTModel(
- tw=tp.TermWeight.ONE,
- corpus=training_corpus,
- k=k,
- min_df=3,
- smoothing_alpha=alpha,
- eta=eta
- )
- mdl.train(0)
- prev_ll_per_word = None
- for _ in range(0, 100, 10):
- mdl.train(10)
- print('Iteration: {:05}\tll per word: {:.5f}'.format(mdl.global_step, mdl.ll_per_word))
- if prev_ll_per_word is not None and prev_ll_per_word > mdl.ll_per_word:
- print("stopping here")
- break
- else:
- prev_ll_per_word = mdl.ll_per_word
- except:
- print("failed")
- pbar.update(1)
- continue
- for preset in ('u_mass', 'c_uci', 'c_npmi', 'c_v'):
- coh = tp.coherence.Coherence(mdl, coherence=preset)
- average_coherence = coh.get_score()
- model_results[preset].append(average_coherence)
- res, total_ll = mdl.infer(validation_corpus, together=True)
- _ll = np.array([doc.get_ll() for doc in res])
- words = np.array([len(doc.words) for doc in res])
- perplexity = np.exp(-np.sum(total_ll)/np.sum(words))
- print(perplexity, mdl.perplexity)
- print(-np.sum(total_ll)/np.sum(words), np.log(mdl.perplexity), -np.sum(total_ll)/np.sum(words)/np.log(mdl.perplexity))
- #print(total_ll, _ll)
- print(f"Topics: {k}, Perplexity: {perplexity}")
- print(mdl.ll_per_word)
- print(mdl.perplexity)
- print(mdl.num_words)
- model_results['train_ll_per_word'].append(mdl.ll_per_word)
- model_results['validation_ll'].append(np.sum(total_ll))
- model_results['documents'].append(len(res))
- model_results['words'].append(np.sum(words))
- model_results['perplexity'].append(perplexity)
- model_results['train_perplexity'].append(mdl.perplexity)
- model_results['topics'].append(k)
- model_results['alphas'].append(alpha)
- model_results['etas'].append(eta)
- pd.DataFrame(model_results).to_csv(opj(args.location, 'lda_tuning_results.csv'), index=False)
- pbar.update(1)
- params = {'topics': args.topics}
- if not args.reload_model:
- print("Training LDA...")
- min_df = args.min_df
- print(min_df)
- mdl = tp.CTModel(
- tw=tp.TermWeight.ONE,
- corpus=training_corpus,
- k=params['topics'],
- min_df=min_df,
- smoothing_alpha=args.alpha,
- eta=args.eta
- )
- mdl.train(0)
- print('Num docs:', len(mdl.docs), ', Vocab size:', len(mdl.used_vocabs), ', Num words:', mdl.num_words)
- print('Removed top words:', mdl.removed_top_words)
- print('Training...', file=sys.stderr, flush=True)
- for _ in range(0, 250, 10):
- mdl.train(10)
- print('Iteration: {:05}\tll per word: {:.5f}'.format(mdl.global_step, mdl.ll_per_word))
- import pyLDAvis
- topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
- doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
- doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
- doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
- vocab = list(mdl.used_vocabs)
- term_frequency = mdl.used_vocab_freq
- prepared_data = pyLDAvis.prepare(
- topic_term_dists,
- doc_topic_dists,
- doc_lengths,
- vocab,
- term_frequency,
- start_index=0, # tomotopy starts topic ids with 0, pyLDAvis with 1
- sort_topics=False # IMPORTANT: otherwise the topic_ids between pyLDAvis and tomotopy are not matching!
- )
- pyLDAvis.save_html(prepared_data, opj(args.location, 'ldavis.html'))
- print('Saving...', file=sys.stderr, flush=True)
- mdl.save(opj(args.location, "model"), True)
- else:
- print("Loading pre-trained model...")
- mdl = tp.CTModel.load(opj(args.location, "model"))
- mdl.summary()
- # extract candidates for auto topic labeling
- extractor = tp.label.PMIExtractor(min_cf=10, min_df=5, max_len=5, max_cand=10000)
- cands = extractor.extract(mdl)
- labeler = tp.label.FoRelevance(mdl, cands, min_df=5, smoothing=1e-2, mu=0.25)
- for k in range(mdl.k):
- print("== Topic #{} ==".format(k))
- print("Labels:", ', '.join(label for label, score in labeler.get_topic_labels(k, top_n=5)))
- for word, prob in mdl.get_topic_words(k, top_n=10):
- print(word, prob, sep='\t')
- print()
- for preset in ('u_mass', 'c_uci', 'c_npmi', 'c_v'):
- coh = tp.coherence.Coherence(mdl, coherence=preset)
- average_coherence = coh.get_score()
- coherence_per_topic = [coh.get_score(topic_id=k) for k in range(mdl.k)]
- print('==== Coherence: {} ===='.format(preset))
- print('Average:', average_coherence, '\nPer Topic:', coherence_per_topic)
- print()
- print("Applying model...")
- used_vocab = set(mdl.used_vocabs)
- articles["ngrams"] = ngrams
- articles = articles[articles["ngrams"].map(len) > 0]
- articles = articles[articles["ngrams"].map(lambda l: len(set(l)&used_vocab) > 0) == True]
- ngrams = articles["ngrams"].tolist()
- corpus = tp.utils.Corpus()
- for doc in ngrams:
- corpus.add_doc(words=doc)
- test_result_cps, ll = mdl.infer(corpus)
- topic_dist = []
- for i, doc in enumerate(test_result_cps):
- print(i, doc)
- dist = doc.get_topic_dist()
- topic_dist.append(dist)
- n = 0
- while exists(opj(args.location, f"topics_{n}.parquet")):
- n +=1
- path = opj(args.location, f"topics_{n}.parquet")
- articles["probs"] = topic_dist
- articles["topics"] = articles["probs"].map(lambda l: ",".join(list(map('{:.6f}'.format, l))))
- articles[["year", "article_id", "topics", "probs"]].to_parquet(path, index=False)
- try:
- descriptions = pd.read_csv(opj(args.location, "descriptions.csv")).set_index("topic")
- except:
- descriptions = None
- cumprobs = np.zeros((42, mdl.k))
- counts = np.zeros(42)
- for year, _articles in articles.groupby("year"):
- print(year)
- for article in _articles.to_dict(orient = 'records'):
- for topic, prob in enumerate(article['probs']):
- cumprobs[year,topic] += prob
- counts[year] = len(_articles)
- cumprobs.dump(opj(args.location, 'cumsprobs.npy'))
- counts.dump(opj(args.location, 'counts.npy'))
- lines = ['-', '--', '-.', ':', 'dotted', (0, (1, 10)), (0, (3, 10, 1, 10)), (0, (5, 10)), (0, (3, 1, 1, 1, 1, 1)), '-', '--']
- for topic in range(mdl.k):
- plt.plot(
- 1980+np.arange(42),
- cumprobs[:,topic],
- linestyle=lines[topic//7],
- label=topic if descriptions is None else descriptions.loc[topic,"description"]
- )
- plt.title("Absolute magnitude of supersymmetry research topics")
- plt.ylabel("Estimated amount of articles\n($\\sum_{d_i \\in \\mathrm{year}} p(t|d_i)$)")
- plt.xlim(1980, 2018)
- plt.legend(fontsize='x-small')
- plt.savefig(opj(args.location, "topics_count.pdf"))
- plt.clf()
- for topic in range(mdl.k):
- plt.plot(
- 1980+np.arange(42),
- cumprobs[:,topic]/counts,
- linestyle=lines[topic//7],
- label=topic if descriptions is None else descriptions.loc[topic,"description"]
- )
- plt.title("Relative magnitude of supersymmetry research topics")
- plt.ylabel("Probability of each topic throughout years\n($p(t|\\mathrm{year}$)")
- plt.xlim(1980, 2018)
- plt.legend(fontsize='x-small')
- plt.savefig(opj(args.location, "topics_probs.pdf"))
- plt.clf()