"""Compute topic-to-topic relatedness matrices for a fitted topic model.

Reads ``etm_instance.pickle``, ``topics.csv``, ``aggregate.csv`` and
``ngrams.csv`` from the directory given by ``--input`` and writes five
matrices back into that directory:

* ``nu_expertise.npy``            -- conditional above-mean expertise overlap
* ``nu_expertise_symmetric.npy``  -- Jaccard above-mean expertise overlap
* ``nu_ling.npy``                 -- asymmetric entropy-based topic distance
* ``nu_ling_symmetric.npy``       -- symmetric distance divided by ``log(V)``
* ``nu_etm.npy``                  -- correlation of document-topic weights

Topics whose label contains "Junk" are dropped from every matrix.
"""
import argparse
import pickle
from os.path import join as opj

import numpy as np
import pandas as pd
from scipy.stats import entropy


def dist(p, q):
    """Return ``entropy(p + q) - entropy(p)``.

    ``scipy.stats.entropy`` normalises its argument to sum to 1, so the first
    term is the entropy of the normalised sum of ``p`` and ``q``.
    """
    return entropy(p + q) - entropy(p)


def dist_symmetric(p, q):
    """Return ``entropy(p + q) - 0.5 * (entropy(p) + entropy(q))``.

    Unlike :func:`dist`, the result does not depend on the argument order.
    """
    return entropy(p + q) - 0.5 * (entropy(p) + entropy(q))


def above_mean_mask(expertise):
    """Return a boolean array marking entries strictly above their column mean.

    Args:
        expertise: 2-D array-like, one row per observation and one column
            per topic.
    """
    expertise = np.asarray(expertise)
    return expertise > expertise.mean(axis=0)


def cooccurrence_counts(above):
    """Return the integer matrix ``C`` with ``C[i, j] = sum(above_i & above_j)``.

    The diagonal ``C[i, i]`` is the number of rows in which column ``i`` is
    set. Integer arithmetic keeps the counts exact.
    """
    flags = np.asarray(above).astype(np.int64)
    return flags.T @ flags


def conditional_overlap(above):
    """Return ``R[i, j] = mean(above_i & above_j) / mean(above_i)``.

    Row ``i`` is NaN when column ``i`` of ``above`` is never set (0 / 0).
    """
    n_rows = np.asarray(above).shape[0]
    joint = cooccurrence_counts(above) / n_rows
    marginal = np.diag(joint)
    return joint / marginal[:, None]


def jaccard_overlap(above):
    """Return ``R[i, j] = mean(above_i & above_j) / mean(above_i | above_j)``.

    Entry ``(i, j)`` is NaN when neither column is ever set (0 / 0).
    """
    n_rows = np.asarray(above).shape[0]
    counts = cooccurrence_counts(above)
    sizes = np.diag(counts)
    # Inclusion-exclusion: |A or B| = |A| + |B| - |A and B|.
    union_counts = sizes[:, None] + sizes[None, :] - counts
    return (counts / n_rows) / (union_counts / n_rows)


def topic_distances(p_w_z):
    """Return ``(distance, distance_symmetric)`` for all ordered topic pairs.

    Args:
        p_w_z: 2-D array with one row per topic; each row is passed to
            :func:`dist` and :func:`dist_symmetric`.

    Returns:
        Two ``(n_topics, n_topics)`` float arrays; entry ``[i, j]`` holds
        ``dist(p_w_z[i], p_w_z[j])`` and
        ``dist_symmetric(p_w_z[i], p_w_z[j])`` respectively.
    """
    n_topics = len(p_w_z)
    distance = np.zeros((n_topics, n_topics))
    distance_symmetric = np.zeros((n_topics, n_topics))
    for i, p in enumerate(p_w_z):
        for j, q in enumerate(p_w_z):
            distance[i, j] = dist(p, q)
            distance_symmetric[i, j] = dist_symmetric(p, q)
    return distance, distance_symmetric


def main():
    """Parse ``--input``, compute every matrix and save it as ``.npy``."""
    parser = argparse.ArgumentParser()
    # Required: without it every path join below would fail on None.
    parser.add_argument("--input", required=True)
    args = parser.parse_args()

    # NOTE(security): pickle.load can execute arbitrary code; only point
    # --input at directories whose contents you trust.
    with open(opj(args.input, "etm_instance.pickle"), "rb") as handle:
        etm_instance = pickle.load(handle)

    topics = pd.read_csv(opj(args.input, "topics.csv"))
    n_topics = len(topics)
    # na=False: a topic with a missing label is kept rather than producing
    # a NaN that would break the boolean mask.
    junk = topics["label"].str.contains("Junk", na=False).to_numpy(dtype=bool)

    df = pd.read_csv(opj(args.input, "aggregate.csv"))
    expertise_columns = [f"expertise_{k + 1}" for k in range(n_topics)]
    expertise = df[expertise_columns].to_numpy()[:, ~junk]

    p_w_z = etm_instance.get_topic_word_dist()[~junk, :]
    theta = etm_instance.get_document_topic_dist()[:, ~junk]

    above = above_mean_mask(expertise)
    np.save(opj(args.input, "nu_expertise.npy"), conditional_overlap(above))
    np.save(
        opj(args.input, "nu_expertise_symmetric.npy"),
        jaccard_overlap(above),
    )

    distance, distance_symmetric = topic_distances(p_w_z)

    # Only the symmetric distance is normalised, by the log of the number
    # of rows in ngrams.csv.
    V = len(pd.read_csv(opj(args.input, "ngrams.csv")))
    distance_symmetric /= np.log(V)

    np.save(opj(args.input, "nu_ling.npy"), distance)
    np.save(opj(args.input, "nu_ling_symmetric.npy"), distance_symmetric)
    np.save(opj(args.input, "nu_etm.npy"), np.corrcoef(theta.T))


if __name__ == "__main__":
    main()