123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172 |
- import pandas as pd
- import pickle
- import numpy as np
- from scipy.stats import entropy
- from os.path import join as opj
- import argparse
# Command-line interface: --input names the directory that holds the model
# artifacts read below and that receives the .npy outputs.
parser = argparse.ArgumentParser()
# Required: without it args.input would be None and the first path join
# below would fail with an opaque TypeError.
parser.add_argument("--input", required=True)
args = parser.parse_args()

# NOTE(security): pickle.load can execute arbitrary code stored in the file;
# only point --input at a directory you trust.
# NOTE(review): `data` is not used in the visible part of this script; the
# load is kept in case later code relies on it -- confirm before removing.
with open(opj(args.input, "dataset.pickle"), "rb") as handle:
    data = pickle.load(handle)

with open(opj(args.input, "etm_instance.pickle"), "rb") as handle:
    etm_instance = pickle.load(handle)

# Topic-word distribution; indexed below as (topic, word).
p_w_z = etm_instance.get_topic_word_dist()

topics = pd.read_csv(opj(args.input, "topics.csv"))
n_topics = len(topics)

# Boolean mask of topics whose label contains "Junk". na=False treats a
# missing label as "not junk"; otherwise NaN would leak into the mask and
# break the `~junk` inversions below.
junk = topics["label"].str.contains("Junk", na=False).to_numpy(dtype=bool)

# One expertise_<k> column per topic (1-based) in aggregate.csv.
df = pd.read_csv(opj(args.input, "aggregate.csv"))
expertise = df[[f"expertise_{k + 1}" for k in range(n_topics)]].to_numpy()

# Drop junk topics from every per-topic quantity.
expertise = expertise[:, ~junk]
p_w_z = p_w_z[~junk, :]
theta = etm_instance.get_document_topic_dist()[:, ~junk]
n_topics = int((~junk).sum())

# above[r, k] is True when row r's expertise in topic k exceeds that topic's
# mean. The mask is built once instead of once per (i, j) pair.
above = (expertise > expertise.mean(axis=0)).astype(float)
# both[i, j]: number of rows above the mean on topic i AND on topic j.
both = above.T @ above
# per_topic[k]: number of rows above the mean on topic k.
per_topic = above.sum(axis=0)

# R[i, j]: share of the rows above the mean on topic i that are also above
# the mean on topic j. A topic with no row above its mean yields NaN.
R = both / per_topic[:, None]
np.save(opj(args.input, "nu_expertise.npy"), R)

# Symmetric variant: rows above the mean on both topics divided by rows
# above the mean on at least one of them (Jaccard index).
either = per_topic[:, None] + per_topic[None, :] - both
R = both / either
np.save(opj(args.input, "nu_expertise_symmetric.npy"), R)
def dist(p, q):
    """Return ``H(p + q) - H(p)``, an asymmetric divergence from p to q.

    ``H`` is ``scipy.stats.entropy`` (natural logarithm), which rescales its
    argument to sum to one. When ``p`` and ``q`` are both normalised,
    ``H(p + q)`` is therefore the entropy of their equal-weight mixture.

    Args:
        p: Non-negative weights of the reference distribution.
        q: Non-negative weights of the other distribution, same length as p.

    Returns:
        The entropy of the mixture minus the entropy of ``p``.
    """
    # Coerce to arrays so that `+` is element-wise; with plain lists it would
    # concatenate and silently produce a wrong value.
    p = np.asarray(p)
    q = np.asarray(q)
    return entropy(p + q) - entropy(p)
def dist_symmetric(p, q):
    """Return ``H(p + q) - (H(p) + H(q)) / 2``, symmetric in p and q.

    ``H`` is ``scipy.stats.entropy`` (natural logarithm), which rescales its
    argument to sum to one. When ``p`` and ``q`` are both normalised this is
    the Jensen-Shannon divergence between them.

    Args:
        p: Non-negative weights of the first distribution.
        q: Non-negative weights of the second distribution, same length as p.

    Returns:
        The entropy of the mixture minus the mean of the two entropies.
    """
    # Coerce to arrays so that `+` is element-wise; with plain lists it would
    # concatenate and silently produce a wrong value.
    p = np.asarray(p)
    q = np.asarray(q)
    return entropy(p + q) - 0.5 * (entropy(p) + entropy(q))
# Pairwise divergences between the word distributions of the retained topics:
# entry [i, j] compares row i of p_w_z with row j.
pair_shape = (n_topics, n_topics)
distance = np.zeros(pair_shape)
distance_symmetric = np.zeros(pair_shape)
for i, j in np.ndindex(*pair_shape):
    row_i, row_j = p_w_z[i], p_w_z[j]
    distance[i, j] = dist(row_i, row_j)
    distance_symmetric[i, j] = dist_symmetric(row_i, row_j)

# V is the number of rows in ngrams.csv (presumably the vocabulary size --
# confirm). Only the symmetric matrix is rescaled by log(V); the asymmetric
# one is saved as computed.
ngrams = pd.read_csv(opj(args.input, "ngrams.csv"))
V = len(ngrams)
distance_symmetric /= np.log(V)

np.save(opj(args.input, "nu_ling.npy"), distance)
np.save(opj(args.input, "nu_ling_symmetric.npy"), distance_symmetric)

# Correlation between the columns of theta, i.e. between retained topics.
topic_correlation = np.corrcoef(theta.T)
np.save(opj(args.input, "nu_etm.npy"), topic_correlation)
|