topic_distance.py 2.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172
  1. import pandas as pd
  2. import pickle
  3. import numpy as np
  4. from scipy.stats import entropy
  5. from os.path import join as opj
  6. import argparse
  7. parser = argparse.ArgumentParser()
  8. parser.add_argument("--input")
  9. args = parser.parse_args()
  10. with open(opj(args.input, "dataset.pickle"), "rb") as handle:
  11. data = pickle.load(handle)
  12. with open(opj(args.input, "etm_instance.pickle"), "rb") as handle:
  13. etm_instance = pickle.load(handle)
  14. p_w_z = etm_instance.get_topic_word_dist()
  15. n_topics = p_w_z.shape[0]
  16. topics = pd.read_csv(opj(args.input, "topics.csv"))
  17. n_topics = len(topics)
  18. junk = np.array(topics["label"].str.contains("Junk"))
  19. df = pd.read_csv(opj(args.input, "aggregate.csv"))
  20. expertise = np.stack(df[[f"expertise_{k+1}" for k in range(n_topics)]].values)
  21. expertise = expertise[:,~junk]
  22. p_w_z = p_w_z[~junk,:]
  23. theta = etm_instance.get_document_topic_dist()[:,~junk]
  24. n_topics -= junk.sum()
  25. R = np.array([
  26. [((expertise[:,i]>expertise[:,i].mean())&(expertise[:,j]>expertise[:,j].mean())).mean()/(expertise[:,i]>expertise[:,i].mean()).mean() for j in range(n_topics)]
  27. for i in range(n_topics)
  28. ])
  29. np.save(opj(args.input, "nu_expertise.npy"), R)
  30. R = np.array([
  31. [((expertise[:,i]>expertise[:,i].mean())&(expertise[:,j]>expertise[:,j].mean())).mean()/((expertise[:,i]>expertise[:,i].mean())|(expertise[:,j]>expertise[:,j].mean())).mean() for j in range(n_topics)]
  32. for i in range(n_topics)
  33. ])
  34. np.save(opj(args.input, "nu_expertise_symmetric.npy"), R)
  35. def dist(p,q):
  36. return entropy(p+q)-entropy(p)
  37. def dist_symmetric(p,q):
  38. return entropy(p+q)-0.5*(entropy(p)+entropy(q))
  39. distance = np.zeros((n_topics, n_topics))
  40. distance_symmetric = np.zeros((n_topics, n_topics))
  41. for i in range(n_topics):
  42. for j in range(n_topics):
  43. distance[i,j] = dist(p_w_z[i], p_w_z[j])
  44. distance_symmetric[i,j] = dist_symmetric(p_w_z[i], p_w_z[j])
  45. V = len(pd.read_csv(opj(args.input, "ngrams.csv")))
  46. distance_symmetric /= np.log(V)
  47. np.save(opj(args.input, "nu_ling.npy"), distance)
  48. np.save(opj(args.input, "nu_ling_symmetric.npy"), distance_symmetric)
  49. np.save(opj(args.input, "nu_etm.npy"), np.corrcoef(theta.T))