# supersymmetry_usages.py
import warnings

import numpy as np
import pandas as pd
import tomotopy as tp
  4. mdl = tp.CTModel.load("output/hep-ct-75-0.1-0.001-130000-20/model")
  5. topic_term_dists = np.stack([mdl.get_topic_word_dist(k) for k in range(mdl.k)])
  6. doc_topic_dists = np.stack([doc.get_topic_dist() for doc in mdl.docs])
  7. doc_topic_dists /= doc_topic_dists.sum(axis=1, keepdims=True)
  8. doc_lengths = np.array([len(doc.words) for doc in mdl.docs])
  9. vocab = list(mdl.used_vocabs)
  10. term_frequency = mdl.used_vocab_freq
  11. p_w_t = topic_term_dists
  12. p_t = np.sum(doc_topic_dists.transpose()*doc_lengths,axis=1)/np.sum(doc_lengths)
  13. p_w = term_frequency/np.sum(doc_lengths)
  14. p_t_w = (p_w_t.T*p_t).T/p_w
  15. data = []
  16. terms = ['supersymmetry', 'supersymmetric', 'susy']
  17. for term in terms:
  18. w = vocab.index(term)
  19. topic_word = p_t_w[:,w]
  20. largest_idx = topic_word.argsort()[-5:][::-1]
  21. for idx in largest_idx:
  22. data.append({
  23. 'term': term,
  24. 'topic': idx,
  25. 'p_t_w': topic_word[idx]
  26. })
  27. pd.DataFrame(data).to_csv('output/supersymmetry_usages.csv')