# get_most_probable_phonemes.py
from collections import Counter, defaultdict
  2. def get_most_probable_phonemes(one_sentence_per_line_file, p=0.007) :
  3. """
  4. Compute the probabilities of phonemes and return the phonemes for
  5. which probabilities > p.
  6. """
  7. counts = defaultdict(int)
  8. for sentence in open(one_sentence_per_line_file) :
  9. sentence = sentence.rstrip()
  10. for word in sentence.split("@") :
  11. for phoneme in word.split("$") :
  12. counts[phoneme] += 1
  13. total = sum(counts.values())
  14. for phoneme in counts :
  15. counts[phoneme] /= total
  16. return [phoneme for phoneme, probability in counts.items() if probability >= p]