from collections import defaultdict


def get_most_probable_phonemes(one_sentence_per_line_file, p=0.007):
    """Return the phonemes whose relative frequency is at least ``p``.

    The file is read line by line. Trailing whitespace is stripped from each
    line, the line is split into words on ``"@"``, and each word is split
    into phonemes on ``"$"``. A phoneme's probability is its occurrence
    count divided by the total number of tokens counted in the file.

    Note: tokens are counted exactly as ``str.split`` produces them, so a
    blank line or adjacent delimiters contribute the empty string ``""`` as
    a token, which is included in the total like any other phoneme.

    Args:
        one_sentence_per_line_file: Path of the text file to read, one
            sentence per line.
        p: Inclusive probability threshold; a phoneme is returned when its
            probability is ``>= p``.

    Returns:
        A list of the selected phonemes, in order of first appearance in
        the file. Empty when the file contains no lines.
    """
    counts = defaultdict(int)

    # The context manager guarantees the file handle is closed, even if
    # reading raises part-way through.
    with open(one_sentence_per_line_file) as sentences:
        for sentence in sentences:
            for word in sentence.rstrip().split("@"):
                for phoneme in word.split("$"):
                    counts[phoneme] += 1

    total = sum(counts.values())

    # When nothing was counted, `counts` is empty and the comprehension never
    # evaluates the division, so an empty file cannot divide by zero.
    return [phoneme for phoneme, count in counts.items() if count / total >= p]