utterances_cleaner.py 4.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127
  1. # pylint: disable=no-member
  2. """This module contains an implementation of a class that helps
  3. to clean orthographic or IPA transcripts of utterances.
  4. Crucially, this class cleans utterances by removing or replacing
  5. markers. See the file markers.json to see which kinds of markers are
  6. accounted for.
  7. """
  8. import re
  9. import string
  10. class UtterancesCleaner :
  11. """
  12. """
  13. def __init__(self, markers: dict) :
  14. self.delete_marker_pattern = '|'.join(markers["marker_to_delete"])
  15. self.word_contains_delete_pattern = '|'.join(markers["word_contains_delete"])
  16. self.poncts_to_delete_pattern = '|'.join(markers["poncts_to_delete"])
  17. self.delete_comments_pattern = r"(\(|\<|\*)(.+?)(\)|\>|\*)"
  18. self.replace_unk_pattern = r"xxx|yyy|www|[0-9]+|\*"
  19. self.pattern_letter = re.compile(r"(\s?)([^ ]*)\s\[x (\d+)\]")
  20. self.pattern_repetition = re.compile(r"(\s?)([^ ]*)\s\[x (\d+)\]")
  21. def replace_marker(self, utterance: str, pattern: str, replacement: str="∑") -> list:
  22. """
  23. Method that replace some markers by an other symbol
  24. Parameters
  25. ----------
  26. - utterance : list
  27. list of words utterance
  28. - pattern : list
  29. regex pattern containing markers to delete from the utterance
  30. - replacement :
  31. symbol that will replace markers
  32. """
  33. return " ".join(re.sub(pattern, replacement, word) for word in utterance.split(" "))
  34. def delete_words(self, utterance: str) -> str:
  35. """
  36. Method that delete some words from a given utterance.
  37. Parameters
  38. ----------
  39. - utterance : list
  40. list of words utterance
  41. """
  42. return " ".join(word for word in utterance.split(" ") \
  43. if not re.match(self.word_contains_delete_pattern, word))
  44. def remove_ponctuations(self, utterance: str) -> str :
  45. """
  46. Remove ponctuations from a given utterance.
  47. Parameters
  48. ----------
  49. - utterance : str
  50. The utterance from which the punctuation will be removed.
  51. Returns
  52. -------
  53. str :
  54. The utterance without punctuations.
  55. """
  56. return utterance.translate(str.maketrans('', '', string.punctuation))
  57. def remove_brackets(self, utterance: str) -> str :
  58. """
  59. Remove brackets from a given utterance.
  60. Parameters
  61. ----------
  62. - utterance : str
  63. The utterance from which the brackets will be removed.
  64. Returns
  65. -------
  66. str :
  67. The utterance without brackets.
  68. """
  69. return re.sub(r"[\(\[].*?[\)\]]", '', utterance)
  70. def handle_repetitions(self, utterance: str) -> str:
  71. while True:
  72. matched = re.search(self.pattern_repetition, utterance)
  73. if not matched:
  74. break
  75. all_match = matched.group(0)
  76. separator = matched.group(1)
  77. word, repetitions = matched.group(2),matched.group(3)
  78. repeated_word = '{}{}'.format(separator, ' '.join([word] * int(repetitions)))
  79. utterance = utterance.replace(all_match, repeated_word, 1)
  80. return utterance
  81. def remove_multiple_spaces(self, utterance: str) -> str :
  82. return re.sub(' +', ' ', utterance)
  83. def clean(self, utterance: str) -> str :
  84. """
  85. Method that clean utterances by deleting or replacing /
  86. markers.
  87. Parameters
  88. ----------
  89. - utterances : list
  90. list of utterances to clean
  91. Returns
  92. -------
  93. - generator over cleaned utterances
  94. """
  95. utterance = self.handle_repetitions(utterance)
  96. utterance = self.replace_marker(utterance, self.delete_marker_pattern, "")
  97. utterance = self.delete_words(utterance)
  98. utterance = self.replace_marker(utterance, self.poncts_to_delete_pattern, "")
  99. utterance = self.replace_marker(utterance, self.delete_comments_pattern, "")
  100. utterance = self.replace_marker(utterance, self.replace_unk_pattern, "") # pour mot non retranscrit
  101. utterance = self.remove_brackets(utterance)
  102. utterance = self.remove_ponctuations(utterance)
  103. utterance = self.remove_multiple_spaces(utterance)
  104. utterance = utterance.strip()
  105. return utterance