utterances_cleaner.py 5.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152
  1. """This module contains an implementation of a class that helps
  2. to clean orthographic or IPA transcripts of utterances.
  3. Crucially, this class cleans utterances by removing or replacing
  4. markers. See the file markers.json to see which kinds of markers are
  5. accounted for.
  6. """
  7. import re
  8. import string
  9. class UtterancesCleaner :
  10. """
  11. This class will clean utterances from CHILDES,\
  12. by deleting words, patterns, ponctuation or replacing\
  13. or replacing them by other things.
  14. """
  15. def __init__(self, markers: dict) :
  16. self.delete_marker_pattern = '|'.join(markers["marker_to_delete"])
  17. self.word_contains_delete_pattern = '|'.join(markers["word_contains_delete"])
  18. self.poncts_to_delete_pattern = '|'.join(markers["poncts_to_delete"])
  19. self.delete_comments_pattern = r"(\(|\<|\*)(.+?)(\)|\>|\*)"
  20. self.replace_unk_pattern = r"xxx|yyy|www|[0-9]+|\*"
  21. self.pattern_letter = re.compile(r"(\s?)([^ ]*)\s\[x (\d+)\]")
  22. self.pattern_repetition = re.compile(r"(\s?)([^ ]*)\s\[x (\d+)\]")
  23. def replace_marker(self, utterance: str, pattern: str, replacement: str="∑") -> list:
  24. """
  25. Method that replace some markers by an other symbol
  26. Parameters
  27. ----------
  28. - utterance : str
  29. Utterance from which markers will be replaced
  30. - pattern : str
  31. Regex pattern containing markers to delete from the utterance
  32. - replacement :
  33. Symbol that will replace markers
  34. """
  35. return " ".join(re.sub(pattern, replacement, word) for word in utterance.split(" "))
  36. def delete_words(self, utterance: str) -> str:
  37. """
  38. Method that delete some words from a given utterance.
  39. Parameters
  40. ----------
  41. - utterance : str
  42. Utterance from which those words will be removed
  43. """
  44. return " ".join(word for word in utterance.split(" ") \
  45. if not re.match(self.word_contains_delete_pattern, word))
  46. def remove_ponctuations(self, utterance: str) -> str :
  47. """
  48. Remove ponctuations from a given utterance.
  49. Parameters
  50. ----------
  51. - utterance : str
  52. The utterance from which the punctuation will be removed.
  53. Returns
  54. -------
  55. str :
  56. The utterance without punctuations.
  57. """
  58. return utterance.translate(str.maketrans('', '', string.punctuation))
  59. def remove_brackets(self, utterance: str) -> str :
  60. """
  61. Remove brackets from a given utterance.
  62. Parameters
  63. ----------
  64. - utterance : str
  65. The utterance from which the brackets will be removed.
  66. Returns
  67. -------
  68. str :
  69. The utterance without brackets.
  70. """
  71. return re.sub(r"[\(\[].*?[\)\]]", '', utterance)
  72. def handle_repetitions(self, utterance: str) -> str:
  73. """
  74. This function will repeat n times some units from\
  75. a give utterance.
  76. Parameters
  77. ----------
  78. utterance: str
  79. Utterance from which some units will be repeated.
  80. """
  81. while True:
  82. matched = re.search(self.pattern_repetition, utterance)
  83. if not matched:
  84. break
  85. all_match = matched.group(0)
  86. separator = matched.group(1)
  87. word, repetitions = matched.group(2),matched.group(3)
  88. repeated_word = f"{separator}{' '.join([word] * int(repetitions))}"
  89. utterance = utterance.replace(all_match, repeated_word, 1)
  90. return utterance
  91. def remove_multiple_spaces(self, utterance: str) -> str :
  92. """
  93. Remove multiple spaces from a given utterance.
  94. Parameters
  95. ----------
  96. utterance: str
  97. Utterance from which multiple successive spaces\
  98. will be replaced.
  99. Returns
  100. -------
  101. - str
  102. Utterance without multiple successive spaces.
  103. """
  104. return re.sub(' +', ' ', utterance)
  105. def clean(self, utterance: str) -> str :
  106. """
  107. Method that clean utterances by deleting or replacing /
  108. markers.
  109. Parameters
  110. ----------
  111. - utterances : str
  112. Utterance to clean
  113. Returns
  114. -------
  115. - str
  116. Cleaned utterance
  117. """
  118. utterance = self.handle_repetitions(utterance)
  119. utterance = self.replace_marker(utterance, self.delete_marker_pattern, "")
  120. utterance = self.delete_words(utterance)
  121. utterance = self.replace_marker(utterance, self.poncts_to_delete_pattern, "")
  122. utterance = self.replace_marker(utterance, self.delete_comments_pattern, "")
  123. utterance = self.replace_marker(utterance, self.replace_unk_pattern, "")
  124. utterance = self.remove_brackets(utterance)
  125. utterance = self.remove_ponctuations(utterance)
  126. utterance = self.remove_multiple_spaces(utterance)
  127. utterance = utterance.strip()
  128. return utterance