# pylint: disable=no-member
"""Helpers to clean orthographic or IPA transcripts of utterances.

The :class:`UtterancesCleaner` defined here cleans utterances by removing
or replacing markers. See the file markers.json to see what kinds of
markers are accounted for.
"""
import re
import string


class UtterancesCleaner:
    """Clean a single utterance by deleting or replacing markers.

    Parameters
    ----------
    - markers : dict
        Mapping that must provide the keys ``"marker_to_delete"``,
        ``"word_contains_delete"`` and ``"poncts_to_delete"``. Each value
        is an iterable of strings; the strings are joined with ``|`` and
        used as regular-expression alternatives, so they must already be
        valid (escaped where needed) regex fragments.

    Raises
    ------
    KeyError
        If one of the three expected keys is missing from ``markers``.
    """

    # Built once: maps every ASCII punctuation character to "deleted".
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, markers: dict):
        self.delete_marker_pattern = '|'.join(markers["marker_to_delete"])
        self.word_contains_delete_pattern = '|'.join(
            markers["word_contains_delete"])
        self.poncts_to_delete_pattern = '|'.join(markers["poncts_to_delete"])
        self.delete_comments_pattern = r"(\(|\<|\*)(.+?)(\)|\>|\*)"
        self.replace_unk_pattern = r"xxx|yyy|www|[0-9]+|\*"
        self.pattern_repetition = re.compile(r"(\s?)([^ ]*)\s\[x (\d+)\]")
        # Historical duplicate of ``pattern_repetition``; kept as an alias
        # so that any external code reading this attribute keeps working.
        self.pattern_letter = self.pattern_repetition

    def replace_marker(self,
                       utterance: str,
                       pattern: str,
                       replacement: str = "∑") -> str:
        """Replace every match of ``pattern`` inside each word.

        The utterance is split on single spaces and the substitution is
        applied word by word, so a match can never span several words.

        Parameters
        ----------
        - utterance : str
            The utterance to process.
        - pattern : str
            Regex pattern describing the markers to replace.
        - replacement : str
            Text that will replace each match.

        Returns
        -------
        str :
            The utterance with the markers replaced.
        """
        # An empty pattern matches at every position: with a non-empty
        # replacement it would be inserted between all characters.
        if not pattern:
            return utterance
        substitute = re.compile(pattern).sub
        return " ".join(substitute(replacement, word)
                        for word in utterance.split(" "))

    def delete_words(self, utterance: str) -> str:
        """Drop the words matched by ``word_contains_delete_pattern``.

        The pattern is tested with ``re.match``, i.e. it is anchored at
        the beginning of each word.

        Parameters
        ----------
        - utterance : str
            The utterance to process.

        Returns
        -------
        str :
            The utterance without the matched words.
        """
        # With no marker configured the joined pattern is empty and would
        # match (and therefore delete) every single word.
        if not self.word_contains_delete_pattern:
            return utterance
        is_deleted = re.compile(self.word_contains_delete_pattern).match
        return " ".join(word for word in utterance.split(" ")
                        if not is_deleted(word))

    def remove_ponctuations(self, utterance: str) -> str:
        """Remove punctuation from a given utterance.

        Parameters
        ----------
        - utterance : str
            The utterance from which the punctuation will be removed.

        Returns
        -------
        str :
            The utterance without any ``string.punctuation`` character.
        """
        return utterance.translate(self._PUNCTUATION_TABLE)

    def remove_brackets(self, utterance: str) -> str:
        """Remove bracketed spans from a given utterance.

        Everything from an opening ``(`` or ``[`` up to the next ``)`` or
        ``]`` (brackets included) is removed.

        Parameters
        ----------
        - utterance : str
            The utterance from which the brackets will be removed.

        Returns
        -------
        str :
            The utterance without brackets.
        """
        return re.sub(r"[\(\[].*?[\)\]]", '', utterance)

    def handle_repetitions(self, utterance: str) -> str:
        """Expand ``word [x N]`` markers into ``N`` copies of ``word``.

        Parameters
        ----------
        - utterance : str
            The utterance that may contain repetition markers.

        Returns
        -------
        str :
            The utterance where each ``word [x N]`` is replaced by the
            word repeated ``N`` times, separated by single spaces.
        """
        while True:
            matched = self.pattern_repetition.search(utterance)
            if not matched:
                break
            separator, word, repetitions = matched.groups()
            repeated_word = separator + ' '.join([word] * int(repetitions))
            # Replace only the first occurrence: the one just matched.
            utterance = utterance.replace(matched.group(0), repeated_word, 1)
        return utterance

    def remove_multiple_spaces(self, utterance: str) -> str:
        """Collapse every run of spaces into a single space.

        Parameters
        ----------
        - utterance : str
            The utterance to process.

        Returns
        -------
        str :
            The utterance with runs of spaces collapsed.
        """
        return re.sub(' +', ' ', utterance)

    def clean(self, utterance: str) -> str:
        """Clean one utterance by deleting or replacing markers.

        The steps run in a fixed order: repetitions are expanded first,
        then markers, words, punctuation markers, comments and
        untranscribed tokens are removed, and finally brackets,
        punctuation and superfluous spaces are stripped.

        Parameters
        ----------
        - utterance : str
            The utterance to clean.

        Returns
        -------
        str :
            The cleaned utterance, without leading or trailing whitespace.
        """
        utterance = self.handle_repetitions(utterance)
        utterance = self.replace_marker(
            utterance, self.delete_marker_pattern, "")
        utterance = self.delete_words(utterance)
        utterance = self.replace_marker(
            utterance, self.poncts_to_delete_pattern, "")
        utterance = self.replace_marker(
            utterance, self.delete_comments_pattern, "")
        # Untranscribed words (xxx, yyy, www), digits and stray asterisks.
        utterance = self.replace_marker(
            utterance, self.replace_unk_pattern, "")
        utterance = self.remove_brackets(utterance)
        utterance = self.remove_ponctuations(utterance)
        utterance = self.remove_multiple_spaces(utterance)
        return utterance.strip()