123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127 |
- # pylint: disable=no-member
- """This module contains an implementation of a class that helps
- to clean orthographic or IPA transcripts of utterances.
- Crucially, this class cleans utterances by removing or replacing
- markers. See the file markers.json to see which kinds of markers are
- accounted for.
- """
- import re
- import string
class UtterancesCleaner:
    """
    Clean transcribed utterances by removing or replacing markers.

    Parameters
    ----------
    - markers : dict
        Mapping that must provide the keys "marker_to_delete",
        "word_contains_delete" and "poncts_to_delete". Each value is an
        iterable of regex fragments (strings); the fragments found under
        one key are joined with "|" into a single alternation pattern.
    """

    # Built once instead of on every call: drops each character listed in
    # string.punctuation.
    _PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

    def __init__(self, markers: dict):
        self.delete_marker_pattern = '|'.join(markers["marker_to_delete"])
        self.word_contains_delete_pattern = '|'.join(markers["word_contains_delete"])
        self.poncts_to_delete_pattern = '|'.join(markers["poncts_to_delete"])
        # A span opened by "(", "<" or "*" and closed by ")", ">" or "*".
        self.delete_comments_pattern = r"(\(|\<|\*)(.+?)(\)|\>|\*)"
        # "xxx", "yyy", "www", runs of digits and "*".
        self.replace_unk_pattern = r"xxx|yyy|www|[0-9]+|\*"
        # Matches "<word> [x N]": optional leading whitespace character,
        # the word (no spaces), and the repetition count N.
        self.pattern_repetition = re.compile(r"(\s?)([^ ]*)\s\[x (\d+)\]")
        # Identical to pattern_repetition and not used inside this class;
        # kept only so that external readers of the attribute keep working.
        self.pattern_letter = self.pattern_repetition

    def replace_marker(self, utterance: str, pattern: str, replacement: str = "∑") -> str:
        """
        Replace, word by word, every match of a pattern by another symbol.

        The utterance is split on single spaces and the substitution is
        applied to each word separately, so a match can never span a space.

        Parameters
        ----------
        - utterance : str
            The utterance to process.
        - pattern : str
            Regex pattern describing the markers to replace.
        - replacement : str
            Symbol that will replace the markers.

        Returns
        -------
        str :
            The utterance with the markers replaced.
        """
        return " ".join(re.sub(pattern, replacement, word)
                        for word in utterance.split(" "))

    def delete_words(self, utterance: str) -> str:
        """
        Delete the words that match `word_contains_delete_pattern`.

        Parameters
        ----------
        - utterance : str
            The utterance to process.

        Returns
        -------
        str :
            The utterance without the matching words.
        """
        pattern = self.word_contains_delete_pattern
        if not pattern:
            # An empty pattern matches every word; without this guard an
            # empty "word_contains_delete" list would erase the utterance.
            return utterance
        # NOTE(review): re.match only tests the beginning of each word,
        # while the attribute name suggests "contains" (re.search).
        # Behaviour is left unchanged; confirm against markers.json.
        return " ".join(word for word in utterance.split(" ")
                        if not re.match(pattern, word))

    def remove_ponctuations(self, utterance: str) -> str:
        """
        Remove punctuation from a given utterance.

        Parameters
        ----------
        - utterance : str
            The utterance from which the punctuation will be removed.

        Returns
        -------
        str :
            The utterance without the characters of string.punctuation.
        """
        return utterance.translate(self._PUNCTUATION_TABLE)

    def remove_brackets(self, utterance: str) -> str:
        """
        Remove bracketed spans from a given utterance.

        A span starts at "(" or "[" and ends at the nearest following ")"
        or "]"; the delimiters and everything between them are removed.

        Parameters
        ----------
        - utterance : str
            The utterance from which the brackets will be removed.

        Returns
        -------
        str :
            The utterance without bracketed spans.
        """
        return re.sub(r"[\(\[].*?[\)\]]", '', utterance)

    def handle_repetitions(self, utterance: str) -> str:
        """
        Expand repetition markers of the form "<word> [x N]".

        Each marker is replaced by the word written N times, separated by
        single spaces.

        Parameters
        ----------
        - utterance : str
            The utterance that may contain repetition markers.

        Returns
        -------
        str :
            The utterance with every repetition marker expanded.
        """
        # Expand one marker at a time and search again from the start: an
        # expansion can leave text that pairs with a following marker.
        while True:
            matched = self.pattern_repetition.search(utterance)
            if not matched:
                break
            separator, word, repetitions = matched.groups()
            repeated_word = separator + ' '.join([word] * int(repetitions))
            utterance = (utterance[:matched.start()]
                         + repeated_word
                         + utterance[matched.end():])
        return utterance

    def remove_multiple_spaces(self, utterance: str) -> str:
        """
        Collapse every run of spaces into a single space.

        Parameters
        ----------
        - utterance : str
            The utterance to normalise.

        Returns
        -------
        str :
            The utterance with runs of spaces collapsed.
        """
        return re.sub(' +', ' ', utterance)

    def clean(self, utterance: str) -> str:
        """
        Clean one utterance by deleting or replacing markers.

        The steps are order-dependent: repetitions are expanded first,
        and spaces are collapsed only once all deletions are done.

        Parameters
        ----------
        - utterance : str
            The utterance to clean.

        Returns
        -------
        str :
            The cleaned utterance, stripped of surrounding whitespace.
        """
        utterance = self.handle_repetitions(utterance)
        utterance = self.replace_marker(utterance, self.delete_marker_pattern, "")
        utterance = self.delete_words(utterance)
        utterance = self.replace_marker(utterance, self.poncts_to_delete_pattern, "")
        utterance = self.replace_marker(utterance, self.delete_comments_pattern, "")
        # For words that were not transcribed.
        utterance = self.replace_marker(utterance, self.replace_unk_pattern, "")
        utterance = self.remove_brackets(utterance)
        utterance = self.remove_ponctuations(utterance)
        utterance = self.remove_multiple_spaces(utterance)
        return utterance.strip()
|