add script for downloading corpora from childes database

yaya-sy 1 year ago
commit e4d54bd35e
2 changed files with 384 additions and 0 deletions
  1. code/download_childes_corpora.py (+257 -0)
  2. code/utterances_cleaner.py (+127 -0)

+ 257 - 0
code/download_childes_corpora.py

@@ -0,0 +1,257 @@
+"""Module that downloads the datasets from CHILDES using pylangacq."""
+from typing import List, Tuple, Generator
+import os
+import json
+import random
+import pylangacq
+from tqdm import tqdm
+import yaml
+from phonemizer.backend import EspeakBackend
+from phonemizer.separator import Separator
+from utterances_cleaner import UtterancesCleaner
+# from utterances_cleaner_new import clean_transcription
+import panphon
+
+random.seed(80)
+
+class DownloadChildCorpora :
+    """
+    Class that downloads child and adult interaction corpora from the\
+    CHILDES database.
+
+    Attributes
+    ----------
+    - json_markers_filename : str
+        Name of the JSON file containing the markers to handle when cleaning the utterances.
+    """
+
+    def __init__(self, out_dirname, json_markers_filename: str):
+        # This will help us tokenize phonemized utterances into words and/or phonemes
+        self.separator = Separator(phone="$", word="@")
+        # This text file will contain all the corpora that failed to download
+        self.not_downloaded_data = open(f"{out_dirname}/not_downloaded_data.txt", "w",
+                                        encoding="UTF-8")
+        self.utterances_cleaner = UtterancesCleaner(json.load(open(json_markers_filename,
+                                                                    encoding="UTF-8")))
+        self.features_table = panphon.FeatureTable()
+
+    def get_segments(self, utterance: str) -> str:
+        """
+        Function that retrieves the phonemic segments of a given utterance. The utterance\
+        must be in phonetic form.
+        We use panphon in order to deal with multi-character phonemes.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance for which we want to get phonemic segments.
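+
+        Returns
+        -------
+        - str
+            The segments of the utterance, with phonemes joined by "$" and\
+            words joined by "@". For example (illustrative, the exact\
+            segmentation is panphon's), "dɔɡ ɹæn" becomes "d$ɔ$ɡ@ɹ$æ$n".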
+        """
+        return "@".join("$".join(seg.strip()
+                for seg in self.features_table.ipa_segs(word.strip())\
+                    if seg.strip()) for word in utterance.split() if word.strip())
+
+    def participants_data(self,
+                            chat,
+                            participants_to_consider: List[str],
+                            phonemize_child: bool,
+                            ) -> Generator[Tuple[str, str, float, List[str]], None, None]:
+        """
+        Get the data for each participant. Here, the data for each participant\
+        is the set of utterances produced by this participant at all child ages.
+
+        Parameters
+        ----------
+        - chat : Pylangacq class
+            The chat file containing the utterances.
+        - participants_to_consider : list
+            The participants for which we want to get utterances in the chat file.
+        - phonemize_child : bool
+            Whether to phonemize the child's utterances automatically with espeak\
+            or to use the manual phonetic tiers.
+
+        Yields
+        ------
+        - tuple:
+            A tuple whose elements are the role of the speaker, the speaker code,\
+            the child age (in months) and the list of cleaned utterances\
+            produced by this speaker at this child age.
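+            For example (hypothetical values): ("Mother", "MOT", 18.23,\
+            ["do you want the ball"]).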
+        """
+        ages = chat.ages(months=True)
+        participants = {
+            speaker : header["Participants"][speaker]["role"]
+                        for header in chat.headers()
+                            for speaker in header["Participants"]
+            }
+        for participant in participants:
+            role = participants[participant]
+            if role not in participants_to_consider :
+                continue
+            file_utterances = chat.utterances(by_files=True, participants=participant)
+            if not(phonemize_child) and participant == "CHI" :
+                tiers = ["pho", "%pho", "xpho", "%xpho"]
+            else :
+                tiers = [participant]
+            for age, utterances in zip(ages, file_utterances) :
+                utterances = self.get_utterances(utterances, tiers)
+                yield(role,
+                        participant,
+                        age,
+                        [self.utterances_cleaner.clean(utterance) for utterance in utterances])
+
+    def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]:
+        """
+        This function will get the utterances of a given list of tiers. A tier\
+        is an annotation layer (for example morphology or phonetics) of a given utterance.
+
+        Parameters
+        ----------
+        - utterances : list
+            List of pylangacq utterances containing different tiers.
+        - tiers : list
+            List of tiers to extract from utterances
+
+        Returns
+        -------
+        - list
+            The utterances (as strings) of the requested tiers.
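+            For example (illustrative), tiers=["%pho"] returns, for each utterance\
+            that has one, its manual phonetic transcription tier.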
+        """
+
+        str_utterances = []
+        for utterance in utterances :
+            for tier in tiers :
+                if tier in utterance.tiers :
+                    str_utterances.append(utterance.tiers[tier])
+        return str_utterances
+
+    def get_phonetic_utterances(self,
+                                utterances: List[str],
+                                participant: str,
+                                backend: EspeakBackend,
+                                phonemize_child: bool) -> List[str]:
+        """
+        This function will get the phonemic representation of a given list\
+        of utterances.
+
+        Parameters
+        ----------
+        - utterances: list
+            List of utterances in standard orthography.
+        - participant: str
+            The participant who has produced the utterance.
+        - backend: EspeakBackend
+            The espeak backend of the language of the utterance.
+        - phonemize_child: bool
+            Whether to get the automatic or manual phonemization of the children's utterances.
+
+        Returns
+        -------
+        - list:
+            List of the utterances in phonetic form.
+        """
+        if not phonemize_child and participant == "CHI" :
+            phon_utterances = []
+            for utterance in utterances :
+                phon_utterances.append(self.get_segments(utterance))
+            return phon_utterances
+        return backend.phonemize(utterances, separator=self.separator, strip=True)
+
+    def download_data(self,
+                        language: str,
+                        languages_to_download_informations: dict,
+                        out_dirname: str,
+                        phonemize_child: bool) -> None:
+        """
+        Download the data for all speakers for a given language.
+
+        Parameters
+        ----------
+        - language: str
+            The language for which to retrieve the data.
+        - languages_to_download_informations: dict
+            Dictionary that contains, for each language, the information needed to\
+            download the data (URLs, participants, espeak language identifier).
+        - out_dirname: str
+            The directory where the downloaded data will be stored.
+        - phonemize_child: bool
+            Whether to phonemize the children's utterances automatically with espeak\
+            or to use the manual phonetic tiers.
+        """
+        participants_to_consider = languages_to_download_informations[language]["participants"]
+        downloading_file = open(f"{out_dirname}/{language}.one_utterance_per_line", 
+                                "w", encoding="UTF-8")
+        backend = EspeakBackend(language=languages_to_download_informations[language]["espeak_language_id"],
+                                language_switch="remove-utterance")
+        for url in languages_to_download_informations[language]["urls"] :
+            try :
+                chat = pylangacq.read_chat(url)
+                corpus_family = set()
+                for file_path in chat.file_paths() :
+                    informations = file_path.split("/")
+                    if len(informations) < 3 :
+                        # only the name of the corpus
+                        corpus_family.add((informations[0], ""))
+                    else :
+                        # the name of the corpus and the family
+                        corpus_family.add((informations[0], informations[1]))
+            except Exception :
+                self.not_downloaded_data.write(f"{url}\n")
+                continue
+            for corpus, family in corpus_family :
+                family = family if family else corpus
+                chat_family = chat.filter(match=family)
+                for role, participant, age, utterances in self.participants_data(chat_family,
+                                                                participants_to_consider,
+                                                                phonemize_child=phonemize_child) :
+                    for utterance in self.get_phonetic_utterances(utterances,
+                                                                    participant,
+                                                                    backend,
+                                                                    phonemize_child=phonemize_child) :
+                        utterance = self.utterances_cleaner.clean(utterance)
+                        utterance = self.utterances_cleaner.remove_multiple_spaces(utterance)
+                        if not utterance :
+                            continue
+                        family_name = "_".join((corpus, family))
+                        downloading_file.write(f"{family_name},{role},{age},{utterance}\n")
+
+    def __call__(self,
+                languages_to_download_informations: dict,
+                out_dirname: str,
+                phonemize_child: bool) -> None :
+        """
+        Download the data for each language.
+
+        Parameters
+        ----------
+        - languages_to_download_informations : dict
+            The dictionary that contains all relevant information for downloading\
+            the data.
+        - out_dirname : str
+            Directory where the outputs will be stored.
+        - phonemize_child : bool
+            Whether to phonemize the children's utterances automatically or to use\
+            the manual phonetic tiers.
+        """
+        total = len(languages_to_download_informations)
+        for language in tqdm(languages_to_download_informations, total=total) :
+            self.download_data(language,
+                                languages_to_download_informations,
+                                out_dirname,
+                                phonemize_child=phonemize_child)
+
+if __name__ == "__main__" :
+    from argparse import ArgumentParser, BooleanOptionalAction
+    parser = ArgumentParser()
+
+    parser.add_argument("--yaml_file",
+                        help="YAML file containing, for each language, all relevant information for downloading the data.",
+                        required=True)
+    parser.add_argument("--out_dirname",
+                        help="The directory where outputs will be stored.",
+                        required=True)
+    parser.add_argument("--markers_json",
+                        help="JSON file containing the markers used for cleaning the utterances.",
+                        required=True)
+    parser.add_argument("--phonemize_child", action=BooleanOptionalAction)
+    args = parser.parse_args()
+    phonemize_child_or_not = args.phonemize_child
+    yaml_file = args.yaml_file
+    out_directory_name = args.out_dirname
+    markers_json = args.markers_json
+    if not os.path.exists(out_directory_name):
+        os.makedirs(out_directory_name)
+    loaded_languages_to_download_informations = yaml.safe_load(open(args.yaml_file,
+                                                                    encoding="UTF-8"))
+    downloader = DownloadChildCorpora(out_directory_name, markers_json)
+    downloader(loaded_languages_to_download_informations, out_directory_name, phonemize_child_or_not)
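+    # Example invocation (illustrative; the file names below are assumptions):
+    #   python code/download_childes_corpora.py --yaml_file languages.yaml \
+    #       --out_dirname data/childes --markers_json markers.json --phonemize_child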

+ 127 - 0
code/utterances_cleaner.py

@@ -0,0 +1,127 @@
+# pylint: disable=no-member
+"""This module contains an implementation of a class that help /
+    to clean orthographic or IPA transcripts of utterances. /
+    Crucially, this class will clean utterances by removing or replacing /
+    markers. See the file markers.json to see what kinds of markers are /
+    accounted.
+"""
+import re
+import string
+
+
+class UtterancesCleaner :
+    """
+    """
+    def __init__(self, markers: dict) :
+        self.delete_marker_pattern = '|'.join(markers["marker_to_delete"])
+        self.word_contains_delete_pattern = '|'.join(markers["word_contains_delete"])
+        self.poncts_to_delete_pattern = '|'.join(markers["poncts_to_delete"])
+        self.delete_comments_pattern = r"(\(|\<|\*)(.+?)(\)|\>|\*)"
+        self.replace_unk_pattern = r"xxx|yyy|www|[0-9]+|\*"
+        # matches repetition annotations of the form "word [x N]"
+        self.pattern_repetition = re.compile(r"(\s?)([^ ]*)\s\[x (\d+)\]")
+
+    def replace_marker(self, utterance: str, pattern: str, replacement: str="∑") -> str:
+        """
+        Method that replaces some markers with another symbol.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance to process.
+        - pattern : str
+            Regex pattern containing the markers to replace in the utterance.
+        - replacement : str
+            The symbol that will replace the markers.
+        """
+        return " ".join(re.sub(pattern, replacement, word) for word in utterance.split(" "))
+
+    def delete_words(self, utterance: str) -> str:
+        """
+        Method that deletes some words from a given utterance.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance from which the words will be deleted.
+        """
+        return " ".join(word for word in utterance.split(" ") \
+            if not re.match(self.word_contains_delete_pattern, word))
+
+    def remove_ponctuations(self, utterance: str) -> str :
+        """
+        Remove punctuation marks from a given utterance.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance from which the punctuation will be removed.
+        
+        Returns
+        -------
+        str :
+            The utterance without punctuation.
+        """
+        return utterance.translate(str.maketrans('', '', string.punctuation))
+    
+    def remove_brackets(self, utterance: str) -> str :
+        """
+        Remove brackets from a given utterance.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance from which the brackets will be removed.
+        
+        Returns
+        -------
+        str :
+            The utterance without brackets.
+        """
+        return re.sub(r"[\(\[].*?[\)\]]", '', utterance)
+
+    def handle_repetitions(self, utterance: str) -> str:
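+        """
+        Expand repetition annotations of the form "word [x N]" into N\
+        repetitions of the word: for example, "hello [x 3]" becomes\
+        "hello hello hello".
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance whose repetition annotations will be expanded.
+        """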
+        while True:
+            matched = re.search(self.pattern_repetition, utterance)
+
+            if not matched:
+                break
+
+            all_match = matched.group(0)
+            separator = matched.group(1)
+            word, repetitions = matched.group(2),matched.group(3)
+            repeated_word = '{}{}'.format(separator, ' '.join([word] * int(repetitions)))
+
+            utterance = utterance.replace(all_match, repeated_word, 1)
+
+        return utterance
+    
+    def remove_multiple_spaces(self, utterance: str) -> str :
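+        """Collapse runs of consecutive spaces in a given utterance into a single space."""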
+        return re.sub(' +', ' ', utterance)
+
+    def clean(self, utterance: str) -> str :
+        """
+        Method that cleans a given utterance by deleting or replacing\
+        markers.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance to clean.
+
+        Returns
+        -------
+        - str
+            The cleaned utterance.
+        """
+        utterance = self.handle_repetitions(utterance)
+        utterance = self.replace_marker(utterance, self.delete_marker_pattern, "")
+        utterance = self.delete_words(utterance)
+        utterance = self.replace_marker(utterance, self.poncts_to_delete_pattern, "")
+        utterance = self.replace_marker(utterance, self.delete_comments_pattern, "")
+        utterance = self.replace_marker(utterance, self.replace_unk_pattern, "") # for untranscribed words
+        utterance = self.remove_brackets(utterance)
+        utterance = self.remove_ponctuations(utterance)
+        utterance = self.remove_multiple_spaces(utterance)
+        utterance = utterance.strip()
+        return utterance
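+
+# Example usage (a minimal sketch; the marker lists below are hypothetical,
+# the real ones live in markers.json):
+#   cleaner = UtterancesCleaner({"marker_to_delete": ["xxx"],
+#                                "word_contains_delete": ["@o"],
+#                                "poncts_to_delete": ["‡"]})
+#   cleaner.clean("hello [x 2] (laughs) .")  # expected: "hello hello"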