add script for downloading corpora from childes database

yaya-sy 1 year ago
commit e4d54bd35e
2 changed files with 384 additions and 0 deletions
  1. code/download_childes_corpora.py (+257 -0)
  2. code/utterances_cleaner.py (+127 -0)

+ 257 - 0
code/download_childes_corpora.py

@@ -0,0 +1,257 @@
+"""Module that downloads the datasets from CHILDES using pylangacq."""
+from typing import List, Tuple, Generator
+import os
+import json
+import random
+import pylangacq
+from tqdm import tqdm
+import yaml
+from phonemizer.backend import EspeakBackend
+from phonemizer.separator import Separator
+from utterances_cleaner import UtterancesCleaner
+# from utterances_cleaner_new import clean_transcription
+import panphon
+
+random.seed(80)
+
+class DownloadChildCorpora :
+    """
+    Class that downloads child and adult interaction corpora from the\
+    CHILDES database.
+
+    Attributes
+    ----------
+    - json_markers_filename : str
+        Name of the JSON file containing the markers to handle when cleaning the utterances.
+    """
+
+    def __init__(self, out_dirname, json_markers_filename: str):
+        # This will help us tokenize phonemized utterances into words and/or phonemes
+        self.separator = Separator(phone="$", word="@")
+        # This text file will contain all the corpora that failed to download
+        self.not_downloaded_data = open(f"{out_dirname}/not_downloaded_data.txt", "w",
+                                        encoding="UTF-8")
+        self.utterances_cleaner = UtterancesCleaner(json.load(open(json_markers_filename,
+                                                                    encoding="UTF-8")))
+        self.features_table = panphon.FeatureTable()
+
+    def get_segments(self, utterance: str) -> str:
+        """
+        Function that retrieves the phonemic segments of a given utterance. The utterance\
+        must be in phonetic form.
+        We use panphon in order to deal with multi-character phonemes.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance for which we want to get phonemic segments.
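+
+        Returns
+        -------
+        - str
+            The segments of the utterance, with phonemes joined by "$" and\
+            words joined by "@". For example (illustrative, the exact\
+            segmentation is panphon's), "dɔɡ ɹæn" becomes "d$ɔ$ɡ@ɹ$æ$n".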
+        """
+        return "@".join("$".join(seg.strip()
+                for seg in self.features_table.ipa_segs(word.strip())\
+                    if seg.strip()) for word in utterance.split() if word.strip())
+
+    def participants_data(self,
+                            chat,
+                            participants_to_consider: List[str],
+                            phonemize_child: bool,
+                            ) -> Generator[Tuple[str, str, float, List[str]], None, None]:
+        """
+        Get the data for each participant. Here, the data for each participant\
+        is the set of utterances produced by this participant at all child ages.
+
+        Parameters
+        ----------
+        - chat : Pylangacq class
+            The chat file containing the utterances.
+        - participants_to_consider : list
+            The participants for which we want to get utterances in the chat file.
+        - phonemize_child : bool
+            Whether to phonemize the child's utterances automatically with espeak\
+            or to use the manual phonetic tiers.
+
+        Yields
+        ------
+        - tuple:
+            A tuple whose elements are the role of the speaker, the speaker code,\
+            the child age (in months) and the list of cleaned utterances\
+            produced by this speaker at this child age.
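+            For example (hypothetical values): ("Mother", "MOT", 18.23,\
+            ["do you want the ball"]).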
+        """
+        ages = chat.ages(months=True)
+        participants = {
+            speaker : header["Participants"][speaker]["role"]
+                        for header in chat.headers()
+                            for speaker in header["Participants"]
+            }
+        for participant in participants:
+            role = participants[participant]
+            if role not in participants_to_consider :
+                continue
+            file_utterances = chat.utterances(by_files=True, participants=participant)
+            if not(phonemize_child) and participant == "CHI" :
+                tiers = ["pho", "%pho", "xpho", "%xpho"]
+            else :
+                tiers = [participant]
+            for age, utterances in zip(ages, file_utterances) :
+                utterances = self.get_utterances(utterances, tiers)
+                yield(role,
+                        participant,
+                        age,
+                        [self.utterances_cleaner.clean(utterance) for utterance in utterances])
+
+    def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]:
+        """
+        This function will get the utterances of a given list of tiers. A tier\
+        is an annotation layer (for example morphology or phonetics) of a given utterance.
+
+        Parameters
+        ----------
+        - utterances : list
+            List of pylangacq utterances containing different tiers.
+        - tiers : list
+            List of tiers to extract from utterances
+
+        Returns
+        -------
+        - list
+            The utterances (as strings) of the requested tiers.
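+            For example (illustrative), tiers=["%pho"] returns, for each utterance\
+            that has one, its manual phonetic transcription tier.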
+        """
+
+        str_utterances = []
+        for utterance in utterances :
+            for tier in tiers :
+                if tier in utterance.tiers :
+                    str_utterances.append(utterance.tiers[tier])
+        return str_utterances
+
+    def get_phonetic_utterances(self,
+                                utterances: List[str],
+                                participant: str,
+                                backend: EspeakBackend,
+                                phonemize_child: bool) -> List[str]:
+        """
+        This function will get the phonemic representation of a given list\
+        of utterances.
+
+        Parameters
+        ----------
+        - utterances: list
+            List of utterances in standard orthography.
+        - participant: str
+            The participant who has produced the utterance.
+        - backend: EspeakBackend
+            The espeak backend of the language of the utterance.
+        - phonemize_child: bool
+            Whether to get the automatic or manual phonemization of the children's utterances.
+
+        Returns
+        -------
+        - list:
+            List of the utterances in phonetic form.
+        """
+        if not phonemize_child and participant == "CHI" :
+            phon_utterances = []
+            for utterance in utterances :
+                phon_utterances.append(self.get_segments(utterance))
+            return phon_utterances
+        return backend.phonemize(utterances, separator=self.separator, strip=True)
+
+    def download_data(self,
+                        language: str,
+                        languages_to_download_informations: dict,
+                        out_dirname: str,
+                        phonemize_child: bool) -> None:
+        """
+        Download the data for all speakers for a given language.
+
+        Parameters
+        ----------
+        - language: str
+            The language for which to retrieve the data.
+        - languages_to_download_informations: dict
+            Dictionary that contains, for each language, the information needed to\
+            download the data (URLs, participants, espeak language identifier).
+        - out_dirname: str
+            The directory where the downloaded data will be stored.
+        - phonemize_child: bool
+            Whether to phonemize the children's utterances automatically with espeak\
+            or to use the manual phonetic tiers.
+        """
+        participants_to_consider = languages_to_download_informations[language]["participants"]
+        downloading_file = open(f"{out_dirname}/{language}.one_utterance_per_line", 
+                                "w", encoding="UTF-8")
+        backend = EspeakBackend(language=languages_to_download_informations[language]["espeak_language_id"],
+                                language_switch="remove-utterance")
+        for url in languages_to_download_informations[language]["urls"] :
+            try :
+                chat = pylangacq.read_chat(url)
+                corpus_family = set()
+                for file_path in chat.file_paths() :
+                    informations = file_path.split("/")
+                    if len(informations) < 3 :
+                        # only the name of the corpus
+                        corpus_family.add((informations[0], ""))
+                    else :
+                        # the name of the corpus and the family
+                        corpus_family.add((informations[0], informations[1]))
+            except Exception :
+                self.not_downloaded_data.write(f"{url}\n")
+                continue
+            for corpus, family in corpus_family :
+                family = family if family else corpus
+                chat_family = chat.filter(match=family)
+                for role, participant, age, utterances in self.participants_data(chat_family,
+                                                                participants_to_consider,
+                                                                phonemize_child=phonemize_child) :
+                    for utterance in self.get_phonetic_utterances(utterances,
+                                                                    participant,
+                                                                    backend,
+                                                                    phonemize_child=phonemize_child) :
+                        utterance = self.utterances_cleaner.clean(utterance)
+                        utterance = self.utterances_cleaner.remove_multiple_spaces(utterance)
+                        if not utterance :
+                            continue
+                        family_name = "_".join((corpus, family))
+                        downloading_file.write(f"{family_name},{role},{age},{utterance}\n")
+
+    def __call__(self,
+                languages_to_download_informations: dict,
+                out_dirname: str,
+                phonemize_child: bool) -> None :
+        """
+        Download the data for each language.
+
+        Parameters
+        ----------
+        - languages_to_download_informations : dict
+            The dictionary that contains all relevant information for downloading\
+            the data.
+        - out_dirname : str
+            Directory where the outputs will be stored.
+        - phonemize_child : bool
+            Whether to phonemize the children's utterances automatically or to use\
+            the manual phonetic tiers.
+        """
+        total = len(languages_to_download_informations)
+        for language in tqdm(languages_to_download_informations, total=total) :
+            self.download_data(language,
+                                languages_to_download_informations,
+                                out_dirname,
+                                phonemize_child=phonemize_child)
+
+if __name__ == "__main__" :
+    from argparse import ArgumentParser, BooleanOptionalAction
+    parser = ArgumentParser()
+
+    parser.add_argument("--yaml_file",
+                        help="YAML file containing, for each language, all relevant information for downloading the data.",
+                        required=True)
+    parser.add_argument("--out_dirname",
+                        help="The directory where outputs will be stored.",
+                        required=True)
+    parser.add_argument("--markers_json",
+                        help="JSON file containing the markers used for cleaning the utterances.",
+                        required=True)
+    parser.add_argument("--phonemize_child", action=BooleanOptionalAction)
+    args = parser.parse_args()
+    phonemize_child_or_not = args.phonemize_child
+    yaml_file = args.yaml_file
+    out_directory_name = args.out_dirname
+    markers_json = args.markers_json
+    if not os.path.exists(out_directory_name):
+        os.makedirs(out_directory_name)
+    loaded_languages_to_download_informations = yaml.safe_load(open(args.yaml_file,
+                                                                    encoding="UTF-8"))
+    downloader = DownloadChildCorpora(out_directory_name, markers_json)
+    downloader(loaded_languages_to_download_informations, out_directory_name, phonemize_child_or_not)
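+    # Example invocation (illustrative; the file names below are assumptions):
+    #   python code/download_childes_corpora.py --yaml_file languages.yaml \
+    #       --out_dirname data/childes --markers_json markers.json --phonemize_child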

+ 127 - 0
code/utterances_cleaner.py

@@ -0,0 +1,127 @@
+# pylint: disable=no-member
+"""This module contains an implementation of a class that help /
+    to clean orthographic or IPA transcripts of utterances. /
+    Crucially, this class will clean utterances by removing or replacing /
+    markers. See the file markers.json to see what kinds of markers are /
+    accounted.
+"""
+import re
+import string
+
+
+class UtterancesCleaner :
+    """
+    """
+    def __init__(self, markers: dict) :
+        self.delete_marker_pattern = '|'.join(markers["marker_to_delete"])
+        self.word_contains_delete_pattern = '|'.join(markers["word_contains_delete"])
+        self.poncts_to_delete_pattern = '|'.join(markers["poncts_to_delete"])
+        self.delete_comments_pattern = r"(\(|\<|\*)(.+?)(\)|\>|\*)"
+        self.replace_unk_pattern = r"xxx|yyy|www|[0-9]+|\*"
+        # matches repetition annotations of the form "word [x N]"
+        self.pattern_repetition = re.compile(r"(\s?)([^ ]*)\s\[x (\d+)\]")
+
+    def replace_marker(self, utterance: str, pattern: str, replacement: str="∑") -> str:
+        """
+        Method that replaces some markers with another symbol.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance to process.
+        - pattern : str
+            Regex pattern containing the markers to replace in the utterance.
+        - replacement : str
+            The symbol that will replace the markers.
+        """
+        return " ".join(re.sub(pattern, replacement, word) for word in utterance.split(" "))
+
+    def delete_words(self, utterance: str) -> str:
+        """
+        Method that deletes some words from a given utterance.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance from which the words will be deleted.
+        """
+        return " ".join(word for word in utterance.split(" ") \
+            if not re.match(self.word_contains_delete_pattern, word))
+
+    def remove_ponctuations(self, utterance: str) -> str :
+        """
+        Remove punctuation marks from a given utterance.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance from which the punctuation will be removed.
+        
+        Returns
+        -------
+        str :
+            The utterance without punctuation.
+        """
+        return utterance.translate(str.maketrans('', '', string.punctuation))
+    
+    def remove_brackets(self, utterance: str) -> str :
+        """
+        Remove brackets from a given utterance.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance from which the brackets will be removed.
+        
+        Returns
+        -------
+        str :
+            The utterance without brackets.
+        """
+        return re.sub(r"[\(\[].*?[\)\]]", '', utterance)
+
+    def handle_repetitions(self, utterance: str) -> str:
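+        """
+        Expand repetition annotations of the form "word [x N]" into N\
+        repetitions of the word: for example, "hello [x 3]" becomes\
+        "hello hello hello".
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance whose repetition annotations will be expanded.
+        """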
+        while True:
+            matched = re.search(self.pattern_repetition, utterance)
+
+            if not matched:
+                break
+
+            all_match = matched.group(0)
+            separator = matched.group(1)
+            word, repetitions = matched.group(2),matched.group(3)
+            repeated_word = '{}{}'.format(separator, ' '.join([word] * int(repetitions)))
+
+            utterance = utterance.replace(all_match, repeated_word, 1)
+
+        return utterance
+    
+    def remove_multiple_spaces(self, utterance: str) -> str :
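+        """Collapse runs of consecutive spaces in a given utterance into a single space."""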
+        return re.sub(' +', ' ', utterance)
+
+    def clean(self, utterance: str) -> str :
+        """
+        Method that cleans a given utterance by deleting or replacing\
+        markers.
+
+        Parameters
+        ----------
+        - utterance : str
+            The utterance to clean.
+
+        Returns
+        -------
+        - str
+            The cleaned utterance.
+        """
+        utterance = self.handle_repetitions(utterance)
+        utterance = self.replace_marker(utterance, self.delete_marker_pattern, "")
+        utterance = self.delete_words(utterance)
+        utterance = self.replace_marker(utterance, self.poncts_to_delete_pattern, "")
+        utterance = self.replace_marker(utterance, self.delete_comments_pattern, "")
+        utterance = self.replace_marker(utterance, self.replace_unk_pattern, "") # for untranscribed words
+        utterance = self.remove_brackets(utterance)
+        utterance = self.remove_ponctuations(utterance)
+        utterance = self.remove_multiple_spaces(utterance)
+        utterance = utterance.strip()
+        return utterance
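+
+# Example usage (a minimal sketch; the marker lists below are hypothetical,
+# the real ones live in markers.json):
+#   cleaner = UtterancesCleaner({"marker_to_delete": ["xxx"],
+#                                "word_contains_delete": ["@o"],
+#                                "poncts_to_delete": ["‡"]})
+#   cleaner.clean("hello [x 2] (laughs) .")  # expected: "hello hello"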