"""Module that downloads the datasets from CHILDES using pylangacq.""" from typing import List, Generator import os import json import random import pylangacq from tqdm import tqdm import yaml from phonemizer.backend import EspeakBackend from phonemizer.separator import Separator from utterances_cleaner import UtterancesCleaner # from utterances_cleaner_new import clean_transcription import panphon random.seed(80) class DownloadChildCorpora : """ Class that downloads child and adult interactions corpora from the\ childes databases. Atributes --------- - json_markers_file : str Filename containing the markers to manage when cleaning the utterances """ def __init__(self, out_dirname, json_markers_filename: str): # This will help us to tokenize phonemized utterances in words and/or in phonemes self.separator = Separator(phone="$", word="@") # This text file will will contain all the corpora that have failed to be downloaded self.not_downloaded_data = open(f"{out_dirname}/not_downloaded_data.txt", "w", encoding="UTF-8") self.utterances_cleaner = UtterancesCleaner(json.load(open(json_markers_filename, encoding="UTF-8"))) self.features_table = panphon.FeatureTable() def get_segments(self, utterance: str) -> str: """ Function that retrieves phonemic segments of a given utterance. The utterance\ must be in a phonetic form. We use panphon in order to deal with multi-character phonemes. Parameters ---------- - utterance : str The utterance for which we want to get phonemic segments. """ return "@".join("$".join(seg.strip() for seg in self.features_table.ipa_segs(word.strip())\ if seg.strip()) for word in utterance.split() if word.strip()) def participants_data(self, chat, participants_to_consider: List[str], ort_tier, phonemize_child: bool, ) -> Generator: """ Get the data for each participant. Here, the data for each participant\ is the set of utterances produced by this participant at all child ages. Parameters ---------- - chat : Pylangacq class The chat file containing the utterances. - participants_to_consider : list The participants for which we want to get utterances in the chat file. Returns ------- - Iterator: Tuple where the first element is the role of the speaker,\ the second element is the child age (in months) and the last element\ is an utterance produced by the speaker at this child age. """ ages = chat.ages(months=True) participants = { speaker : header["Participants"][speaker]["role"] for header in chat.headers() for speaker in header["Participants"] } for participant in participants: role = participants[participant] if role not in participants_to_consider : continue file_utterances = chat.utterances(by_files=True, participants=participant) if not(phonemize_child) and participant == "CHI" : tiers = ["pho", "%pho", "xpho", "%xpho"] elif ort_tier : tiers = ["ort", "%ort", "xort", "%xort"] else : tiers = [participant] for age, utterances in zip(ages, file_utterances) : utterances = self.get_utterances(utterances, tiers) yield(role, participant, age, [self.utterances_cleaner.clean(utterance) for utterance in utterances]) def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]: """ This function will get utterances of a given list of tiers. A tier\ is an annotation (for example morphology, phonetic, etc) of a given utterance. Parameters ---------- - utterances : list List of pylangacq utterances containing different tiers. 

    def participants_data(self,
                          chat,
                          participants_to_consider: List[str],
                          ort_tier: bool,
                          phonemize_child: bool,
                          ) -> Generator:
        """
        Get the data of each participant. Here, the data of a participant
        is the set of utterances produced by this participant at all child ages.

        Parameters
        ----------
        - chat : pylangacq Reader
            The chat files containing the utterances.
        - participants_to_consider : list
            The roles for which we want to get utterances from the chat files.
        - ort_tier : bool
            Whether to read the utterances from the orthographic tiers.
        - phonemize_child : bool
            Whether the child's utterances will be phonemized automatically;
            if not, they are read from the manual phonetic tiers.

        Returns
        -------
        - Iterator: Tuples where the first element is the role of the speaker,
            the second element is the participant code, the third element is the
            child age (in months) and the last element is the list of cleaned
            utterances produced by this speaker at this child age.
        """
        ages = chat.ages(months=True)
        participants = {
            speaker: header["Participants"][speaker]["role"]
            for header in chat.headers()
            for speaker in header["Participants"]
        }
        for participant in participants:
            role = participants[participant]
            if role not in participants_to_consider:
                continue
            file_utterances = chat.utterances(by_files=True, participants=participant)
            if not phonemize_child and participant == "CHI":
                # Manual phonetic transcriptions of the child's speech.
                tiers = ["pho", "%pho", "xpho", "%xpho"]
            elif ort_tier:
                tiers = ["ort", "%ort", "xort", "%xort"]
            else:
                tiers = [participant]
            for age, utterances in zip(ages, file_utterances):
                utterances = self.get_utterances(utterances, tiers)
                yield (role, participant, age,
                       [self.utterances_cleaner.clean(utterance) for utterance in utterances])

    def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]:
        """
        Get the utterances of a given list of tiers. A tier is an annotation
        (for example morphological or phonetic) of a given utterance.

        Parameters
        ----------
        - utterances : list
            List of pylangacq utterances containing different tiers.
        - tiers : list
            List of tiers to extract from the utterances.

        Returns
        -------
        - list
            Utterances of the given list of tiers.
        """
        str_utterances = []
        for utterance in utterances:
            for tier in tiers:
                if tier in utterance.tiers:
                    str_utterances.append(utterance.tiers[tier])
        return str_utterances

    def get_phonetic_utterances(self,
                                utterances: List[str],
                                participant: str,
                                backend: EspeakBackend,
                                phonemize_child: bool) -> List[str]:
        """
        Get the phonemic representation of a given list of utterances.

        Parameters
        ----------
        - utterances : list
            List of utterances in standard orthography (or already in phonetic
            form for the child when phonemize_child is False).
        - participant : str
            The participant who produced the utterances.
        - backend : EspeakBackend
            The espeak backend of the language of the utterances.
        - phonemize_child : bool
            Whether to get the automatic or the manual phonemization of the
            children's utterances.

        Returns
        -------
        - list
            List of the utterances in phonetic form.
        """
        if not phonemize_child and participant == "CHI":
            # The child's utterances are already phonetic transcriptions:
            # only segment them with panphon instead of running espeak.
            return [self.get_segments(utterance) for utterance in utterances]
        return backend.phonemize(utterances, separator=self.separator, strip=True)

    def download_data(self,
                      language: str,
                      languages_to_download_informations: dict,
                      out_dirname: str,
                      phonemize_child: bool) -> None:
        """
        Download the data of all speakers for a given language.

        Parameters
        ----------
        - language : str
            The language for which to retrieve the data.
        - languages_to_download_informations : dict
            For each language, the participants to consider, the espeak language
            identifier, whether to use the orthographic tier and the URLs of the
            corpora to download.
        - out_dirname : str
            The directory where the downloaded data will be stored.
        - phonemize_child : bool
            Whether to phonemize the child's utterances with espeak.
        """
        print(language, languages_to_download_informations[language]["ort_tier"])
        participants_to_consider = languages_to_download_informations[language]["participants"]
        backend = EspeakBackend(
            language=languages_to_download_informations[language]["espeak_language_id"],
            language_switch="remove-utterance")
        with open(f"{out_dirname}/{language}.one_utterance_per_line",
                  "w", encoding="UTF-8") as downloading_file:
            for url in languages_to_download_informations[language]["urls"]:
                try:
                    chat = pylangacq.read_chat(url)
                    corpus_family = set()
                    for file_path in chat.file_paths():
                        informations = file_path.split("/")
                        if len(informations) < 3:
                            # only the name of the corpus
                            corpus_family.add((informations[0], ""))
                        else:
                            # the name of the corpus and the family
                            corpus_family.add((informations[0], informations[1]))
                except Exception:
                    self.not_downloaded_data.write(f"{url}\n")
                    continue
                for corpus, family in corpus_family:
                    family = family if family else corpus
                    chat_family = chat.filter(match=family)
                    for role, participant, age, utterances in self.participants_data(
                            chat_family,
                            participants_to_consider,
                            ort_tier=languages_to_download_informations[language]["ort_tier"],
                            phonemize_child=phonemize_child):
                        for utterance in self.get_phonetic_utterances(
                                utterances, participant, backend,
                                phonemize_child=phonemize_child):
                            if not utterance:
                                continue
                            family_name = "_".join((corpus, family))
                            downloading_file.write(f"{family_name},{role},{age},{utterance}\n")
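
    # Illustrative note (not from the original source): each line written above
    # follows the pattern "<corpus>_<family>,<role>,<age_in_months>,<utterance>",
    # e.g. something like "Brown_Adam,Mother,27.5,ð$ə@d$ɔ$ɡ" for an English corpus;
    # the actual corpus and family names depend on the downloaded CHAT files.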
""" total = len(languages_to_download_informations) for language in tqdm(languages_to_download_informations, total=total) : self.download_data(language, languages_to_download_informations, out_dirname, phonemize_child=phonemize_child) if __name__ == "__main__" : from argparse import ArgumentParser, BooleanOptionalAction parser = ArgumentParser() parser.add_argument("--yaml_file", help="YAML File containing for each language, all relevant information for downloading the data.", required=True) parser.add_argument("--out_dirname", help="The directory where outputs will be stored.", required=True) parser.add_argument("--markers_json", help="Json markers that serve for cleaning.", required=True) parser.add_argument("--phonemize_child", help="Whether phonemize child utterances or not.", action=BooleanOptionalAction) args = parser.parse_args() phonemize_child_or_not = args.phonemize_child yaml_file = args.yaml_file out_directory_name = args.out_dirname markers_json = args.markers_json if not os.path.exists(out_directory_name): os.makedirs(out_directory_name) loaded_languages_to_download_informations = yaml.safe_load(open(args.yaml_file, encoding="UTF-8")) downloader = DownloadChildCorpora(out_directory_name, markers_json) downloader(loaded_languages_to_download_informations, out_directory_name, phonemize_child_or_not)