|
@@ -0,0 +1,257 @@
|
|
|
+"""Module that downloads the datasets from CHILDES using pylangacq."""
|
|
|
+from typing import List, Generator
|
|
|
+import os
|
|
|
+import json
|
|
|
+import random
|
|
|
+import pylangacq
|
|
|
+from tqdm import tqdm
|
|
|
+import yaml
|
|
|
+from phonemizer.backend import EspeakBackend
|
|
|
+from phonemizer.separator import Separator
|
|
|
+from utterances_cleaner import UtterancesCleaner
|
|
|
+# from utterances_cleaner_new import clean_transcription
|
|
|
+import panphon
|
|
|
+
|
|
|
+random.seed(80)
|
|
|
+
|
|
|
class DownloadChildCorpora:
    """
    Download child and adult interaction corpora from the CHILDES
    databases and write them as phonemized utterances, one per line.

    Attributes
    ----------
    - separator : Separator
        Phonemizer separator; phonemes are delimited by "$" and words by "@".
    - not_downloaded_data : file object
        Text file, opened for writing in the output directory, that receives
        the URL of every corpus that could not be read.
    - utterances_cleaner : UtterancesCleaner
        Cleaner built from the content of the JSON markers file.
    - features_table : panphon.FeatureTable
        Table used to split phonetic transcriptions into IPA segments.
    """

    def __init__(self, out_dirname, json_markers_filename: str):
        """
        Parameters
        ----------
        - out_dirname : str
            Existing directory in which "not_downloaded_data.txt" is created.
        - json_markers_filename : str
            JSON file containing the markers given to the utterances cleaner.
        """
        # Read the markers through a context manager so the handle is released
        # immediately instead of being left to the garbage collector.
        with open(json_markers_filename, encoding="UTF-8") as markers_file:
            markers = json.load(markers_file)
        self.utterances_cleaner = UtterancesCleaner(markers)
        # This will help us to tokenize phonemized utterances in words and/or in phonemes
        self.separator = Separator(phone="$", word="@")
        self.features_table = panphon.FeatureTable()
        # This text file will contain all the corpora that have failed to be downloaded.
        # It is opened last so that a failure above does not leak an open handle.
        self.not_downloaded_data = open(f"{out_dirname}/not_downloaded_data.txt", "w",
                                        encoding="UTF-8")

    def close(self) -> None:
        """Close the file that lists the corpora that could not be downloaded."""
        self.not_downloaded_data.close()

    @staticmethod
    def _uses_manual_phonetics(participant: str, phonemize_child: bool) -> bool:
        """Tell whether the transcribed phonetic tiers must be used for this participant."""
        return not phonemize_child and participant == "CHI"

    @staticmethod
    def _corpus_families(chat) -> set:
        """Collect the (corpus, family) pairs found in the file paths of a chat reader."""
        families = set()
        for file_path in chat.file_paths():
            informations = file_path.split("/")
            # Fewer than three path components: only the name of the corpus is
            # available; otherwise the second component is the family.
            family = informations[1] if len(informations) >= 3 else ""
            families.add((informations[0], family))
        return families

    def get_segments(self, utterance: str) -> str:
        """
        Retrieve the phonemic segments of a given utterance. The utterance
        must be in a phonetic form.
        We use panphon in order to deal with multi-character phonemes.

        Parameters
        ----------
        - utterance : str
            The utterance for which we want to get phonemic segments.

        Returns
        -------
        - str
            The utterance with its segments joined by "$" inside a word and
            its words joined by "@".
        """
        segmented_words = []
        for word in utterance.split():
            segments = (seg.strip() for seg in self.features_table.ipa_segs(word))
            segmented_words.append("$".join(seg for seg in segments if seg))
        return "@".join(segmented_words)

    def participants_data(self,
                          chat,
                          participants_to_consider: List[str],
                          phonemize_child: bool,
                          ) -> Generator[tuple[str, str, float, List[str]], None, None]:
        """
        Get the data for each participant. Here, the data for each participant
        is the set of utterances produced by this participant at all child ages.

        Parameters
        ----------
        - chat : Pylangacq class
            The chat file containing the utterances.
        - participants_to_consider : list
            The roles for which we want to get utterances in the chat file.
        - phonemize_child : bool
            When false, the utterances of the "CHI" participant are read from
            the phonetic tiers ("pho", "%pho", "xpho", "%xpho") instead of the
            participant's own tier.

        Returns
        -------
        - Iterator:
            Tuples (role, participant, age, utterances): the role of the
            speaker, the participant code, the child age (in months) of one
            file and the cleaned utterances produced by the speaker in it.
        """
        ages = chat.ages(months=True)
        participants = {
            speaker: header["Participants"][speaker]["role"]
            for header in chat.headers()
            for speaker in header["Participants"]
        }
        for participant, role in participants.items():
            if role not in participants_to_consider:
                continue
            file_utterances = chat.utterances(by_files=True, participants=participant)
            if self._uses_manual_phonetics(participant, phonemize_child):
                tiers = ["pho", "%pho", "xpho", "%xpho"]
            else:
                tiers = [participant]
            # Ages and per-file utterances are paired positionally, one entry per file.
            for age, utterances in zip(ages, file_utterances):
                utterances = self.get_utterances(utterances, tiers)
                yield (role,
                       participant,
                       age,
                       [self.utterances_cleaner.clean(utterance) for utterance in utterances])

    def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]:
        """
        Get the utterances of a given list of tiers. A tier is an
        annotation (for example morphology, phonetic, etc) of a given utterance.

        Parameters
        ----------
        - utterances : list
            List of pylangacq utterances containing different tiers.
        - tiers : list
            List of tiers to extract from utterances

        Returns
        -------
        - list
            Content of every requested tier that is present, in utterance
            order then in tier order.
        """
        return [utterance.tiers[tier]
                for utterance in utterances
                for tier in tiers
                if tier in utterance.tiers]

    def get_phonetic_utterances(self,
                                utterances: List[str],
                                participant: str,
                                backend: "EspeakBackend",
                                phonemize_child: bool) -> List[str]:
        """
        Get the phonemic representation of a given list of utterances.

        Parameters
        ----------
        - utterances: list
            List of utterances: phonetic transcriptions when the manual
            phonetic tiers are used, standard orthography otherwise.
        - participant: str
            The participant who has produced the utterance.
        - backend: EspeakBackend
            The espeak backend of the language of the utterance.
        - phonemize_child: bool
            Whether to get the automatic or manual phonemization of the children's utterances.

        Returns
        -------
        - list:
            List of the utterances in phonetic form.
        """
        if self._uses_manual_phonetics(participant, phonemize_child):
            return [self.get_segments(utterance) for utterance in utterances]
        return backend.phonemize(utterances, separator=self.separator, strip=True)

    def download_data(self,
                      language: str,
                      languages_to_download_informations: dict,
                      out_dirname: str,
                      phonemize_child: bool) -> None:
        """
        Download data for all speaker for a given language.

        Each kept utterance is written as a line
        "<corpus>_<family>,<role>,<age>,<utterance>" in the file
        "<out_dirname>/<language>.one_utterance_per_line".

        Parameters
        ----------
        - language: str
            The language for which to retrieve the data
        - languages_to_download_informations: dict
            Maps each language to a dictionary with the keys "participants"
            (roles to keep), "espeak_language_id" and "urls" (corpora to read).
        - out_dirname: str
            The directory where the downloaded data will be stored.
        - phonemize_child: bool
            Whether to get the automatic or manual phonemization of the children's utterances.
        """
        language_informations = languages_to_download_informations[language]
        participants_to_consider = language_informations["participants"]
        # The context manager guarantees that the output is flushed and closed,
        # even when an error interrupts the download.
        with open(f"{out_dirname}/{language}.one_utterance_per_line",
                  "w", encoding="UTF-8") as downloading_file:
            backend = EspeakBackend(language=language_informations["espeak_language_id"],
                                    language_switch="remove-utterance")
            for url in language_informations["urls"]:
                try:
                    chat = pylangacq.read_chat(url)
                    corpus_family = self._corpus_families(chat)
                except Exception:
                    # Best effort: remember the corpus that failed and go on with
                    # the next one. `Exception` (rather than a bare except) lets
                    # KeyboardInterrupt and SystemExit stop the program.
                    self.not_downloaded_data.write(f"{url}\n")
                    self.not_downloaded_data.flush()
                    continue
                # Sorted so that the order of the output lines does not depend
                # on the iteration order of a set.
                for corpus, family in sorted(corpus_family):
                    family = family if family else corpus
                    family_name = "_".join((corpus, family))
                    chat_family = chat.filter(match=family)
                    for role, participant, age, utterances in self.participants_data(
                            chat_family,
                            participants_to_consider,
                            phonemize_child=phonemize_child):
                        for utterance in self.get_phonetic_utterances(
                                utterances,
                                participant,
                                backend,
                                phonemize_child=phonemize_child):
                            utterance = self.utterances_cleaner.clean(utterance)
                            utterance = self.utterances_cleaner.remove_multiple_spaces(utterance)
                            if not utterance:
                                continue
                            downloading_file.write(f"{family_name},{role},{age},{utterance}\n")

    def __call__(self,
                 languages_to_download_informations: dict,
                 out_dirname: str,
                 phonemize_child: bool) -> None:
        """
        Download the data for each language.

        Parameters
        ----------
        - languages_to_download_informations : dict
            The dictionary that contains all relevant informations for downloading
            the data.
        - out_dirname : str
            Directory where the outputs will be stored.
        - phonemize_child : bool
            Whether to get the automatic or manual phonemization of the children's utterances.
        """
        total = len(languages_to_download_informations)
        for language in tqdm(languages_to_download_informations, total=total):
            self.download_data(language,
                               languages_to_download_informations,
                               out_dirname,
                               phonemize_child=phonemize_child)
|
|
|
+
|
|
|
if __name__ == "__main__":
    # Command-line entry point: read the YAML description of the languages to
    # download, then run the downloader on every language it lists.
    from argparse import ArgumentParser, BooleanOptionalAction

    parser = ArgumentParser()
    parser.add_argument("--yaml_file",
                        help="YAML File containing for each language, all relevant information for downloading the data.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    parser.add_argument("--markers_json",
                        help="Json markers that serve for cleaning.",
                        required=True)
    parser.add_argument("--phonemize_child",
                        action=BooleanOptionalAction,
                        default=False,
                        help="Phonemize the utterances of the CHI participant with espeak "
                             "instead of using the transcribed phonetic tiers.")
    args = parser.parse_args()

    # exist_ok avoids the race between checking for the directory and creating it.
    os.makedirs(args.out_dirname, exist_ok=True)

    # Load the configuration through a context manager so the file is closed.
    with open(args.yaml_file, encoding="UTF-8") as yaml_stream:
        loaded_languages_to_download_informations = yaml.safe_load(yaml_stream)

    downloader = DownloadChildCorpora(args.out_dirname, args.markers_json)
    try:
        downloader(loaded_languages_to_download_informations,
                   args.out_dirname,
                   args.phonemize_child)
    finally:
        # Make sure the list of corpora that failed to download reaches the disk.
        downloader.not_downloaded_data.close()
|