123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264 |
- """Module that downloads the datasets from CHILDES using pylangacq."""
- from typing import List, Generator
- import os
- import json
- import random
- import pylangacq
- from tqdm import tqdm
- import yaml
- from phonemizer.backend import EspeakBackend
- from phonemizer.separator import Separator
- from utterances_cleaner import UtterancesCleaner
- # from utterances_cleaner_new import clean_transcription
- import panphon
- random.seed(80)
class DownloadChildCorpora:
    """
    Download child and adult interaction corpora from the CHILDES databases.

    Attributes
    ----------
    - separator : Separator
        Phonemizer separator: "$" between phones, "@" between words, so that
        phonemized utterances can later be re-tokenized into words and phonemes.
    - not_downloaded_data : TextIO
        Text file listing every corpus URL that failed to download.
    - utterances_cleaner : UtterancesCleaner
        Cleaner configured from the JSON markers file.
    - features_table : panphon.FeatureTable
        Splits IPA strings into phonemic segments (handles multi-character
        phonemes).
    """

    def __init__(self, out_dirname: str, json_markers_filename: str):
        """
        Parameters
        ----------
        - out_dirname : str
            Directory where the "not downloaded" report is written.
        - json_markers_filename : str
            Filename containing the markers to manage when cleaning the
            utterances.
        """
        self.separator = Separator(phone="$", word="@")
        # Kept open for the object's lifetime: download_data() appends to it
        # every time a corpus fails to download.
        self.not_downloaded_data = open(f"{out_dirname}/not_downloaded_data.txt",
                                        "w", encoding="UTF-8")
        # Close the markers file as soon as it is parsed (the original
        # version leaked this handle).
        with open(json_markers_filename, encoding="UTF-8") as markers_file:
            self.utterances_cleaner = UtterancesCleaner(json.load(markers_file))
        self.features_table = panphon.FeatureTable()

    def get_segments(self, utterance: str) -> str:
        """
        Retrieve the phonemic segments of an utterance already in phonetic form.

        panphon is used so that multi-character phonemes are kept as single
        segments. Words are joined with "@" and segments with "$", matching the
        phonemizer separator used elsewhere in this class.

        Parameters
        ----------
        - utterance : str
            The utterance for which we want to get phonemic segments.

        Returns
        -------
        - str
            "@"-separated words, each a "$"-separated sequence of segments.
        """
        return "@".join(
            "$".join(seg.strip()
                     for seg in self.features_table.ipa_segs(word.strip())
                     if seg.strip())
            for word in utterance.split() if word.strip())

    def participants_data(self,
                          chat,
                          participants_to_consider: List[str],
                          ort_tier: bool,
                          phonemize_child: bool,
                          ) -> Generator:
        """
        Yield, for each relevant participant, the utterances produced at each
        child age.

        Parameters
        ----------
        - chat : pylangacq Reader
            The chat files containing the utterances.
        - participants_to_consider : list
            Roles for which we want to extract utterances.
        - ort_tier : bool
            Whether utterances should be read from the "ort" tiers.
        - phonemize_child : bool
            When False, the child's utterances are taken from the manual
            phonetic ("pho") tiers instead of the default tier.

        Yields
        ------
        - tuple
            (speaker role, participant code, child age in months,
            cleaned utterances produced by that speaker at that age).
        """
        ages = chat.ages(months=True)
        participants = {
            speaker: header["Participants"][speaker]["role"]
            for header in chat.headers()
            for speaker in header["Participants"]
        }
        for participant, role in participants.items():
            if role not in participants_to_consider:
                continue
            file_utterances = chat.utterances(by_files=True, participants=participant)
            if not phonemize_child and participant == "CHI":
                # Manual phonetic transcriptions of the child.
                tiers = ["pho", "%pho", "xpho", "%xpho"]
            elif ort_tier:
                tiers = ["ort", "%ort", "xort", "%xort"]
            else:
                tiers = [participant]
            for age, utterances in zip(ages, file_utterances):
                utterances = self.get_utterances(utterances, tiers)
                yield (role,
                       participant,
                       age,
                       [self.utterances_cleaner.clean(utterance)
                        for utterance in utterances])

    def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]:
        """
        Extract the string utterances of a given list of tiers.

        A tier is an annotation layer (for example morphology, phonetics)
        attached to an utterance.

        Parameters
        ----------
        - utterances : list
            pylangacq utterances, each exposing a ``tiers`` mapping.
        - tiers : list
            List of tiers to extract from the utterances.

        Returns
        -------
        - list
            Utterances found in the requested tiers, in input order.
        """
        return [utterance.tiers[tier]
                for utterance in utterances
                for tier in tiers
                if tier in utterance.tiers]

    def get_phonetic_utterances(self,
                                utterances: List[str],
                                participant: str,
                                backend: "EspeakBackend",
                                phonemize_child: bool) -> List[str]:
        """
        Get the phonemic representation of a given list of utterances.

        Parameters
        ----------
        - utterances : list
            Utterances in standard orthography (or already in phonetic form
            for the child when phonemize_child is False).
        - participant : str
            The participant who has produced the utterances.
        - backend : EspeakBackend
            The espeak backend of the language of the utterances.
        - phonemize_child : bool
            Whether to get the automatic or manual phonemization of the
            children's utterances.

        Returns
        -------
        - list
            Utterances in phonetic form ("@" between words, "$" between phones).
        """
        if not phonemize_child and participant == "CHI":
            # Manual transcriptions are already phonetic: only segment them.
            return [self.get_segments(utterance) for utterance in utterances]
        return backend.phonemize(utterances, separator=self.separator, strip=True)

    def download_data(self,
                      language: str,
                      languages_to_download_informations: dict,
                      out_dirname: str,
                      phonemize_child: bool) -> None:
        """
        Download the data of every speaker for a given language.

        Writes one line per utterance, formatted as
        ``family,role,age,utterance``, into
        ``{out_dirname}/{language}.one_utterance_per_line``. URLs that fail to
        download are logged in ``not_downloaded_data.txt``.

        Parameters
        ----------
        - language : str
            The language for which to retrieve the data.
        - languages_to_download_informations : dict
            Per-language download configuration (urls, participants,
            espeak_language_id, ort_tier).
        - out_dirname : str
            The directory where the downloaded data will be stored.
        - phonemize_child : bool
            Whether to phonemize the child's utterances automatically.
        """
        language_informations = languages_to_download_informations[language]
        participants_to_consider = language_informations["participants"]
        backend = EspeakBackend(language=language_informations["espeak_language_id"],
                                language_switch="remove-utterance")
        with open(f"{out_dirname}/{language}.one_utterance_per_line",
                  "w", encoding="UTF-8") as downloading_file:
            for url in language_informations["urls"]:
                try:
                    chat = pylangacq.read_chat(url)
                    corpus_family = set()
                    for file_path in chat.file_paths():
                        informations = file_path.split("/")
                        if len(informations) < 3:
                            # Only the name of the corpus is available.
                            corpus_family.add((informations[0], ""))
                        else:
                            # Both the corpus name and the family.
                            corpus_family.add((informations[0], informations[1]))
                # Any failure (network, parsing, ...) is logged and the URL
                # skipped, so one bad corpus does not abort the whole language.
                # (Narrowed from a bare ``except:`` so Ctrl-C still works.)
                except Exception:
                    self.not_downloaded_data.write(f"{url}\n")
                    continue
                for corpus, family in corpus_family:
                    family = family if family else corpus
                    chat_family = chat.filter(match=family)
                    for role, participant, age, utterances in self.participants_data(
                            chat_family,
                            participants_to_consider,
                            ort_tier=language_informations["ort_tier"],
                            phonemize_child=phonemize_child):
                        for utterance in self.get_phonetic_utterances(
                                utterances,
                                participant,
                                backend,
                                phonemize_child=phonemize_child):
                            # The separators are only needed for tokenization;
                            # the output stores space-separated tokens.
                            utterance = utterance.replace("$", " ").replace("@", " ")
                            utterance = self.utterances_cleaner.remove_multiple_spaces(utterance)
                            utterance = utterance.strip()
                            if not utterance:
                                continue
                            family_name = "_".join((corpus, family))
                            downloading_file.write(f"{family_name},{role},{age},{utterance}\n")

    def __call__(self,
                 languages_to_download_informations: dict,
                 out_dirname: str,
                 phonemize_child: bool) -> None:
        """
        Download the data for each language.

        Parameters
        ----------
        - languages_to_download_informations : dict
            The dictionary that contains all relevant informations for
            downloading the data, per language.
        - out_dirname : str
            Directory where the outputs will be stored.
        - phonemize_child : bool
            Whether to phonemize the child's utterances automatically.
        """
        total = len(languages_to_download_informations)
        for language in tqdm(languages_to_download_informations, total=total):
            self.download_data(language,
                               languages_to_download_informations,
                               out_dirname,
                               phonemize_child=phonemize_child)
- if __name__ == "__main__" :
- from argparse import ArgumentParser, BooleanOptionalAction
- parser = ArgumentParser()
- parser.add_argument("--yaml_file",
- help="YAML File containing for each language, all relevant information for downloading the data.",
- required=True)
- parser.add_argument("--out_dirname",
- help="The directory where outputs will be stored.",
- required=True)
- parser.add_argument("--markers_json",
- help="Json markers that serve for cleaning.",
- required=True)
- parser.add_argument("--phonemize_child",
- help="Whether phonemize child utterances or not.",
- action=BooleanOptionalAction)
- args = parser.parse_args()
- phonemize_child_or_not = args.phonemize_child
- yaml_file = args.yaml_file
- out_directory_name = args.out_dirname
- markers_json = args.markers_json
- if not os.path.exists(out_directory_name):
- os.makedirs(out_directory_name)
- loaded_languages_to_download_informations = yaml.safe_load(open(args.yaml_file,
- encoding="UTF-8"))
- downloader = DownloadChildCorpora(out_directory_name, markers_json)
- downloader(loaded_languages_to_download_informations, out_directory_name, phonemize_child_or_not)
|