# Repository: doi / Unsupervised_Metrics_CLD_Sy
# (forked from LAAC-LSCP/Unsupervised_Metrics_CLD_Sy)
"""Module that downloads the datasets from CHILDES using pylangacq."""
from typing import List, Generator
import os
import json
import random
import pylangacq
from tqdm import tqdm
import yaml
from phonemizer.backend import EspeakBackend
from phonemizer.separator import Separator
from utterances_cleaner import UtterancesCleaner
# from utterances_cleaner_new import clean_transcription
import panphon

random.seed(80)

class DownloadChildCorpora:
    """
    Download child and adult interaction corpora from the CHILDES databases
    and write them as one phonemized utterance per line.

    Attributes
    ----------
    - separator : Separator
        Phonemizer separator ("$" between phones, "@" between words) used to
        tokenize phonemized utterances in words and/or in phonemes.
    - not_downloaded_data : text file object
        File opened for writing that lists every URL that could not be processed.
    - utterances_cleaner : UtterancesCleaner
        Cleaner built from the content of the JSON markers file.
    - features_table : panphon.FeatureTable
        Table used to split phonetic words into (multi-character) phonemes.
    """

    def __init__(self, out_dirname: str, json_markers_filename: str):
        """
        Parameters
        ----------
        - out_dirname : str
            Existing directory in which `not_downloaded_data.txt` is created.
        - json_markers_filename : str
            Filename containing the markers to manage when cleaning the utterances.
        """
        # This will help us to tokenize phonemized utterances in words and/or in phonemes
        self.separator = Separator(phone="$", word="@")
        # Read the markers inside a context manager so the handle is released immediately.
        with open(json_markers_filename, encoding="UTF-8") as markers_file:
            self.utterances_cleaner = UtterancesCleaner(json.load(markers_file))
        self.features_table = panphon.FeatureTable()
        # This text file will contain all the corpora that have failed to be downloaded.
        # It is opened last so that a failure above does not leave a dangling handle;
        # call `close()` once the downloads are finished.
        self.not_downloaded_data = open(f"{out_dirname}/not_downloaded_data.txt", "w",
                                        encoding="UTF-8")

    def close(self) -> None:
        """Close the file that records the URLs that could not be processed."""
        self.not_downloaded_data.close()

    def get_segments(self, utterance: str) -> str:
        """
        Function that retrieves phonemic segments of a given utterance. The utterance
        must be in a phonetic form.
        We use panphon in order to deal with multi-character phonemes.

        Parameters
        ----------
        - utterance : str
            The utterance for which we want to get phonemic segments.

        Returns
        -------
        - str
            The utterance where phonemes of a word are joined by "$" and words
            are joined by "@" (the same symbols as `self.separator`).
        """
        segmented_words = []
        # str.split() without argument already drops empty and whitespace-only words.
        for word in utterance.split():
            segments = (seg.strip() for seg in self.features_table.ipa_segs(word))
            segmented_words.append("$".join(seg for seg in segments if seg))
        return "@".join(segmented_words)

    def participants_data(self,
                            chat,
                            participants_to_consider: List[str],
                            phonemize_child: bool,
                            ) -> Generator:
        """
        Get the data for each participant. Here, the data for each participant
        is the set of utterances produced by this participant at all child ages.

        Parameters
        ----------
        - chat : Pylangacq class
            The chat file containing the utterances.
        - participants_to_consider : list
            The roles (as found in the "Participants" headers) for which we want
            to get utterances in the chat file.
        - phonemize_child : bool
            When false, the utterances of the participant "CHI" are read from the
            phonetic tiers ("pho", "%pho", "xpho", "%xpho") instead of the main tier.

        Returns
        -------
        - Iterator:
            Tuples of four elements: the role of the speaker, the participant code,
            the child age (in months) and the list of cleaned utterances produced
            by the speaker at this child age.
        """
        ages = chat.ages(months=True)
        # Map each participant code to its role, over all the headers of the chat.
        participants = {
            speaker: header["Participants"][speaker]["role"]
            for header in chat.headers()
            for speaker in header["Participants"]
        }
        for participant, role in participants.items():
            if role not in participants_to_consider:
                continue
            file_utterances = chat.utterances(by_files=True, participants=participant)
            if not phonemize_child and participant == "CHI":
                # Manual phonetic transcriptions of the child.
                tiers = ["pho", "%pho", "xpho", "%xpho"]
            else:
                # The main tier is stored under the participant code.
                tiers = [participant]
            # NOTE(review): assumes `ages` and `file_utterances` are aligned file by file.
            for age, utterances in zip(ages, file_utterances):
                raw_utterances = self.get_utterances(utterances, tiers)
                yield (role,
                       participant,
                       age,
                       [self.utterances_cleaner.clean(utterance)
                        for utterance in raw_utterances])

    def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]:
        """
        This function will get utterances of a given list of tiers. A tier
        is an annotation (for example morphology, phonetic, etc) of a given utterance.

        Parameters
        ----------
        - utterances : list
            List of pylangacq utterances containing different tiers.
        - tiers : list
            List of tiers to extract from utterances

        Returns
        -------
        - list
            Content of every requested tier that is present, in utterance order.
            An utterance holding several of the requested tiers contributes one
            entry per tier.
        """
        str_utterances = []
        for utterance in utterances:
            available_tiers = utterance.tiers
            for tier in tiers:
                if tier in available_tiers:
                    str_utterances.append(available_tiers[tier])
        return str_utterances

    def get_phonetic_utterances(self,
                                utterances: List[str],
                                participant: str,
                                backend: "EspeakBackend",
                                phonemize_child: bool) -> List[str]:
        """
        This function will get the phonemic representation of a given
        list of utterances.

        Parameters
        ----------
        - utterances: list
            List of utterances: in standard orthography when they are phonemized
            with espeak, already in phonetic form for the manually transcribed child.
        - participant: str
            The participant who has produced the utterances.
        - backend: EspeakBackend
            The espeak backend of the language of the utterances.
        - phonemize_child: bool
            Whether to get the automatic or manual phonemization of the children's utterances.

        Returns
        -------
        - list:
            List of the utterances in phonetic form.
        """
        if not phonemize_child and participant == "CHI":
            # Manual transcription: only segment it, no call to espeak.
            return [self.get_segments(utterance) for utterance in utterances]
        return backend.phonemize(utterances, separator=self.separator, strip=True)

    def download_data(self,
                        language: str,
                        languages_to_download_informations: dict,
                        out_dirname: str,
                        phonemize_child: bool) -> None:
        """
        Download data for all speakers for a given language and write them in
        `<out_dirname>/<language>.one_utterance_per_line`, one line per utterance
        formatted as `corpus_family,role,age,utterance`.

        URLs that cannot be read are recorded in `self.not_downloaded_data`
        and skipped.

        Parameters
        ----------
        - language: str
            The language for which to retrieve the data
        - languages_to_download_informations: dict
            For each language, a dictionary with the keys "participants" (roles to
            keep), "espeak_language_id" (language given to espeak) and "urls"
            (corpora to read).
        - out_dirname: str
            The directory where the downloaded data will be stored.
        - phonemize_child: bool
            Whether to phonemize the children's utterances with espeak or to use
            their manual phonetic transcription.
        """
        language_informations = languages_to_download_informations[language]
        participants_to_consider = language_informations["participants"]
        # The context manager guarantees the output is flushed and closed,
        # even when a corpus raises in the middle of the processing.
        with open(f"{out_dirname}/{language}.one_utterance_per_line",
                  "w", encoding="UTF-8") as downloading_file:
            backend = EspeakBackend(language=language_informations["espeak_language_id"],
                                    language_switch="remove-utterance")
            for url in language_informations["urls"]:
                try:
                    chat = pylangacq.read_chat(url)
                    corpus_family = set()
                    for file_path in chat.file_paths():
                        informations = file_path.split("/")
                        if len(informations) < 3:
                            # only the name of the corpus
                            corpus_family.add((informations[0], ""))
                        else:
                            # the name of the corpus and the family
                            corpus_family.add((informations[0], informations[1]))
                except Exception:
                    # Best effort: remember the failing URL and go on with the next one.
                    # `Exception` (not a bare except) lets Ctrl-C and SystemExit through.
                    self.not_downloaded_data.write(f"{url}\n")
                    self.not_downloaded_data.flush()
                    continue
                for corpus, family in corpus_family:
                    family = family or corpus
                    # Invariant for every utterance of this (corpus, family) pair.
                    family_name = "_".join((corpus, family))
                    chat_family = chat.filter(match=family)
                    for role, participant, age, utterances in self.participants_data(
                            chat_family,
                            participants_to_consider,
                            phonemize_child=phonemize_child):
                        phonetic_utterances = self.get_phonetic_utterances(
                            utterances,
                            participant,
                            backend,
                            phonemize_child=phonemize_child)
                        for utterance in phonetic_utterances:
                            if not utterance:
                                continue
                            downloading_file.write(f"{family_name},{role},{age},{utterance}\n")

    def __call__(self,
                languages_to_download_informations: dict,
                out_dirname: str,
                phonemize_child: bool) -> None:
        """
        Download the data for each language.

        Parameters
        ----------
        - languages_to_download_informations : dict
            The dictionary that contains all relevant informations for downloading
            the data, keyed by language.
        - out_dirname : str
            Directory where the outputs will be stored.
        - phonemize_child : bool
            Whether to phonemize the children's utterances with espeak or to use
            their manual phonetic transcription.
        """
        total = len(languages_to_download_informations)
        for language in tqdm(languages_to_download_informations, total=total):
            self.download_data(language,
                                languages_to_download_informations,
                                out_dirname,
                                phonemize_child=phonemize_child)

if __name__ == "__main__" :
    # Command-line entry point: read the languages description and the cleaning
    # markers, then download and phonemize every requested corpus.
    from argparse import ArgumentParser, BooleanOptionalAction
    parser = ArgumentParser()

    parser.add_argument("--yaml_file",
                        help="YAML File containing for each language, all relevant information for downloading the data.",
                        required=True)
    parser.add_argument("--out_dirname",
                        help="The directory where outputs will be stored.",
                        required=True)
    parser.add_argument("--markers_json",
                        help="Json markers that serve for cleaning.",
                        required=True)
    # An explicit default keeps the value a real boolean when the flag is omitted.
    parser.add_argument("--phonemize_child",
                        action=BooleanOptionalAction,
                        default=False,
                        help="Phonemize the children's utterances with espeak instead of "
                             "using their manual phonetic transcription.")
    args = parser.parse_args()
    phonemize_child_or_not = args.phonemize_child
    yaml_file = args.yaml_file
    out_directory_name = args.out_dirname
    markers_json = args.markers_json
    # exist_ok avoids the check-then-create race of os.path.exists + os.makedirs.
    os.makedirs(out_directory_name, exist_ok=True)
    # safe_load only builds plain Python objects; the handle is closed right after parsing.
    with open(yaml_file, encoding="UTF-8") as languages_file:
        loaded_languages_to_download_informations = yaml.safe_load(languages_file)
    downloader = DownloadChildCorpora(out_directory_name, markers_json)
    try:
        downloader(loaded_languages_to_download_informations,
                   out_directory_name,
                   phonemize_child_or_not)
    finally:
        # The downloader keeps its failure log open for its whole lifetime;
        # closing it here guarantees the recorded URLs reach the disk.
        downloader.not_downloaded_data.close()