download_childes_corpora.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. """Module that downloads the datasets from CHILDES using pylangacq."""
  2. from typing import List, Generator
  3. import os
  4. import json
  5. import random
  6. import pylangacq
  7. from tqdm import tqdm
  8. import yaml
  9. from phonemizer.backend import EspeakBackend
  10. from phonemizer.separator import Separator
  11. from utterances_cleaner import UtterancesCleaner
  12. # from utterances_cleaner_new import clean_transcription
  13. import panphon
  14. random.seed(80)
  15. class DownloadChildCorpora :
  16. """
  17. Class that downloads child and adult interactions corpora from the\
  18. childes databases.
  19. Atributes
  20. ---------
  21. - json_markers_file : str
  22. Filename containing the markers to manage when cleaning the utterances
  23. """
  24. def __init__(self, out_dirname, json_markers_filename: str):
  25. # This will help us to tokenize phonemized utterances in words and/or in phonemes
  26. self.separator = Separator(phone="$", word="@")
  27. # This text file will will contain all the corpora that have failed to be downloaded
  28. self.not_downloaded_data = open(f"{out_dirname}/not_downloaded_data.txt", "w",
  29. encoding="UTF-8")
  30. self.utterances_cleaner = UtterancesCleaner(json.load(open(json_markers_filename,
  31. encoding="UTF-8")))
  32. self.features_table = panphon.FeatureTable()
  33. def get_segments(self, utterance: str) -> str:
  34. """
  35. Function that retrieves phonemic segments of a given utterance. The utterance\
  36. must be in a phonetic form.
  37. We use panphon in order to deal with multi-character phonemes.
  38. Parameters
  39. ----------
  40. - utterance : str
  41. The utterance for which we want to get phonemic segments.
  42. """
  43. return "@".join("$".join(seg.strip()
  44. for seg in self.features_table.ipa_segs(word.strip())\
  45. if seg.strip()) for word in utterance.split() if word.strip())
  46. def participants_data(self,
  47. chat,
  48. participants_to_consider: List[str],
  49. phonemize_child: bool,
  50. ) -> Generator:
  51. """
  52. Get the data for each participant. Here, the data for each participant\
  53. is the set of utterances produced by this participant at all child ages.
  54. Parameters
  55. ----------
  56. - chat : Pylangacq class
  57. The chat file containing the utterances.
  58. - participants_to_consider : list
  59. The participants for which we want to get utterances in the chat file.
  60. Returns
  61. -------
  62. - Iterator:
  63. Tuple where the first element is the role of the speaker,\
  64. the second element is the child age (in months) and the last element\
  65. is an utterance produced by the speaker at this child age.
  66. """
  67. ages = chat.ages(months=True)
  68. participants = {
  69. speaker : header["Participants"][speaker]["role"]
  70. for header in chat.headers()
  71. for speaker in header["Participants"]
  72. }
  73. for participant in participants:
  74. role = participants[participant]
  75. if role not in participants_to_consider :
  76. continue
  77. file_utterances = chat.utterances(by_files=True, participants=participant)
  78. if not(phonemize_child) and participant == "CHI" :
  79. tiers = ["pho", "%pho", "xpho", "%xpho"]
  80. else :
  81. tiers = [participant]
  82. for age, utterances in zip(ages, file_utterances) :
  83. utterances = self.get_utterances(utterances, tiers)
  84. yield(role,
  85. participant,
  86. age,
  87. [self.utterances_cleaner.clean(utterance) for utterance in utterances])
  88. def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]:
  89. """
  90. This function will get utterances of a given list of tiers. A tier\
  91. is an annotation (for example morphology, phonetic, etc) of a given utterance.
  92. Parameters
  93. ----------
  94. - utterances : list
  95. List of pylangacq utterances containing different tiers.
  96. - tiers : list
  97. List of tiers to extract from utterances
  98. Returns
  99. -------
  100. - list
  101. Utterances of a given list of tiers
  102. """
  103. str_utterances = []
  104. for utterance in utterances :
  105. for tier in tiers :
  106. if tier in utterance.tiers :
  107. str_utterances.append(utterance.tiers[tier])
  108. return str_utterances
  109. def get_phonetic_utterances(self,
  110. utterances: List[str],
  111. participant: str,
  112. backend: EspeakBackend,
  113. phonemize_child: bool) -> List[str]:
  114. """
  115. This function will get phonemic representation of a given list\
  116. list of utterances.
  117. Parameters
  118. ----------
  119. - utterances: str
  120. List of utterances in standard orthography.
  121. - participant: str
  122. The participant who has produced the utterance.
  123. - backend: EspeakBackend
  124. The espeak backend of the language of the utterance.
  125. - phonemize_child: bool
  126. Whether to get the automatic or manual phonemization of the children's utterances.
  127. Returns
  128. -------
  129. - list:
  130. List of the utterances in phonetic form.
  131. """
  132. if(not(phonemize_child) and participant == "CHI") :
  133. phon_utterances = []
  134. for utterance in utterances :
  135. phon_utterances.append(self.get_segments(utterance))
  136. return phon_utterances
  137. return backend.phonemize(utterances, separator=self.separator, strip=True)
  138. def download_data(self,
  139. language: str,
  140. languages_to_download_informations: dict,
  141. out_dirname: str,
  142. phonemize_child) -> None:
  143. """
  144. Download data for all speaker for a given language.
  145. Parameters
  146. ----------
  147. - language: str
  148. The language for which to retrieve the data
  149. - languages_to_download_informations:
  150. - out_dirname: str
  151. The directory where the downloaded data will be stored.
  152. - phonemize_child: bool
  153. """
  154. participants_to_consider = languages_to_download_informations[language]["participants"]
  155. downloading_file = open(f"{out_dirname}/{language}.one_utterance_per_line",
  156. "w", encoding="UTF-8")
  157. backend = EspeakBackend(language=languages_to_download_informations[language]["espeak_language_id"],
  158. language_switch="remove-utterance")
  159. for url in languages_to_download_informations[language]["urls"] :
  160. try :
  161. chat = pylangacq.read_chat(url)
  162. corpus_family = set()
  163. for file_path in chat.file_paths() :
  164. informations = file_path.split("/")
  165. if len(informations) < 3 :
  166. # only the name of the corpus
  167. corpus_family.add((informations[0], ""))
  168. else :
  169. # the name of the corpus and the family
  170. corpus_family.add((informations[0], informations[1]))
  171. except :
  172. self.not_downloaded_data.write(f"{url}\n")
  173. continue
  174. for corpus, family in corpus_family :
  175. family = family if family else corpus
  176. chat_family = chat.filter(match=family)
  177. for role, participant, age, utterances in self.participants_data(chat_family,
  178. participants_to_consider,
  179. phonemize_child=phonemize_child) :
  180. for utterance in self.get_phonetic_utterances(utterances,
  181. participant,
  182. backend,
  183. phonemize_child=phonemize_child) :
  184. if not utterance :
  185. continue
  186. family_name = "_".join((corpus, family))
  187. downloading_file.write(f"{family_name},{role},{age},{utterance}\n")
  188. def __call__(self,
  189. languages_to_download_informations: dict,
  190. out_dirname: str,
  191. phonemize_child: bool) -> None :
  192. """
  193. Download the data for each languages
  194. Parameters
  195. ----------
  196. - languages_to_download_informations : dict
  197. The dictionary that contains all relevant informations for downloading\
  198. the data.
  199. - out_dirname : str
  200. Directory where the outpouts will be stored.
  201. """
  202. total = len(languages_to_download_informations)
  203. for language in tqdm(languages_to_download_informations, total=total) :
  204. self.download_data(language,
  205. languages_to_download_informations,
  206. out_dirname,
  207. phonemize_child=phonemize_child)
  208. if __name__ == "__main__" :
  209. from argparse import ArgumentParser, BooleanOptionalAction
  210. parser = ArgumentParser()
  211. parser.add_argument("--yaml_file",
  212. help="YAML File containing for each language, all relevant information for downloading the data.",
  213. required=True)
  214. parser.add_argument("--out_dirname",
  215. help="The directory where outputs will be stored.",
  216. required=True)
  217. parser.add_argument("--markers_json",
  218. help="Json markers that serve for cleaning.",
  219. required=True)
  220. parser.add_argument("--phonemize_child", action=BooleanOptionalAction)
  221. args = parser.parse_args()
  222. phonemize_child_or_not = args.phonemize_child
  223. yaml_file = args.yaml_file
  224. out_directory_name = args.out_dirname
  225. markers_json = args.markers_json
  226. if not os.path.exists(out_directory_name):
  227. os.makedirs(out_directory_name)
  228. loaded_languages_to_download_informations = yaml.safe_load(open(args.yaml_file,
  229. encoding="UTF-8"))
  230. downloader = DownloadChildCorpora(out_directory_name, markers_json)
  231. downloader(loaded_languages_to_download_informations, out_directory_name, phonemize_child_or_not)