download_childes_corpora.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. """Module that downloads the datasets from CHILDES using pylangacq."""
  2. from typing import List, Generator
  3. import os
  4. import json
  5. import random
  6. import pylangacq
  7. from tqdm import tqdm
  8. import yaml
  9. from phonemizer.backend import EspeakBackend
  10. from phonemizer.separator import Separator
  11. from utterances_cleaner import UtterancesCleaner
  12. # from utterances_cleaner_new import clean_transcription
  13. import panphon
  14. random.seed(80)
  15. class DownloadChildCorpora :
  16. """
  17. Class that downloads child and adult interactions corpora from the\
  18. childes databases.
  19. Atributes
  20. ---------
  21. - json_markers_file : str
  22. Filename containing the markers to manage when cleaning the utterances
  23. """
  24. def __init__(self, out_dirname, json_markers_filename: str):
  25. # This will help us to tokenize phonemized utterances in words and/or in phonemes
  26. self.separator = Separator(phone="$", word="@")
  27. # This text file will will contain all the corpora that have failed to be downloaded
  28. self.not_downloaded_data = open(f"{out_dirname}/not_downloaded_data.txt", "w",
  29. encoding="UTF-8")
  30. self.utterances_cleaner = UtterancesCleaner(json.load(open(json_markers_filename,
  31. encoding="UTF-8")))
  32. self.features_table = panphon.FeatureTable()
  33. def get_segments(self, utterance: str) -> str:
  34. """
  35. Function that retrieves phonemic segments of a given utterance. The utterance\
  36. must be in a phonetic form.
  37. We use panphon in order to deal with multi-character phonemes.
  38. Parameters
  39. ----------
  40. - utterance : str
  41. The utterance for which we want to get phonemic segments.
  42. """
  43. return "@".join("$".join(seg.strip()
  44. for seg in self.features_table.ipa_segs(word.strip())\
  45. if seg.strip()) for word in utterance.split() if word.strip())
  46. def participants_data(self,
  47. chat,
  48. participants_to_consider: List[str],
  49. ort_tier,
  50. phonemize_child: bool,
  51. ) -> Generator:
  52. """
  53. Get the data for each participant. Here, the data for each participant\
  54. is the set of utterances produced by this participant at all child ages.
  55. Parameters
  56. ----------
  57. - chat : Pylangacq class
  58. The chat file containing the utterances.
  59. - participants_to_consider : list
  60. The participants for which we want to get utterances in the chat file.
  61. Returns
  62. -------
  63. - Iterator:
  64. Tuple where the first element is the role of the speaker,\
  65. the second element is the child age (in months) and the last element\
  66. is an utterance produced by the speaker at this child age.
  67. """
  68. ages = chat.ages(months=True)
  69. participants = {
  70. speaker : header["Participants"][speaker]["role"]
  71. for header in chat.headers()
  72. for speaker in header["Participants"]
  73. }
  74. for participant in participants:
  75. role = participants[participant]
  76. if role not in participants_to_consider :
  77. continue
  78. file_utterances = chat.utterances(by_files=True, participants=participant)
  79. if not(phonemize_child) and participant == "CHI" :
  80. tiers = ["pho", "%pho", "xpho", "%xpho"]
  81. elif ort_tier :
  82. tiers = ["ort", "%ort", "xort", "%xort"]
  83. else :
  84. tiers = [participant]
  85. for age, utterances in zip(ages, file_utterances) :
  86. utterances = self.get_utterances(utterances, tiers)
  87. yield(role,
  88. participant,
  89. age,
  90. [self.utterances_cleaner.clean(utterance) for utterance in utterances])
  91. def get_utterances(self, utterances: list, tiers: List[str]) -> List[str]:
  92. """
  93. This function will get utterances of a given list of tiers. A tier\
  94. is an annotation (for example morphology, phonetic, etc) of a given utterance.
  95. Parameters
  96. ----------
  97. - utterances : list
  98. List of pylangacq utterances containing different tiers.
  99. - tiers : list
  100. List of tiers to extract from utterances
  101. Returns
  102. -------
  103. - list
  104. Utterances of a given list of tiers
  105. """
  106. str_utterances = []
  107. for utterance in utterances :
  108. for tier in tiers :
  109. if tier in utterance.tiers :
  110. str_utterances.append(utterance.tiers[tier])
  111. return str_utterances
  112. def get_phonetic_utterances(self,
  113. utterances: List[str],
  114. participant: str,
  115. backend: EspeakBackend,
  116. phonemize_child: bool) -> List[str]:
  117. """
  118. This function will get phonemic representation of a given list\
  119. list of utterances.
  120. Parameters
  121. ----------
  122. - utterances: str
  123. List of utterances in standard orthography.
  124. - participant: str
  125. The participant who has produced the utterance.
  126. - backend: EspeakBackend
  127. The espeak backend of the language of the utterance.
  128. - phonemize_child: bool
  129. Whether to get the automatic or manual phonemization of the children's utterances.
  130. Returns
  131. -------
  132. - list:
  133. List of the utterances in phonetic form.
  134. """
  135. if(not(phonemize_child) and participant == "CHI") :
  136. phon_utterances = []
  137. for utterance in utterances :
  138. phon_utterances.append(self.get_segments(utterance))
  139. return phon_utterances
  140. return backend.phonemize(utterances, separator=self.separator, strip=True)
  141. def download_data(self,
  142. language: str,
  143. languages_to_download_informations: dict,
  144. out_dirname: str,
  145. phonemize_child) -> None:
  146. """
  147. Download data for all speaker for a given language.
  148. Parameters
  149. ----------
  150. - language: str
  151. The language for which to retrieve the data
  152. - languages_to_download_informations:
  153. - out_dirname: str
  154. The directory where the downloaded data will be stored.
  155. - phonemize_child: bool
  156. """
  157. participants_to_consider = languages_to_download_informations[language]["participants"]
  158. downloading_file = open(f"{out_dirname}/{language}.one_utterance_per_line",
  159. "w", encoding="UTF-8")
  160. backend = EspeakBackend(language=languages_to_download_informations[language]["espeak_language_id"],
  161. language_switch="remove-utterance")
  162. for url in languages_to_download_informations[language]["urls"] :
  163. try :
  164. chat = pylangacq.read_chat(url)
  165. corpus_family = set()
  166. for file_path in chat.file_paths() :
  167. informations = file_path.split("/")
  168. if len(informations) < 3 :
  169. # only the name of the corpus
  170. corpus_family.add((informations[0], ""))
  171. else :
  172. # the name of the corpus and the family
  173. corpus_family.add((informations[0], informations[1]))
  174. except :
  175. self.not_downloaded_data.write(f"{url}\n")
  176. continue
  177. for corpus, family in corpus_family :
  178. family = family if family else corpus
  179. chat_family = chat.filter(match=family)
  180. for role, participant, age, utterances in self.participants_data(chat_family,
  181. participants_to_consider,
  182. ort_tier=languages_to_download_informations[language]["ort_tier"],
  183. phonemize_child=phonemize_child) :
  184. for utterance in self.get_phonetic_utterances(utterances,
  185. participant,
  186. backend,
  187. phonemize_child=phonemize_child) :
  188. utterance = utterance.replace("$", " ").replace("@", " ")
  189. utterance = self.utterances_cleaner.remove_multiple_spaces(utterance)
  190. utterance = utterance.strip()
  191. if not utterance :
  192. continue
  193. family_name = "_".join((corpus, family))
  194. downloading_file.write(f"{family_name},{role},{age},{utterance}\n")
  195. def __call__(self,
  196. languages_to_download_informations: dict,
  197. out_dirname: str,
  198. phonemize_child: bool) -> None :
  199. """
  200. Download the data for each languages
  201. Parameters
  202. ----------
  203. - languages_to_download_informations : dict
  204. The dictionary that contains all relevant informations for downloading\
  205. the data.
  206. - out_dirname : str
  207. Directory where the outpouts will be stored.
  208. """
  209. total = len(languages_to_download_informations)
  210. for language in tqdm(languages_to_download_informations, total=total) :
  211. self.download_data(language,
  212. languages_to_download_informations,
  213. out_dirname,
  214. phonemize_child=phonemize_child)
  215. if __name__ == "__main__" :
  216. from argparse import ArgumentParser, BooleanOptionalAction
  217. parser = ArgumentParser()
  218. parser.add_argument("--yaml_file",
  219. help="YAML File containing for each language, all relevant information for downloading the data.",
  220. required=True)
  221. parser.add_argument("--out_dirname",
  222. help="The directory where outputs will be stored.",
  223. required=True)
  224. parser.add_argument("--markers_json",
  225. help="Json markers that serve for cleaning.",
  226. required=True)
  227. parser.add_argument("--phonemize_child",
  228. help="Whether phonemize child utterances or not.",
  229. action=BooleanOptionalAction)
  230. args = parser.parse_args()
  231. phonemize_child_or_not = args.phonemize_child
  232. yaml_file = args.yaml_file
  233. out_directory_name = args.out_dirname
  234. markers_json = args.markers_json
  235. if not os.path.exists(out_directory_name):
  236. os.makedirs(out_directory_name)
  237. loaded_languages_to_download_informations = yaml.safe_load(open(args.yaml_file,
  238. encoding="UTF-8"))
  239. downloader = DownloadChildCorpora(out_directory_name, markers_json)
  240. downloader(loaded_languages_to_download_informations, out_directory_name, phonemize_child_or_not)