Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

main.py 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272
  1. import os
  2. import pandas as pd
  3. import re
  4. import datetime
  5. import shutil
  6. import urllib
  7. import requests
  8. import pylangacq
  9. from urllib.request import Request, urlopen
  10. from urllib.error import HTTPError
  11. from pathlib import Path
  12. from os.path import isdir, join
  13. from io import BytesIO
  14. from zipfile import ZipFile
  15. from bs4 import BeautifulSoup
  16. import datalad.api as dl
  if __name__ == "__main__":
      # Command-line interface: the destination folder and the corpus landing page.
      # NOTE(review): the statements after parse_args() are assumed to sit inside
      # this guard; the layout of the original script should be confirmed.
      import argparse

      parser = argparse.ArgumentParser()
      parser.add_argument(
          '--corpus',
          required=True,
          help='the whole path to the folder where you would like to create your corpus: /...',
      )
      parser.add_argument(
          '--url',
          required=True,
          help='a link to the website with your corpus',
      )
      args = parser.parse_args()

      path_corpus = args.corpus
      url_path = args.url

      # The corpus name is the last URL segment with ".html" removed. A trailing
      # slash is stripped first, otherwise the last segment would be empty and the
      # dataset would be created directly inside path_corpus.
      m = url_path.rstrip('/').split('/')
      name_corpus = m[-1].replace('.html', '')
      # The experiment name is the corpus name.
      name_experiment = name_corpus
      print(f'Your corpus will be created in {path_corpus}/{name_experiment}')
  def retrieve_links(url_path, base_url='https://phonbank.talkbank.org'):
      """Return the transcript and media links found on a corpus page.

      The page at ``url_path`` is downloaded and every ``<a>`` element that
      carries an ``href`` is inspected:

      * an href containing ``/data/`` is prefixed with ``base_url`` and becomes
        the transcript link;
      * otherwise, an href containing ``/media`` becomes the media link (used
        as found, without a prefix).

      When several anchors match, the last one wins.

      Args:
          url_path: address of the corpus page.
          base_url: prefix prepended to the ``/data/`` href.

      Returns:
          A ``(cha_path, rec_path)`` tuple.

      Raises:
          requests.HTTPError: the page request returned an error status.
          ValueError: the page lacks a ``/data/`` link or a ``/media`` link.
      """
      response = requests.get(url_path)
      # Fail on an error page instead of scraping it for links.
      response.raise_for_status()
      content = BeautifulSoup(response.text, 'html.parser')

      cha_path = None
      rec_path = None
      # href=True skips anchors without an href, for which node.get('href')
      # would return None and the substring test would raise TypeError.
      for node in content.find_all('a', href=True):
          href = node['href']
          if '/data/' in href:
              cha_path = base_url + href
          elif '/media' in href:
              rec_path = href

      if cha_path is None or rec_path is None:
          raise ValueError(
              f'Could not find both a /data/ link and a /media link on {url_path}'
          )
      return cha_path, rec_path
  # Scrape the corpus page: retrieve_links returns the transcript link (cha_path) and the media link (rec_path).
  46. cha_path, rec_path = retrieve_links(url_path)
  def change_directory(path):
      """Switch the working directory to ``path`` and report the outcome.

      A missing path, a path that is not a directory, or a permission problem
      is reported on stdout and otherwise ignored; the working directory is
      left unchanged in those cases.
      """
      try:
          os.chdir(path)
          report = "Current working directory: {0}".format(os.getcwd())
      except FileNotFoundError:
          report = "Directory: {0} does not exist".format(path)
      except NotADirectoryError:
          report = "{0} is not a directory".format(path)
      except PermissionError:
          report = "You do not have permissions to change to {0}".format(path)
      print(report)
  def create_directory(parent_dir, dir, tree):
      """Create the directory ``dir`` below ``parent_dir`` unless it exists.

      Args:
          parent_dir: folder that will contain the new directory; a trailing
              separator is optional.
          dir: name (or relative path) of the directory to create.
          tree: when true, missing intermediate directories are created as
              well; when false, only the final directory is created.

      Raises:
          OSError: the directory cannot be created (for example the parent is
              missing while ``tree`` is false).
      """
      # Build the path once, so that the existence check and the creation
      # always look at the same location whether or not parent_dir ends with a
      # separator.
      path = os.path.join(parent_dir, dir)
      if os.path.isdir(path):
          return
      if tree:
          os.makedirs(path, exist_ok=True)
      else:
          os.mkdir(path)
  def copy_files(source, target, substring):
      """Copy the files of ``source`` whose name contains ``substring``.

      Only direct children of ``source`` with a dot in their name are
      considered (glob pattern ``*.*``).

      Args:
          source: directory to read from.
          target: destination passed to ``shutil.copy``.
          substring: text that must occur in the file name.
      """
      for filename in Path(source).glob('*.*'):
          # Test the file name only: testing the whole path would select every
          # file as soon as a parent folder happens to contain the substring.
          # Directories with a dot in their name are skipped.
          if filename.is_file() and substring in filename.name:
              shutil.copy(filename, target)
  def move_files(source, target, files, substring):
      """Move content from ``source`` to ``target``.

      Args:
          source: directory to read from.
          target: destination directory.
          files: when true, move only the direct children of ``source`` that
              have a dot in their name (glob ``*.*``) and whose name contains
              ``substring``; when false, move every entry of ``source`` and
              ignore ``substring``.
          substring: text that must occur in the file name (file mode only).
      """
      if files:
          for filename in Path(source).glob('*.*'):
              # Test the file name only: testing the whole path would select
              # every file as soon as a parent folder contains the substring.
              if filename.is_file() and substring in filename.name:
                  # Copy then delete, so an existing file of the same name in
                  # the target is overwritten.
                  shutil.copy(filename, target)
                  filename.unlink()
      else:
          for entry in os.listdir(source):
              # os.path.join works with or without a trailing separator on
              # source and target.
              shutil.move(os.path.join(source, entry), os.path.join(target, entry))
  def remove_files(source, dir, files):
      """Delete directories or a single file located under ``source``.

      Args:
          source: base path. In file mode it is concatenated with ``files``
              as-is, so it has to end with a separator.
          dir: when true, ``files`` is an iterable of directory names, each
              removed recursively from ``source``; a failure is printed and
              the remaining names are still processed. When false, ``files``
              is one file name.
          files: directory names (directory mode) or a file name (file mode).
      """
      if not dir:
          os.remove(source + files)
          return

      for dirname in files:
          doomed = source + f'/{dirname}'
          try:
              shutil.rmtree(doomed)
          except OSError as e:
              print("Error: %s : %s" % (doomed, e.strerror))
  # Initialise the dataset folder, named after the experiment, through DataLad.
  dl.create(f'{path_corpus}/{name_experiment}')

  # Sub-folders of the dataset as (parent folder, new folder, create parents too).
  for parent, leaf, tree in (
      (f'{path_corpus}/{name_experiment}/', 'metadata', False),
      (f'{path_corpus}/{name_experiment}/recordings/', 'raw', True),
      (f'{path_corpus}/{name_experiment}/annotations/cha/', 'raw', True),
      (f'{path_corpus}/{name_experiment}/', 'extra', False),
  ):
      create_directory(parent, leaf, tree)
  # Transcripts (*.cha) are published as a zip archive: fetch it and unpack it.
  def unzip(zipurl, path_corpus):
      """Download the zip archive at ``zipurl`` and extract it into ``path_corpus``.

      The whole archive is read into memory before it is extracted.
      """
      with urlopen(zipurl) as response:
          payload = BytesIO(response.read())
      with ZipFile(payload) as archive:
          archive.extractall(f'{path_corpus}')
  # Download and unpack the transcripts once. The script used to repeat this
  # call a few lines below, fetching and extracting the same archive twice.
  unzip(cha_path, f'{path_corpus}/{name_experiment}/annotations/cha/raw')

  # Move metadata and other files into the folders METADATA and EXTRA
  source = f'{path_corpus}/{name_experiment}/annotations/cha/raw/{name_corpus}/'
  destination_metadata = f'{path_corpus}/{name_experiment}/metadata/'
  destination_extra = f'{path_corpus}/{name_experiment}/extra/'
  destination_cha = f'{path_corpus}/{name_experiment}/annotations/cha/raw/'
  source2 = f'{path_corpus}/{name_experiment}/annotations/cha/raw/{name_corpus}'
  move_files(source2, destination_metadata, True, 'metadata')
  move_files(source2, destination_extra, True, '.txt')
  print('The metadata files have been moved to metadata folder')

  change_directory(source)

  # Sub-folders of the unpacked corpus, in sorted order.
  dirs = sorted(f for f in os.listdir(source) if isdir(join(source, f)))

  # to_drop maps each sub-folder to the transcripts that must not be kept:
  # those whose CHI participant has an empty age, and those containing at
  # least one utterance without time marks.
  to_drop = {subdir: [] for subdir in dirs}
  for subdir in dirs:
      for cha_name in sorted(os.listdir(join(source, subdir))):
          child = pylangacq.read_chat(join(source, subdir, cha_name))
          no_age = child.headers()[0]['Participants']['CHI']['age'] == ''
          no_time_marks = any(utt.time_marks is None for utt in child.utterances())
          # A single append, so a file failing both tests is listed only once.
          if no_age or no_time_marks:
              to_drop[subdir].append(cha_name)
  print('The annotations have been uploaded.')
  def move_file(source, target, filename):
      """Copy ``filename`` to ``target``, then delete the original file.

      ``source`` is accepted but not used; ``filename`` has to be a path that
      can be opened as given.
      """
      shutil.copy(filename, target)
      os.unlink(filename)
  # Bring every transcript that is not listed in to_drop up into the raw
  # annotation folder, renamed to "<corpus>_<sub-folder>_<file>".
  for subdir in dirs:
      subdir_path = source + '/' + subdir
      rejected = to_drop[f'{subdir}']
      for cha_name in sorted(os.listdir(subdir_path)):
          if cha_name in rejected:
              continue
          move_file(subdir_path, destination_cha, subdir_path + '/' + cha_name)
          os.rename(
              destination_cha + '/' + cha_name,
              destination_cha + '/' + f'{name_corpus}_{subdir}_{cha_name}',
          )

  # Whatever is left in the unpacked corpus folder is discarded.
  shutil.rmtree(f'{path_corpus}/{name_experiment}/annotations/cha/raw/{name_corpus}')
  change_directory(destination_cha)
  # Per-transcript metadata, filled in the sorted order of the transcript files.
  recording_date = []     # ISO date of each transcript's recording
  recording_name = {}     # child id -> list of 'Media' header values
  children_rec = []       # child id of each transcript
  corr_age_rec = {}       # CHI age -> 'Date' header, taken once per file-name group
  files = sorted(os.listdir(destination_cha))
  file_set = {f[:8] for f in files}  # not read anywhere in the visible script
  # Placeholder that is not expected to occur in a file name, so the first
  # file always starts a new group.
  filename = '%%%%%%%'
  for cha_name in files:
      child = pylangacq.read_chat(destination_cha + '/' + cha_name)
      # Read the header once per file instead of once per field.
      header = child.headers()[0]
      chi = header['Participants']['CHI']

      # The participant named "Sheng" is stored under the id 'HYS'
      # (presumably the name used on the media server - TODO confirm).
      child_id = 'HYS' if chi['name'] == "Sheng" else chi['name']
      children_rec.append(child_id)
      # setdefault creates the list on first use; the 'HYS' branch used to
      # append to a key that did not exist yet and raised KeyError.
      recording_name.setdefault(child_id, []).append(header['Media'])

      recording_date.append((child.dates_of_recording().pop()).isoformat())

      # Record the age/date pair only for the first file of each group, where
      # a group is identified by the text between the first two underscores
      # of the file name.
      if filename not in cha_name:
          corr_age_rec[chi['age']] = header['Date']
          match = re.search(r'(?<=_)(.*?)(?=_)', cha_name)
          if match is not None:
              filename = match.group(1)
  def _age_to_days(age):
      """Convert an age written as ``years;months.days`` into a number of days.

      A year counts as 365 days and a month as 30 days. A missing months or
      days part (``2;06.``, ``2;06``, ``2;``) counts as zero instead of
      raising.
      """
      years, _, rest = age.partition(';')
      months, _, days = rest.partition('.')
      return int(years or 0) * 365 + int(months or 0) * 30 + int(days or 0)


  # Extrapolate a date of birth for each recorded age/date pair: recording date
  # minus the age in days, kept as the first ten characters (YYYY-MM-DD).
  child_dob = []
  for age, date in corr_age_rec.items():
      nb_days = _age_to_days(age)
      child_dob.append(str(date.pop() - datetime.timedelta(days=nb_days))[:10])

  # Constant columns of the children table, one value per date of birth.
  child_experiment = [f'{name_corpus}'] * len(child_dob)
  dob_criterion = ['extrapolated'] * len(child_dob)
  dob_accuracy = ['week'] * len(child_dob)
  children = sorted(set(children_rec))

  # Candidate media file names per child: the ', audio' part of each 'Media'
  # value is replaced by the file extension.
  recording_name_wav = {
      k: [rec.replace(', audio', '.wav') for rec in v]
      for k, v in recording_name.items()
  }
  recording_name_mp3 = {
      k: [rec.replace(', audio', '.mp3') for rec in v]
      for k, v in recording_name.items()
  }
  # Recordings are saved under relative names, so they land in the current
  # working directory, which is switched to recordings/raw here.
  change_directory(f'{path_corpus}/{name_experiment}/recordings/raw/')
  missing_rec = []  # '<child>_<file>' entries whose download returned an HTTP error


  def _fetch_recordings(names_by_child, extension, remote_subdir=''):
      """Download every recording of one format that has a matching transcript.

      Args:
          names_by_child: mapping child id -> list of media file names.
          extension: media extension ('.mp3' or '.wav'), swapped for '.cha'
              to find the transcript.
          remote_subdir: extra path segment between the child folder and the
              file name on the media server.
      """
      for name, value in names_by_child.items():
          for rec in value:
              rec1 = rec.replace(extension, '.cha')
              transcript = f'{path_corpus}/{name_experiment}/annotations/cha/raw/{name_corpus}_{name}_{rec1}'
              # Both formats now check that the transcript exists; the wav
              # branch used to test a non-empty string, which is always true.
              if not os.path.exists(transcript):
                  continue
              try:
                  urllib.request.urlretrieve(
                      rec_path + '/' + name + '/' + remote_subdir + rec,
                      f'{name_corpus}_{name}_{rec}',
                  )
              except HTTPError:
                  print(f'{name}_{rec} is missing')
                  missing_rec.append(f'{name}_{rec}')


  _fetch_recordings(recording_name_mp3, '.mp3')
  _fetch_recordings(recording_name_wav, '.wav', '0wav/')
  print('The recordings have been uploaded.')
  recording_names = []
  def to_list(dict):
      """Flatten a mapping of lists into a list of ``'<key>_<item>'`` strings.

      Keys are visited in the mapping's iteration order and the items of each
      key in list order.

      Args:
          dict: mapping whose values are iterables. The parameter name shadows
              the builtin; it is kept so existing calls stay valid.

      Returns:
          A new list with one string per (key, item) pair.
      """
      # The comprehension replaces an accumulator that was named after the
      # function itself.
      return [f'{k}_{it}' for k, v in dict.items() for it in v]
  # Flatten the per-child .wav and .mp3 name dictionaries into lists of "<child>_<file>" strings with to_list.
  204. recording_name_wav_list = to_list(recording_name_wav)
  205. recording_name_mp3_list = to_list(recording_name_mp3)
  def check_rec(list_rec, list_children, list_dates, missing=None):
      """Drop the recordings that could not be downloaded.

      The three input lists are parallel: position ``i`` of ``list_children``
      and ``list_dates`` describes ``list_rec[i]``. Every position whose
      recording name is in ``missing`` is left out of all three results.

      The inputs are not modified. Filtering in place would skip entries while
      iterating and would also shrink lists that the caller passes to a second
      call.

      Args:
          list_rec: recording names.
          list_children: child id per recording.
          list_dates: recording date per recording.
          missing: names to drop; defaults to the module-level ``missing_rec``.

      Returns:
          A ``(recordings, children, dates)`` tuple of new lists.
      """
      if missing is None:
          missing = missing_rec
      missing = set(missing)

      # Decide which positions go from the untouched input, then filter the
      # three lists with the same set of indices.
      dropped = {i for i, rec_name in enumerate(list_rec) if rec_name in missing}

      def _keep(values):
          return [value for i, value in enumerate(values) if i not in dropped]

      return _keep(list_rec), _keep(list_children), _keep(list_dates)
  # Filter the .wav and the .mp3 name lists with check_rec, together with the child and date lists.
  # NOTE(review): both calls are handed the same children_rec and recording_date list objects.
  216. rec_name_wav, child_wav, date_wav = check_rec(recording_name_wav_list, children_rec, recording_date)
  217. rec_name_mp3, child_mp3, date_mp3 = check_rec(recording_name_mp3_list, children_rec, recording_date)
  def rename(list_rec, corpus=None):
      """Prefix every recording name with the corpus name.

      Args:
          list_rec: recording names.
          corpus: prefix to use; defaults to the module-level ``name_corpus``.

      Returns:
          A new list of ``'<corpus>_<name>'`` strings, in the input order.
      """
      prefix = name_corpus if corpus is None else corpus
      return [f'{prefix}_{rec}' for rec in list_rec]
  # Columns of the recordings table: the .wav entries first, then the .mp3 ones.
  recording_names = rename(rec_name_wav) + rename(rec_name_mp3)
  children_recordings = child_wav + child_mp3
  recording_date = date_wav + date_mp3
  rec_experiment = [f'{name_corpus}'] * len(children_recordings)
  recording_device = ['usb'] * len(children_recordings)
  start_time = ['00:00'] * len(children_recordings)

  # create .csv
  recordings = {
      'experiment': rec_experiment,
      'child_id': children_recordings,
      'date_iso': recording_date,
      'start_time': start_time,
      'recording_device_type': recording_device,
      'recording_filename': recording_names,
  }
  children = {
      'experiment': child_experiment,
      'child_id': children,
      'child_dob': child_dob,
      'dob_criterion': dob_criterion,
      'dob_accuracy': dob_accuracy,
  }
  df_recordings = pd.DataFrame(data=recordings)
  df_children = pd.DataFrame(data=children)
  # Both tables go to the metadata folder of the dataset. The children path
  # used to start with an extra '/', which turned a relative corpus path into
  # an absolute one.
  df_children.to_csv(f'{path_corpus}/{name_experiment}/metadata/children.csv', index=False)
  df_recordings.to_csv(f'{path_corpus}/{name_experiment}/metadata/recordings.csv', index=False)

  # recordings should be put directly to 'recordings/raw' without intermediate folders
  change_directory(f'{path_corpus}/{name_corpus}')
  print(f'Your corpus {name_corpus} has been created.')