Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

metadata.py 10.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234
  1. import csv
  2. import multiprocessing as mp
  3. import os
  4. import re
  5. from datetime import datetime, timedelta
  6. import numpy as np
  7. import pandas as pd
  8. from dateutil.relativedelta import relativedelta
  9. from lxml import etree
  10. def extract_from_regex(pattern, subject):
  11. match = pattern.search(subject)
  12. return match.group(1) if match else ''
  13. def recording_date(row):
  14. row['date_iso'] = (
  15. datetime.strptime(row['child_dob'], '%Y-%m-%d') + relativedelta(months = row['age'])
  16. ).strftime('%Y-%m-%d')
  17. return row
  18. def fake_dob(row):
  19. row['child_dob'] = (datetime.strptime(row['date_iso'], '%Y-%m-%d') - relativedelta(months = row['age'])).strftime('%Y-%m-%d')
  20. row['child_dob'] = row['child_dob'].rjust(len('1000-01-01'), '0')
  21. return row
  22. def age(date, dob, unit = 'month'):
  23. delta = relativedelta(date, dob)
  24. if unit == 'year':
  25. return delta.years
  26. elif unit == 'month':
  27. return delta.years*12 + delta.months
  28. elif unit == 'day':
  29. return
  30. class MetadataImporter:
  31. def __init__(self,
  32. experiment: str,
  33. metadata: str,
  34. preserve_dates: bool = False,
  35. recompute_ages: bool = False,
  36. preserve_metadata: bool = False,
  37. split_sessions: bool = False,
  38. weekend_flag: bool = False):
  39. """MetadataImporter
  40. :param experiment: corpus name
  41. :type experiment: str
  42. :param metadata: path to existing metadata csv file
  43. :type metadata: str
  44. :param preserve_dates: preserve true dates instead of setting children date of birth to 1000-01-01 and all recording dates accordingly, defaults to False
  45. :type preserve_dates: bool, optional
  46. :param recompute_ages: extract children date of birth from the its and recompute ages, defaults to False
  47. :type recompute_ages: bool, optional
  48. :param preserve_metadata: preserve languages spoken metadata into children.csv instead of moving it to a confidential section, defaults to False
  49. :type preserve_metadata: bool, optional
  50. :param split_sessions: split its files into 1 session per day, defaults to False
  51. :type split_sessions: bool, optional
  52. :param weekend_flag: add a boolean flag for week-end days in recordings.csv, defaults to False
  53. :type weekend_flag: bool, optional
  54. """
  55. self.children = pd.DataFrame()
  56. self.recordings = pd.DataFrame()
  57. self.experiment = experiment
  58. self.metadata = metadata
  59. self.preserve_dates = bool(preserve_dates)
  60. self.recompute_ages = bool(recompute_ages)
  61. self.preserve_metadata = bool(preserve_metadata)
  62. self.split_sessions = bool(split_sessions)
  63. self.weekend_flag = bool(weekend_flag)
  64. def parse_cli(self):
  65. parser = argparse.ArgumentParser(description = 'Extract the metadata')
  66. parser.add_argument('experiment', help = 'collection name')
  67. parser.add_argument('metadata', help = 'path to original metadata')
  68. parser.add_argument('--preserve-dates', help = 'preserve true dates', action = 'store_true')
  69. parser.add_argument('--recompute-ages', help = 'extract children date of birth from the its and recompute ages', action = 'store_true')
  70. parser.add_argument('--preserve-metadata', help = 'preserve metadata such as languages spoken', action = 'store_true')
  71. parser.add_argument('--split-sessions', help = 'split its files into 1 session per day', action = 'store_true')
  72. self.args = parser.parse_args()
  73. self.experiment = self.args.experiment
  74. self.metadata = self.args.metadata
  75. self.preserve_dates = self.args.preserve_dates
  76. self.recompute_ages = self.args.recompute_ages
  77. self.preserve_metadata = self.args.preserve_metadata
  78. def recover_recording(self, annotation):
  79. try:
  80. xml = etree.parse(os.path.join('annotations/its/{}raw'.format('confidential/' if os.path.exists('annotations/its/confidential/raw') else ''), annotation['its_filename']))
  81. except Exception as e:
  82. print(str(e))
  83. return pd.DataFrame()
  84. timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
  85. annotation_recordings = xml.xpath('/ITS/ProcessingUnit/Recording')
  86. try:
  87. timezone_delta = int(xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/Audio/TimeZone')[0].get('StandardSecondsOffset'))
  88. except:
  89. try:
  90. local_time = xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/TransferTime')[0].get('LocalTime')
  91. utc_time = xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/TransferTime')[0].get('UTCTime')
  92. timezone_delta = (datetime.strptime(local_time, '%Y-%m-%dT%H:%M:%S')-datetime.strptime(utc_time, '%Y-%m-%dT%H:%M:%S')).total_seconds()
  93. except Exception as e:
  94. print('could not recover timezone')
  95. timezone_delta = 0
  96. dob = xml.xpath('/ITS/ProcessingUnit/ChildInfo')[0].get('dob')
  97. if dob:
  98. child_dob = datetime.strptime(dob, '%Y-%m-%d')
  99. else:
  100. child_dob = None
  101. first_date = None
  102. n_session = 1
  103. recordings = []
  104. for rec in annotation_recordings:
  105. date = datetime.strptime(rec.get('startClockTime')[:19], '%Y-%m-%dT%H:%M:%S')
  106. time_is_real = date.strftime('%H:%M') != '00:00'
  107. if time_is_real:
  108. date += timedelta(seconds = timezone_delta)
  109. if first_date is None:
  110. first_date = date
  111. duration_seconds = float(extract_from_regex(timestamp_pattern, rec.get('endTime')))-float(extract_from_regex(timestamp_pattern, rec.get('startTime')))
  112. #round to avoid edge cases where float will get truncated to lower int
  113. duration = round(duration_seconds*1000)
  114. if self.split_sessions:
  115. if first_date and first_date.strftime('%Y-%m-%d') != date.strftime('%Y-%m-%d'):
  116. first_date = date
  117. n_session += 1
  118. session_id = os.path.splitext(annotation['its_filename'])[0] + '_' + str(n_session)
  119. else:
  120. session_id = os.path.splitext(annotation['its_filename'])[0]
  121. recording = {
  122. 'duration': duration,
  123. 'date_iso': date.strftime('%Y-%m-%d'),
  124. 'start_time': date.strftime('%H:%M') if time_is_real else 'NA',
  125. 'child_dob': child_dob,
  126. 'session_id': session_id,
  127. 'session_offset': int(1000*(date-first_date).total_seconds())
  128. }
  129. if self.weekend_flag:
  130. recording['weekend'] = date.weekday() >= 5
  131. if self.recompute_ages:
  132. recording['age'] = age(date, child_dob)
  133. elif 'age' in annotation:
  134. recording['age'] = annotation['age']
  135. else:
  136. raise Exception('ages not found, try with recompute_ages = True')
  137. recording.update(annotation)
  138. recording['recording_filename'] += '_{}'.format(rec.get('num'))
  139. recordings.append(recording)
  140. return pd.DataFrame(recordings)
  141. def process(self, its):
  142. self.children = its.copy()
  143. annotations = its.copy()
  144. annotations = annotations[['child_id', 'its_filename', 'age']]
  145. annotations['recording_filename'] = annotations['its_filename'].apply(lambda s: os.path.splitext(s)[0])
  146. pool = mp.Pool(processes = mp.cpu_count())
  147. self.recordings = pool.map(self.recover_recording, annotations.to_dict(orient = 'records'))
  148. self.recordings = pd.concat(self.recordings)
  149. self.recordings['recording_device_type'] = 'lena'
  150. self.recordings['experiment'] = self.experiment
  151. self.children.drop_duplicates('child_id', inplace = True, keep = 'first')
  152. self.children.set_index('child_id', inplace = True)
  153. if not self.preserve_dates:
  154. self.recordings['date_iso'] = '1000-01-01'
  155. self.children = self.children.merge(self.recordings.groupby('child_id').agg({'date_iso': 'min'}), left_index = True, right_index = True)
  156. self.children = self.children.apply(fake_dob, axis = 1)
  157. self.recordings = self.recordings.set_index('child_id')\
  158. .drop(columns = 'child_dob')\
  159. .merge(self.children[['child_dob']], left_index = True, right_index = True)\
  160. .reset_index()
  161. self.recordings = self.recordings.apply(recording_date, axis = 1)
  162. self.children['dob_criterion'] = 'extrapolated'
  163. self.children['dob_accuracy'] = 'day' # actually that is only true is the dates in input were not rounded already
  164. elif 'child_dob' not in self.children.columns:
  165. self.children = self.children.merge(
  166. self.recordings.drop_duplicates('child_id', keep = 'first').set_index('child_id')[['child_dob']],
  167. how = 'left',
  168. left_index = True,
  169. right_index = True
  170. )
  171. self.recordings.drop(columns = set(self.recordings.columns) & {'age', 'child_dob'}, inplace = True)
  172. self.children['experiment'] = self.experiment
  173. self.children.index = self.children.index.astype(str)
  174. self.recordings['child_id'] = self.recordings['child_id'].astype(str)
  175. self.recordings['session_id'] = self.recordings['session_id'].astype(str)
  176. self.recordings.index = self.recordings.index.astype(str)
  177. if not self.preserve_metadata:
  178. confidential = self.children[['languages']]
  179. confidential.to_csv('metadata/confidential/children.csv', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
  180. self.children.drop(columns = ['languages'], inplace = True)
  181. try:
  182. self.children['ses'] = self.children['ses'].astype(int)
  183. except:
  184. self.children['ses'] = self.children['ses'].astype(str).apply(lambda x: np.where(x.isdigit(), x, 'NA'))
  185. return self.children, self.recordings