metadata.py 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. import csv
  2. import multiprocessing as mp
  3. import os
  4. import re
  5. from datetime import datetime, timedelta
  6. import numpy as np
  7. import pandas as pd
  8. from dateutil.relativedelta import relativedelta
  9. from lxml import etree
  10. def extract_from_regex(pattern, subject):
  11. match = pattern.search(subject)
  12. return match.group(1) if match else ''
  13. def recording_date(row):
  14. row['date_iso'] = (
  15. datetime.strptime(row['child_dob'], '%Y-%m-%d') + relativedelta(months = row['age'])
  16. ).strftime('%Y-%m-%d')
  17. return row
  18. def fake_dob(row):
  19. row['child_dob'] = (datetime.strptime(row['date_iso'], '%Y-%m-%d') - relativedelta(months = row['age'])).strftime('%Y-%m-%d')
  20. row['child_dob'] = row['child_dob'].rjust(len('1000-01-01'), '0')
  21. return row
  22. class MetadataImporter:
  23. def __init__(self,
  24. experiment,
  25. metadata,
  26. preserve_dates = False,
  27. recompute_ages = False,
  28. preserve_metadata = False,
  29. split_sessions = False,
  30. weekend_flag = False):
  31. self.children = pd.DataFrame()
  32. self.recordings = pd.DataFrame()
  33. self.experiment = experiment
  34. self.metadata = metadata
  35. self.preserve_dates = bool(preserve_dates)
  36. self.recompute_ages = bool(recompute_ages)
  37. self.preserve_metadata = bool(preserve_metadata)
  38. self.split_sessions = bool(split_sessions)
  39. self.weekend_flag = bool(weekend_flag)
  40. def parse_cli(self):
  41. parser = argparse.ArgumentParser(description = 'Extract the metadata')
  42. parser.add_argument('experiment', help = 'collection name')
  43. parser.add_argument('metadata', help = 'path to original metadata')
  44. parser.add_argument('--preserve-dates', help = 'preserve true dates', action = 'store_true')
  45. parser.add_argument('--recompute-ages', help = 'extract children date of birth from the its and recompute ages', action = 'store_true')
  46. parser.add_argument('--preserve-metadata', help = 'preserve metadata such as languages spoken', action = 'store_true')
  47. parser.add_argument('--split-sessions', help = 'split its files into 1 session per day', action = 'store_true')
  48. self.args = parser.parse_args()
  49. self.experiment = self.args.experiment
  50. self.metadata = self.args.metadata
  51. self.preserve_dates = self.args.preserve_dates
  52. self.recompute_ages = self.args.recompute_ages
  53. self.preserve_metadata = self.args.preserve_metadata
  54. def age(date, dob, unit = 'month'):
  55. delta = relativedelta(date, dob)
  56. if unit == 'year':
  57. return delta.years
  58. elif unit == 'month':
  59. return delta.years*12 + delta.months
  60. elif unit == 'day':
  61. return
  62. def recover_recording(self, annotation):
  63. try:
  64. xml = etree.parse(os.path.join('annotations/its/{}raw'.format('confidential/' if os.path.exists('annotations/its/confidential/raw') else ''), annotation['its_filename']))
  65. except Exception as e:
  66. print(str(e))
  67. return pd.DataFrame()
  68. timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
  69. annotation_recordings = xml.xpath('/ITS/ProcessingUnit/Recording')
  70. try:
  71. timezone_delta = int(xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/Audio/TimeZone')[0].get('StandardSecondsOffset'))
  72. except:
  73. try:
  74. local_time = xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/TransferTime')[0].get('LocalTime')
  75. utc_time = xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/TransferTime')[0].get('UTCTime')
  76. timezone_delta = (datetime.strptime(local_time, '%Y-%m-%dT%H:%M:%S')-datetime.strptime(utc_time, '%Y-%m-%dT%H:%M:%S')).total_seconds()
  77. except Exception as e:
  78. print('could not recover timezone')
  79. timezone_delta = 0
  80. dob = xml.xpath('/ITS/ProcessingUnit/ChildInfo')[0].get('dob')
  81. if dob:
  82. child_dob = datetime.strptime(dob, '%Y-%m-%d')
  83. else:
  84. child_dob = None
  85. first_date = None
  86. n_session = 1
  87. recordings = []
  88. for rec in annotation_recordings:
  89. date = datetime.strptime(rec.get('startClockTime')[:19], '%Y-%m-%dT%H:%M:%S')
  90. time_is_real = date.strftime('%H:%M') != '00:00'
  91. if time_is_real:
  92. date += timedelta(seconds = timezone_delta)
  93. if first_date is None:
  94. first_date = date
  95. duration_seconds = float(extract_from_regex(timestamp_pattern, rec.get('endTime')))-float(extract_from_regex(timestamp_pattern, rec.get('startTime')))
  96. duration = int(duration_seconds*1000)
  97. if self.split_sessions:
  98. if first_date and first_date.strftime('%Y-%m-%d') != date.strftime('%Y-%m-%d'):
  99. first_date = date
  100. n_session += 1
  101. session_id = os.path.splitext(annotation['its_filename'])[0] + '_' + str(n_session)
  102. else:
  103. session_id = os.path.splitext(annotation['its_filename'])[0]
  104. recording = {
  105. 'duration': duration,
  106. 'date_iso': date.strftime('%Y-%m-%d'),
  107. 'start_time': date.strftime('%H:%M') if time_is_real else 'NA',
  108. 'child_dob': child_dob,
  109. 'session_id': session_id,
  110. 'session_offset': int(1000*(date-first_date).total_seconds())
  111. }
  112. if self.weekend_flag:
  113. recording['weekend'] = date.weekday() >= 5
  114. if self.recompute_ages:
  115. recording['age'] = age(date, child_dob)
  116. elif 'age' in annotation:
  117. recording['age'] = annotation['age']
  118. else:
  119. raise Exception('ages not found, try with recompute_ages = True')
  120. recording.update(annotation)
  121. recording['recording_filename'] += '_{}'.format(rec.get('num'))
  122. recordings.append(recording)
  123. return pd.DataFrame(recordings)
  124. def process(self, its):
  125. self.children = its.copy()
  126. annotations = its.copy()
  127. annotations = annotations[['child_id', 'its_filename', 'age']]
  128. annotations['recording_filename'] = annotations['its_filename'].apply(lambda s: os.path.splitext(s)[0])
  129. pool = mp.Pool(processes = mp.cpu_count())
  130. self.recordings = pool.map(self.recover_recording, annotations.to_dict(orient = 'records'))
  131. self.recordings = pd.concat(self.recordings)
  132. self.recordings['recording_device_type'] = 'lena'
  133. self.recordings['experiment'] = self.experiment
  134. self.children.drop_duplicates('child_id', inplace = True, keep = 'first')
  135. self.children.set_index('child_id', inplace = True)
  136. if not self.preserve_dates:
  137. self.recordings['date_iso'] = '1000-01-01'
  138. self.children = self.children.merge(self.recordings.groupby('child_id').agg({'date_iso': 'min'}), left_index = True, right_index = True)
  139. self.children = self.children.apply(fake_dob, axis = 1)
  140. self.recordings = self.recordings.set_index('child_id')\
  141. .drop(columns = 'child_dob')\
  142. .merge(self.children[['child_dob']], left_index = True, right_index = True)\
  143. .reset_index()
  144. self.recordings = self.recordings.apply(recording_date, axis = 1)
  145. self.children['dob_criterion'] = 'extrapolated'
  146. self.children['dob_accuracy'] = 'month'
  147. elif 'child_dob' not in self.children.columns:
  148. self.children = self.children.merge(
  149. self.recordings.drop_duplicates('child_id', keep = 'first').set_index('child_id')[['child_dob']],
  150. how = 'left',
  151. left_index = True,
  152. right_index = True
  153. )
  154. self.recordings.drop(columns = set(self.recordings.columns) & {'age', 'child_dob'}, inplace = True)
  155. self.children['experiment'] = self.experiment
  156. self.children['child_id'] = self.children['child_id'].astype(str)
  157. self.recordings['child_id'] = self.recordings['child_id'].astype(str)
  158. self.recordings['session_id'] = self.recordings['session_id'].astype(str)
  159. self.recordings['recording_filename'] = self.recordings['recording_filename'].astype(str)
  160. if not self.preserve_metadata:
  161. confidential = self.children[['languages']]
  162. confidential.to_csv('metadata/confidential/children.csv', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
  163. self.children.drop(columns = ['languages'], inplace = True)
  164. try:
  165. self.children['ses'] = self.children['ses'].astype(int)
  166. except:
  167. self.children['ses'] = self.children['ses'].astype(str).apply(lambda x: np.where(x.isdigit(), x, 'NA'))
  168. return self.children, self.recordings