123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234 |
import argparse
import csv
import multiprocessing as mp
import os
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from lxml import etree
def extract_from_regex(pattern, subject):
    """Return the first capture group of ``pattern`` found in ``subject``.

    :param pattern: compiled regular expression with at least one group
    :param subject: string to search
    :return: contents of group 1, or the empty string when there is no match
    """
    found = pattern.search(subject)
    if found is None:
        return ''
    return found.group(1)
def recording_date(row):
    """Set ``row['date_iso']`` to the child's date of birth shifted forward by ``row['age']`` months.

    :param row: mapping with ``child_dob`` ('%Y-%m-%d' string) and ``age`` (months)
    :return: the same row, with ``date_iso`` filled in
    """
    dob = datetime.strptime(row['child_dob'], '%Y-%m-%d')
    recorded_on = dob + relativedelta(months = row['age'])
    row['date_iso'] = recorded_on.strftime('%Y-%m-%d')
    return row
def fake_dob(row):
    """Derive ``row['child_dob']`` by shifting ``row['date_iso']`` back by ``row['age']`` months.

    The result is left-padded with '0' to the width of '1000-01-01' because
    ``strftime`` may render years below 1000 with fewer than four digits.

    :param row: mapping with ``date_iso`` ('%Y-%m-%d' string) and ``age`` (months)
    :return: the same row, with ``child_dob`` filled in
    """
    recorded_on = datetime.strptime(row['date_iso'], '%Y-%m-%d')
    dob = recorded_on - relativedelta(months = row['age'])
    row['child_dob'] = dob.strftime('%Y-%m-%d').rjust(len('1000-01-01'), '0')
    return row
def age(date, dob, unit = 'month'):
    """Age at ``date`` of a child born on ``dob``, in the requested unit.

    :param date: reference date (datetime)
    :param dob: date of birth (datetime)
    :param unit: one of 'year', 'month' or 'day', defaults to 'month'
    :type unit: str, optional
    :return: the age as an integer
    :raises ValueError: if ``unit`` is not supported

    Bug fix: the 'day' branch used to be a bare ``return`` (always ``None``);
    it now returns the number of elapsed days. Unknown units used to return
    ``None`` silently; they now raise.
    """
    if unit == 'year':
        return relativedelta(date, dob).years
    elif unit == 'month':
        delta = relativedelta(date, dob)
        return delta.years*12 + delta.months
    elif unit == 'day':
        # relativedelta's .days is only the sub-month remainder; the total
        # elapsed days come from plain datetime subtraction
        return (date - dob).days
    raise ValueError("unsupported unit '{}' (expected 'year', 'month' or 'day')".format(unit))
-
class MetadataImporter:
    """Build children/recordings metadata tables from LENA .its files.

    Fixes over the previous revision:
    - ``parse_cli`` now copies ``--split-sessions`` back to the instance and
      exposes ``--weekend-flag`` (both were silently ignored before);
    - ``recover_recording`` no longer lets the annotation's stale ``age``
      overwrite a freshly recomputed one when ``recompute_ages`` is set;
    - the multiprocessing pool is released via a ``with`` block;
    - bare ``except:`` clauses narrowed to the exceptions actually expected.
    """

    def __init__(self,
        experiment: str,
        metadata: str,
        preserve_dates: bool = False,
        recompute_ages: bool = False,
        preserve_metadata: bool = False,
        split_sessions: bool = False,
        weekend_flag: bool = False):
        """MetadataImporter

        :param experiment: corpus name
        :type experiment: str
        :param metadata: path to existing metadata csv file
        :type metadata: str
        :param preserve_dates: preserve true dates instead of setting children date of birth to 1000-01-01 and all recording dates accordingly, defaults to False
        :type preserve_dates: bool, optional
        :param recompute_ages: extract children date of birth from the its and recompute ages, defaults to False
        :type recompute_ages: bool, optional
        :param preserve_metadata: preserve languages spoken metadata into children.csv instead of moving it to a confidential section, defaults to False
        :type preserve_metadata: bool, optional
        :param split_sessions: split its files into 1 session per day, defaults to False
        :type split_sessions: bool, optional
        :param weekend_flag: add a boolean flag for week-end days in recordings.csv, defaults to False
        :type weekend_flag: bool, optional
        """
        self.children = pd.DataFrame()
        self.recordings = pd.DataFrame()

        self.experiment = experiment
        self.metadata = metadata
        self.preserve_dates = bool(preserve_dates)
        self.recompute_ages = bool(recompute_ages)
        self.preserve_metadata = bool(preserve_metadata)
        self.split_sessions = bool(split_sessions)
        self.weekend_flag = bool(weekend_flag)

    def parse_cli(self):
        """Override the constructor options with values parsed from the command line."""
        parser = argparse.ArgumentParser(description = 'Extract the metadata')
        parser.add_argument('experiment', help = 'collection name')
        parser.add_argument('metadata', help = 'path to original metadata')
        parser.add_argument('--preserve-dates', help = 'preserve true dates', action = 'store_true')
        parser.add_argument('--recompute-ages', help = 'extract children date of birth from the its and recompute ages', action = 'store_true')
        parser.add_argument('--preserve-metadata', help = 'preserve metadata such as languages spoken', action = 'store_true')
        parser.add_argument('--split-sessions', help = 'split its files into 1 session per day', action = 'store_true')
        parser.add_argument('--weekend-flag', help = 'add a boolean flag for week-end days in recordings.csv', action = 'store_true')
        self.args = parser.parse_args()
        self.experiment = self.args.experiment
        self.metadata = self.args.metadata
        self.preserve_dates = self.args.preserve_dates
        self.recompute_ages = self.args.recompute_ages
        self.preserve_metadata = self.args.preserve_metadata
        # these two were parsed (or documented) but never copied back before
        self.split_sessions = self.args.split_sessions
        self.weekend_flag = self.args.weekend_flag

    def recover_recording(self, annotation):
        """Parse one .its file and return its recordings as a DataFrame.

        :param annotation: dict with at least ``its_filename`` and
            ``recording_filename``; its keys are merged into every output row
        :return: one row per <Recording> element, empty DataFrame on parse failure
        """
        its_directory = 'annotations/its/{}raw'.format(
            'confidential/' if os.path.exists('annotations/its/confidential/raw') else ''
        )
        try:
            xml = etree.parse(os.path.join(its_directory, annotation['its_filename']))
        except Exception as e:
            # best-effort: skip unreadable its files rather than abort the pool
            print(str(e))
            return pd.DataFrame()

        timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
        annotation_recordings = xml.xpath('/ITS/ProcessingUnit/Recording')

        # recover the timezone offset (seconds): prefer the explicit attribute,
        # fall back to the LocalTime/UTCTime difference, else assume UTC
        try:
            timezone_delta = int(xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/Audio/TimeZone')[0].get('StandardSecondsOffset'))
        except (IndexError, TypeError, ValueError):
            try:
                transfer_time = xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/TransferTime')[0]
                local_time = transfer_time.get('LocalTime')
                utc_time = transfer_time.get('UTCTime')
                timezone_delta = (datetime.strptime(local_time, '%Y-%m-%dT%H:%M:%S')-datetime.strptime(utc_time, '%Y-%m-%dT%H:%M:%S')).total_seconds()
            except Exception:
                print('could not recover timezone')
                timezone_delta = 0

        dob = xml.xpath('/ITS/ProcessingUnit/ChildInfo')[0].get('dob')
        child_dob = datetime.strptime(dob, '%Y-%m-%d') if dob else None

        first_date = None
        n_session = 1
        recordings = []
        for rec in annotation_recordings:
            date = datetime.strptime(rec.get('startClockTime')[:19], '%Y-%m-%dT%H:%M:%S')
            # a 00:00 clock time is taken to mean the recorder clock was unset
            time_is_real = date.strftime('%H:%M') != '00:00'
            if time_is_real:
                date += timedelta(seconds = timezone_delta)
            if first_date is None:
                first_date = date

            duration_seconds = float(extract_from_regex(timestamp_pattern, rec.get('endTime'))) \
                - float(extract_from_regex(timestamp_pattern, rec.get('startTime')))
            # round to avoid edge cases where float will get truncated to lower int
            duration = round(duration_seconds*1000)

            if self.split_sessions:
                # a new calendar day starts a new session
                if first_date and first_date.strftime('%Y-%m-%d') != date.strftime('%Y-%m-%d'):
                    first_date = date
                    n_session += 1
                session_id = os.path.splitext(annotation['its_filename'])[0] + '_' + str(n_session)
            else:
                session_id = os.path.splitext(annotation['its_filename'])[0]

            recording = {
                'duration': duration,
                'date_iso': date.strftime('%Y-%m-%d'),
                'start_time': date.strftime('%H:%M') if time_is_real else 'NA',
                'child_dob': child_dob,
                'session_id': session_id,
                'session_offset': int(1000*(date-first_date).total_seconds())
            }
            if self.weekend_flag:
                recording['weekend'] = date.weekday() >= 5

            recording.update(annotation)
            # set the age AFTER merging the annotation metadata: the previous
            # revision recomputed it first and then let update() clobber it
            if self.recompute_ages:
                recording['age'] = age(date, child_dob)
            elif 'age' not in recording:
                raise Exception('ages not found, try with recompute_ages = True')

            recording['recording_filename'] += '_{}'.format(rec.get('num'))
            recordings.append(recording)

        return pd.DataFrame(recordings)

    def process(self, its):
        """Build the children and recordings tables from the its metadata.

        :param its: DataFrame with at least ``child_id``, ``its_filename`` and ``age``
        :return: (children, recordings) DataFrames
        """
        self.children = its.copy()
        annotations = its.copy()
        annotations = annotations[['child_id', 'its_filename', 'age']]
        annotations['recording_filename'] = annotations['its_filename'].apply(lambda s: os.path.splitext(s)[0])

        # parse every its file in parallel; the with-block releases the pool
        # (the previous revision leaked it)
        with mp.Pool(processes = mp.cpu_count()) as pool:
            self.recordings = pool.map(self.recover_recording, annotations.to_dict(orient = 'records'))
        self.recordings = pd.concat(self.recordings)
        self.recordings['recording_device_type'] = 'lena'
        self.recordings['experiment'] = self.experiment

        self.children.drop_duplicates('child_id', inplace = True, keep = 'first')
        self.children.set_index('child_id', inplace = True)

        if not self.preserve_dates:
            # anonymize: pin the earliest recording of each child to
            # 1000-01-01, extrapolate a fake dob from it, then re-derive every
            # recording date so the ages stay consistent
            self.recordings['date_iso'] = '1000-01-01'
            self.children = self.children.merge(self.recordings.groupby('child_id').agg({'date_iso': 'min'}), left_index = True, right_index = True)
            self.children = self.children.apply(fake_dob, axis = 1)
            self.recordings = self.recordings.set_index('child_id')\
                .drop(columns = 'child_dob')\
                .merge(self.children[['child_dob']], left_index = True, right_index = True)\
                .reset_index()
            self.recordings = self.recordings.apply(recording_date, axis = 1)
            self.children['dob_criterion'] = 'extrapolated'
            self.children['dob_accuracy'] = 'day' # actually that is only true is the dates in input were not rounded already
        elif 'child_dob' not in self.children.columns:
            # true dates kept but no dob in the input: take it from the its
            self.children = self.children.merge(
                self.recordings.drop_duplicates('child_id', keep = 'first').set_index('child_id')[['child_dob']],
                how = 'left',
                left_index = True,
                right_index = True
            )

        self.recordings.drop(columns = list(set(self.recordings.columns) & {'age', 'child_dob'}), inplace = True)
        self.children['experiment'] = self.experiment
        self.children.index = self.children.index.astype(str)
        self.recordings['child_id'] = self.recordings['child_id'].astype(str)
        self.recordings['session_id'] = self.recordings['session_id'].astype(str)
        self.recordings.index = self.recordings.index.astype(str)

        if not self.preserve_metadata:
            # move languages spoken out of the public children.csv
            confidential = self.children[['languages']]
            confidential.to_csv('metadata/confidential/children.csv', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
            self.children.drop(columns = ['languages'], inplace = True)

        try:
            self.children['ses'] = self.children['ses'].astype(int)
        except (ValueError, TypeError):
            # non-numeric ses values: keep the digits, blank out the rest
            self.children['ses'] = self.children['ses'].astype(str).apply(lambda x: np.where(x.isdigit(), x, 'NA'))

        return self.children, self.recordings
|