# LAAC-LSCP / tools
import argparse
import csv
import multiprocessing as mp
import os
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from lxml import etree

def extract_from_regex(pattern, subject):
    """Return capture group 1 of the first match of `pattern` in `subject`.

    `pattern` is a compiled regular expression. An empty string is returned
    when `subject` contains no match at all.
    """
    found = pattern.search(subject)
    if not found:
        return ''
    return found.group(1)

def recording_date(row):
    """Set `row['date_iso']` to `row['child_dob']` shifted forward by `row['age']` months.

    `child_dob` is read as a '%Y-%m-%d' string and `date_iso` is written in
    the same format. The row is modified in place and returned, which is the
    shape `DataFrame.apply(..., axis = 1)` expects.
    """
    birth = datetime.strptime(row['child_dob'], '%Y-%m-%d')
    recorded_on = birth + relativedelta(months = row['age'])
    row['date_iso'] = recorded_on.strftime('%Y-%m-%d')
    return row

def fake_dob(row):
    """Set `row['child_dob']` to `row['date_iso']` shifted back by `row['age']` months.

    `date_iso` is read as a '%Y-%m-%d' string. The result is written in the
    same format, left-padded with zeros to the width of '1000-01-01'.
    The row is modified in place and returned.
    """
    recorded_on = datetime.strptime(row['date_iso'], '%Y-%m-%d')
    dob_text = (recorded_on - relativedelta(months = row['age'])).strftime('%Y-%m-%d')
    # NOTE(review): the padding presumably compensates for strftime emitting
    # years below 1000 without leading zeros on some platforms -- confirm.
    row['child_dob'] = dob_text.rjust(len('1000-01-01'), '0')
    return row

class MetadataImporter:
    """Build `children` and `recordings` metadata tables from .its annotation files.

    For every input row the importer parses the matching .its file, emits one
    entry per `Recording` element found in it, and assembles two dataframes:
    `self.children` (indexed by `child_id`) and `self.recordings`.
    Unless `preserve_dates` is set, recording dates are replaced with
    placeholder dates starting at 1000-01-01 and dates of birth are
    extrapolated from the ages, so only relative timing is kept.
    """

    def __init__(self,
        experiment,
        metadata,
        preserve_dates = False,
        recompute_ages = False,
        preserve_metadata = False,
        split_sessions = False,
        weekend_flag = False):
        """Store the importer settings.

        Args:
            experiment: collection name, written to the `experiment` column
                of both output tables.
            metadata: path to the original metadata (stored as-is; not read
                by this class).
            preserve_dates: keep the true recording dates instead of
                replacing them with placeholder dates.
            recompute_ages: compute ages from the date of birth found in the
                .its file instead of using the `age` input column.
            preserve_metadata: keep the `languages` column in the children
                table instead of moving it to the confidential file.
            split_sessions: start a new session id whenever the calendar day
                changes between two recordings of the same .its file.
            weekend_flag: add a boolean `weekend` column to the recordings.
        """
        self.children = pd.DataFrame()
        self.recordings = pd.DataFrame()

        self.experiment = experiment
        self.metadata = metadata
        self.preserve_dates = bool(preserve_dates)
        self.recompute_ages = bool(recompute_ages)
        self.preserve_metadata = bool(preserve_metadata)
        self.split_sessions = bool(split_sessions)
        self.weekend_flag = bool(weekend_flag)

    def parse_cli(self):
        """Overwrite the importer settings with values parsed from the command line.

        The parsed namespace is kept in `self.args`. `weekend_flag` has no
        command-line option and keeps the value given to the constructor.
        """
        parser = argparse.ArgumentParser(description = 'Extract the metadata')
        parser.add_argument('experiment', help = 'collection name')
        parser.add_argument('metadata', help = 'path to original metadata')
        parser.add_argument('--preserve-dates', help = 'preserve true dates', action = 'store_true')
        parser.add_argument('--recompute-ages', help = 'extract children date of birth from the its and recompute ages', action = 'store_true')
        parser.add_argument('--preserve-metadata', help = 'preserve metadata such as languages spoken', action = 'store_true')
        parser.add_argument('--split-sessions', help = 'split its files into 1 session per day', action = 'store_true')
        self.args = parser.parse_args()

        self.experiment = self.args.experiment
        self.metadata = self.args.metadata
        self.preserve_dates = self.args.preserve_dates
        self.recompute_ages = self.args.recompute_ages
        self.preserve_metadata = self.args.preserve_metadata
        # The option was parsed but never applied before, so it had no effect.
        self.split_sessions = self.args.split_sessions

    @staticmethod
    def age(date, dob, unit = 'month'):
        """Return the age at `date` of a child born on `dob`.

        Args:
            date: datetime of the observation.
            dob: datetime of birth.
            unit: 'year' or 'month' for the number of completed years or
                months, 'day' for the number of elapsed days.

        Returns:
            The age as an integer expressed in `unit`.

        Raises:
            ValueError: if `unit` is not one of 'year', 'month' or 'day'.
        """
        if unit == 'day':
            return (date - dob).days

        delta = relativedelta(date, dob)

        if unit == 'year':
            return delta.years
        if unit == 'month':
            return delta.years*12 + delta.months

        raise ValueError("unknown age unit '{}', expected 'year', 'month' or 'day'".format(unit))

    def recover_recording(self, annotation):
        """Extract one row per `Recording` element of a single .its file.

        Args:
            annotation: dict describing the file. `its_filename` is required;
                `age` is required unless `recompute_ages` is set. Every other
                key is copied into each output row, and `recording_filename`
                (defaulting to the .its file stem) is suffixed with the
                recording number.

        Returns:
            A DataFrame with one row per recording, or an empty DataFrame if
            the .its file could not be parsed (the error is printed).

        Raises:
            Exception: if no age is available and `recompute_ages` is unset.
            ValueError: if `recompute_ages` is set but the .its file does not
                provide a date of birth.
        """
        # Files are looked up under the confidential directory when it exists.
        its_dir = 'annotations/its/{}raw'.format('confidential/' if os.path.exists('annotations/its/confidential/raw') else '')

        try:
            xml = etree.parse(os.path.join(its_dir, annotation['its_filename']))
        except Exception as e:
            # Best effort: an unreadable file contributes no recordings.
            print(str(e))
            return pd.DataFrame()

        # Matches durations such as 'PT12.50S' and captures the number of seconds.
        timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")

        annotation_recordings = xml.xpath('/ITS/ProcessingUnit/Recording')
        try:
            timezone_delta = int(xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/Audio/TimeZone')[0].get('StandardSecondsOffset'))
        except Exception:
            # Fall back to the difference between local and UTC transfer times.
            try:
                transfer_time = xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/TransferTime')[0]
                local_time = transfer_time.get('LocalTime')
                utc_time = transfer_time.get('UTCTime')
                timezone_delta = (datetime.strptime(local_time, '%Y-%m-%dT%H:%M:%S')-datetime.strptime(utc_time, '%Y-%m-%dT%H:%M:%S')).total_seconds()
            except Exception:
                print('could not recover timezone')
                timezone_delta = 0

        child_info = xml.xpath('/ITS/ProcessingUnit/ChildInfo')
        dob = child_info[0].get('dob') if child_info else None
        child_dob = datetime.strptime(dob, '%Y-%m-%d') if dob else None

        session_stem = os.path.splitext(annotation['its_filename'])[0]

        first_date = None
        n_session = 1
        recordings = []

        for rec in annotation_recordings:
            date = datetime.strptime(rec.get('startClockTime')[:19], '%Y-%m-%dT%H:%M:%S')
            # A clock time of exactly 00:00 is treated as "time unknown":
            # it is neither shifted by the timezone nor reported as start_time.
            time_is_real = date.strftime('%H:%M') != '00:00'

            if time_is_real:
                date += timedelta(seconds = timezone_delta)

            if first_date is None:
                first_date = date

            duration_seconds = float(extract_from_regex(timestamp_pattern, rec.get('endTime')))-float(extract_from_regex(timestamp_pattern, rec.get('startTime')))
            duration = int(duration_seconds*1000)

            if self.split_sessions:
                # A change of calendar day opens a new session, and offsets
                # are then counted from the first recording of that session.
                if first_date.date() != date.date():
                    first_date = date
                    n_session += 1

                session_id = session_stem + '_' + str(n_session)
            else:
                session_id = session_stem

            if self.recompute_ages:
                if child_dob is None:
                    # relativedelta(date, None) is an empty delta, which would
                    # silently yield an age of 0.
                    raise ValueError('cannot recompute ages: no date of birth in {}'.format(annotation['its_filename']))
                recording_age = self.age(date, child_dob)
            elif 'age' in annotation:
                recording_age = annotation['age']
            else:
                raise Exception('ages not found, try with recompute_ages = True')

            recording = {
                'duration': duration,
                'date_iso': date.strftime('%Y-%m-%d'),
                'start_time': date.strftime('%H:%M') if time_is_real else 'NA',
                'child_dob': child_dob,
                'session_id': session_id,
                'session_offset': int(1000*(date-first_date).total_seconds())
            }

            if self.weekend_flag:
                recording['weekend'] = date.weekday() >= 5

            recording['age'] = recording_age

            # Copy the annotation's own fields, but never let its 'age'
            # overwrite the value selected above (it used to cancel
            # recompute_ages).
            recording.update((key, value) for key, value in annotation.items() if key != 'age')
            recording['recording_filename'] = '{}_{}'.format(recording.get('recording_filename', session_stem), rec.get('num'))
            recordings.append(recording)

        return pd.DataFrame(recordings)

    def process(self, its):
        """Build the children and recordings tables from the .its listing.

        Args:
            its: DataFrame with at least the columns `child_id`,
                `its_filename`, `age` and `ses`, plus `languages` unless
                `preserve_metadata` is set.

        Returns:
            A `(children, recordings)` tuple of DataFrames; `children` is
            indexed by `child_id`. Both are also stored on the instance.

        Side effects:
            Unless `preserve_metadata` is set, the `languages` column is
            written to 'metadata/confidential/children.csv' and removed from
            the children table.
        """
        self.children = its.copy()
        annotations = its.copy()
        annotations = annotations[['child_id', 'its_filename', 'age']]
        annotations['recording_filename'] = annotations['its_filename'].apply(lambda s: os.path.splitext(s)[0])

        # The context manager releases the worker processes once map() is done.
        with mp.Pool(processes = mp.cpu_count()) as pool:
            per_file_recordings = pool.map(self.recover_recording, annotations.to_dict(orient = 'records'))
        self.recordings = pd.concat(per_file_recordings)

        self.recordings['recording_device_type'] = 'lena'
        self.recordings['experiment'] = self.experiment

        self.children.drop_duplicates('child_id', inplace = True, keep = 'first')
        self.children.set_index('child_id', inplace = True)

        if not self.preserve_dates:
            # Anonymize: pin every recording to a placeholder date, derive a
            # fake date of birth from the child's age, then rebuild each
            # recording date from that fake date of birth and its own age.
            self.recordings['date_iso'] = '1000-01-01'

            self.children = self.children.merge(self.recordings.groupby('child_id').agg({'date_iso': 'min'}), left_index = True, right_index = True)
            self.children = self.children.apply(fake_dob, axis = 1)

            self.recordings = self.recordings.set_index('child_id')\
                .drop(columns = 'child_dob')\
                .merge(self.children[['child_dob']], left_index = True, right_index = True)\
                .reset_index()

            self.recordings = self.recordings.apply(recording_date, axis = 1)

            self.children['dob_criterion'] = 'extrapolated'
            self.children['dob_accuracy'] = 'month'

        elif 'child_dob' not in self.children.columns:
            # Take the date of birth from the first recording of each child.
            self.children = self.children.merge(
                self.recordings.drop_duplicates('child_id', keep = 'first').set_index('child_id')[['child_dob']],
                how = 'left',
                left_index = True,
                right_index = True
            )

        self.recordings.drop(
            columns = [column for column in ('age', 'child_dob') if column in self.recordings.columns],
            inplace = True
        )

        self.children['experiment'] = self.experiment

        # child_id is the index of the children table at this point, not a
        # column, so the cast has to be applied to the index.
        self.children.index = self.children.index.astype(str)
        self.recordings['child_id'] = self.recordings['child_id'].astype(str)
        self.recordings['session_id'] = self.recordings['session_id'].astype(str)
        self.recordings['recording_filename'] = self.recordings['recording_filename'].astype(str)

        if not self.preserve_metadata:
            confidential = self.children[['languages']]
            confidential.to_csv('metadata/confidential/children.csv', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
            self.children.drop(columns = ['languages'], inplace = True)

        try:
            self.children['ses'] = self.children['ses'].astype(int)
        except (ValueError, TypeError, OverflowError):
            # Some values are not integers: keep digit-only values as text
            # and mark everything else as 'NA'.
            self.children['ses'] = self.children['ses'].astype(str).apply(lambda x: x if x.isdigit() else 'NA')

        return self.children, self.recordings