123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234 |
import argparse
import csv
import multiprocessing as mp
import os
import re
from datetime import datetime, timedelta

import numpy as np
import pandas as pd
from dateutil.relativedelta import relativedelta
from lxml import etree
def extract_from_regex(pattern, subject):
    """Return the first capture group of ``pattern`` found in ``subject``.

    :param pattern: compiled regular expression with at least one group
    :param subject: string to search
    :return: contents of group 1, or the empty string when there is no match
    """
    found = pattern.search(subject)
    if found is None:
        return ''
    return found.group(1)
def recording_date(row):
    """Set ``row['date_iso']`` to the child's date of birth shifted forward by ``row['age']`` months.

    :param row: mapping with ``child_dob`` ('%Y-%m-%d' string) and ``age`` (months)
    :return: the same row, with ``date_iso`` filled in
    """
    dob = datetime.strptime(row['child_dob'], '%Y-%m-%d')
    recorded_on = dob + relativedelta(months = row['age'])
    row['date_iso'] = recorded_on.strftime('%Y-%m-%d')
    return row
def fake_dob(row):
    """Derive ``row['child_dob']`` by shifting ``row['date_iso']`` back by ``row['age']`` months.

    The result is left-padded with '0' to the width of '1000-01-01' because
    ``strftime`` may render years below 1000 with fewer than four digits.

    :param row: mapping with ``date_iso`` ('%Y-%m-%d' string) and ``age`` (months)
    :return: the same row, with ``child_dob`` filled in
    """
    recorded_on = datetime.strptime(row['date_iso'], '%Y-%m-%d')
    dob = recorded_on - relativedelta(months = row['age'])
    row['child_dob'] = dob.strftime('%Y-%m-%d').rjust(len('1000-01-01'), '0')
    return row
def age(date, dob, unit = 'month'):
    """Age at ``date`` of a child born on ``dob``, in the requested unit.

    :param date: reference date (datetime)
    :param dob: date of birth (datetime)
    :param unit: one of 'year', 'month' or 'day', defaults to 'month'
    :type unit: str, optional
    :return: the age as an integer
    :raises ValueError: if ``unit`` is not supported

    Bug fix: the 'day' branch used to be a bare ``return`` (always ``None``);
    it now returns the number of elapsed days. Unknown units used to return
    ``None`` silently; they now raise.
    """
    if unit == 'year':
        return relativedelta(date, dob).years
    elif unit == 'month':
        delta = relativedelta(date, dob)
        return delta.years*12 + delta.months
    elif unit == 'day':
        # relativedelta's .days is only the sub-month remainder; the total
        # elapsed days come from plain datetime subtraction
        return (date - dob).days
    raise ValueError("unsupported unit '{}' (expected 'year', 'month' or 'day')".format(unit))
-
class MetadataImporter:
    """Build children/recordings metadata tables from LENA .its files.

    Fixes over the previous revision:
    - ``parse_cli`` now copies ``--split-sessions`` back to the instance and
      exposes ``--weekend-flag`` (both were silently ignored before);
    - ``recover_recording`` no longer lets the annotation's stale ``age``
      overwrite a freshly recomputed one when ``recompute_ages`` is set;
    - the multiprocessing pool is released via a ``with`` block;
    - bare ``except:`` clauses narrowed to the exceptions actually expected.
    """

    def __init__(self,
        experiment: str,
        metadata: str,
        preserve_dates: bool = False,
        recompute_ages: bool = False,
        preserve_metadata: bool = False,
        split_sessions: bool = False,
        weekend_flag: bool = False):
        """MetadataImporter

        :param experiment: corpus name
        :type experiment: str
        :param metadata: path to existing metadata csv file
        :type metadata: str
        :param preserve_dates: preserve true dates instead of setting children date of birth to 1000-01-01 and all recording dates accordingly, defaults to False
        :type preserve_dates: bool, optional
        :param recompute_ages: extract children date of birth from the its and recompute ages, defaults to False
        :type recompute_ages: bool, optional
        :param preserve_metadata: preserve languages spoken metadata into children.csv instead of moving it to a confidential section, defaults to False
        :type preserve_metadata: bool, optional
        :param split_sessions: split its files into 1 session per day, defaults to False
        :type split_sessions: bool, optional
        :param weekend_flag: add a boolean flag for week-end days in recordings.csv, defaults to False
        :type weekend_flag: bool, optional
        """
        self.children = pd.DataFrame()
        self.recordings = pd.DataFrame()

        self.experiment = experiment
        self.metadata = metadata
        self.preserve_dates = bool(preserve_dates)
        self.recompute_ages = bool(recompute_ages)
        self.preserve_metadata = bool(preserve_metadata)
        self.split_sessions = bool(split_sessions)
        self.weekend_flag = bool(weekend_flag)

    def parse_cli(self):
        """Override the constructor options with values parsed from the command line."""
        parser = argparse.ArgumentParser(description = 'Extract the metadata')
        parser.add_argument('experiment', help = 'collection name')
        parser.add_argument('metadata', help = 'path to original metadata')
        parser.add_argument('--preserve-dates', help = 'preserve true dates', action = 'store_true')
        parser.add_argument('--recompute-ages', help = 'extract children date of birth from the its and recompute ages', action = 'store_true')
        parser.add_argument('--preserve-metadata', help = 'preserve metadata such as languages spoken', action = 'store_true')
        parser.add_argument('--split-sessions', help = 'split its files into 1 session per day', action = 'store_true')
        parser.add_argument('--weekend-flag', help = 'add a boolean flag for week-end days in recordings.csv', action = 'store_true')
        self.args = parser.parse_args()
        self.experiment = self.args.experiment
        self.metadata = self.args.metadata
        self.preserve_dates = self.args.preserve_dates
        self.recompute_ages = self.args.recompute_ages
        self.preserve_metadata = self.args.preserve_metadata
        # these two were parsed (or documented) but never copied back before
        self.split_sessions = self.args.split_sessions
        self.weekend_flag = self.args.weekend_flag

    def recover_recording(self, annotation):
        """Parse one .its file and return its recordings as a DataFrame.

        :param annotation: dict with at least ``its_filename`` and
            ``recording_filename``; its keys are merged into every output row
        :return: one row per <Recording> element, empty DataFrame on parse failure
        """
        its_directory = 'annotations/its/{}raw'.format(
            'confidential/' if os.path.exists('annotations/its/confidential/raw') else ''
        )
        try:
            xml = etree.parse(os.path.join(its_directory, annotation['its_filename']))
        except Exception as e:
            # best-effort: skip unreadable its files rather than abort the pool
            print(str(e))
            return pd.DataFrame()

        timestamp_pattern = re.compile(r"^P(?:T?)(\d+(\.\d+)?)S$")
        annotation_recordings = xml.xpath('/ITS/ProcessingUnit/Recording')

        # recover the timezone offset (seconds): prefer the explicit attribute,
        # fall back to the LocalTime/UTCTime difference, else assume UTC
        try:
            timezone_delta = int(xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/Audio/TimeZone')[0].get('StandardSecondsOffset'))
        except (IndexError, TypeError, ValueError):
            try:
                transfer_time = xml.xpath('/ITS/ProcessingUnit/UPL_Header/TransferredUPL/RecordingInformation/TransferTime')[0]
                local_time = transfer_time.get('LocalTime')
                utc_time = transfer_time.get('UTCTime')
                timezone_delta = (datetime.strptime(local_time, '%Y-%m-%dT%H:%M:%S')-datetime.strptime(utc_time, '%Y-%m-%dT%H:%M:%S')).total_seconds()
            except Exception:
                print('could not recover timezone')
                timezone_delta = 0

        dob = xml.xpath('/ITS/ProcessingUnit/ChildInfo')[0].get('dob')
        child_dob = datetime.strptime(dob, '%Y-%m-%d') if dob else None

        first_date = None
        n_session = 1
        recordings = []
        for rec in annotation_recordings:
            date = datetime.strptime(rec.get('startClockTime')[:19], '%Y-%m-%dT%H:%M:%S')
            # a 00:00 clock time is taken to mean the recorder clock was unset
            time_is_real = date.strftime('%H:%M') != '00:00'
            if time_is_real:
                date += timedelta(seconds = timezone_delta)
            if first_date is None:
                first_date = date

            duration_seconds = float(extract_from_regex(timestamp_pattern, rec.get('endTime'))) \
                - float(extract_from_regex(timestamp_pattern, rec.get('startTime')))
            # round to avoid edge cases where float will get truncated to lower int
            duration = round(duration_seconds*1000)

            if self.split_sessions:
                # a new calendar day starts a new session
                if first_date and first_date.strftime('%Y-%m-%d') != date.strftime('%Y-%m-%d'):
                    first_date = date
                    n_session += 1
                session_id = os.path.splitext(annotation['its_filename'])[0] + '_' + str(n_session)
            else:
                session_id = os.path.splitext(annotation['its_filename'])[0]

            recording = {
                'duration': duration,
                'date_iso': date.strftime('%Y-%m-%d'),
                'start_time': date.strftime('%H:%M') if time_is_real else 'NA',
                'child_dob': child_dob,
                'session_id': session_id,
                'session_offset': int(1000*(date-first_date).total_seconds())
            }
            if self.weekend_flag:
                recording['weekend'] = date.weekday() >= 5

            recording.update(annotation)
            # set the age AFTER merging the annotation metadata: the previous
            # revision recomputed it first and then let update() clobber it
            if self.recompute_ages:
                recording['age'] = age(date, child_dob)
            elif 'age' not in recording:
                raise Exception('ages not found, try with recompute_ages = True')

            recording['recording_filename'] += '_{}'.format(rec.get('num'))
            recordings.append(recording)

        return pd.DataFrame(recordings)

    def process(self, its):
        """Build the children and recordings tables from the its metadata.

        :param its: DataFrame with at least ``child_id``, ``its_filename`` and ``age``
        :return: (children, recordings) DataFrames
        """
        self.children = its.copy()
        annotations = its.copy()
        annotations = annotations[['child_id', 'its_filename', 'age']]
        annotations['recording_filename'] = annotations['its_filename'].apply(lambda s: os.path.splitext(s)[0])

        # parse every its file in parallel; the with-block releases the pool
        # (the previous revision leaked it)
        with mp.Pool(processes = mp.cpu_count()) as pool:
            self.recordings = pool.map(self.recover_recording, annotations.to_dict(orient = 'records'))
        self.recordings = pd.concat(self.recordings)
        self.recordings['recording_device_type'] = 'lena'
        self.recordings['experiment'] = self.experiment

        self.children.drop_duplicates('child_id', inplace = True, keep = 'first')
        self.children.set_index('child_id', inplace = True)

        if not self.preserve_dates:
            # anonymize: pin the earliest recording of each child to
            # 1000-01-01, extrapolate a fake dob from it, then re-derive every
            # recording date so the ages stay consistent
            self.recordings['date_iso'] = '1000-01-01'
            self.children = self.children.merge(self.recordings.groupby('child_id').agg({'date_iso': 'min'}), left_index = True, right_index = True)
            self.children = self.children.apply(fake_dob, axis = 1)
            self.recordings = self.recordings.set_index('child_id')\
                .drop(columns = 'child_dob')\
                .merge(self.children[['child_dob']], left_index = True, right_index = True)\
                .reset_index()
            self.recordings = self.recordings.apply(recording_date, axis = 1)
            self.children['dob_criterion'] = 'extrapolated'
            self.children['dob_accuracy'] = 'day' # actually that is only true is the dates in input were not rounded already
        elif 'child_dob' not in self.children.columns:
            # true dates kept but no dob in the input: take it from the its
            self.children = self.children.merge(
                self.recordings.drop_duplicates('child_id', keep = 'first').set_index('child_id')[['child_dob']],
                how = 'left',
                left_index = True,
                right_index = True
            )

        self.recordings.drop(columns = list(set(self.recordings.columns) & {'age', 'child_dob'}), inplace = True)
        self.children['experiment'] = self.experiment
        self.children.index = self.children.index.astype(str)
        self.recordings['child_id'] = self.recordings['child_id'].astype(str)
        self.recordings['session_id'] = self.recordings['session_id'].astype(str)
        self.recordings.index = self.recordings.index.astype(str)

        if not self.preserve_metadata:
            # move languages spoken out of the public children.csv
            confidential = self.children[['languages']]
            confidential.to_csv('metadata/confidential/children.csv', quotechar='"', quoting=csv.QUOTE_NONNUMERIC)
            self.children.drop(columns = ['languages'], inplace = True)

        try:
            self.children['ses'] = self.children['ses'].astype(int)
        except (ValueError, TypeError):
            # non-numeric ses values: keep the digits, blank out the rest
            self.children['ses'] = self.children['ses'].astype(str).apply(lambda x: np.where(x.isdigit(), x, 'NA'))

        return self.children, self.recordings
|