LAAC-LSCP
/
tools


			
			
				
					
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
							from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

from datetime import datetime
import multiprocessing as mp
import os
import pandas as pd

import re

class DatasetTester:
    """Sanity checks for the metadata and annotations of a ChildProject dataset.

    Each ``test_*`` method inspects the loaded project and raises
    ``AssertionError`` (or ``KeyError`` when an expected column is absent)
    if the dataset does not satisfy the corresponding rule.
    """

    def __init__(self, path: str, threads: int = 1):
        """Load the project located at *path* and read its annotation index.

        Args:
            path: location of the dataset, passed to ``ChildProject``.
            threads: number of workers handed to the annotation validation;
                any value below 1 means "use every available CPU".
        """
        self.project = ChildProject(path)
        self.am = AnnotationManager(self.project)
        self.am.read()

        threads = int(threads)
        self.threads = threads if threads >= 1 else mp.cpu_count()

    def test_validation(self) -> None:
        """Run the project and annotation validators; fail on any error.

        Warnings returned by the validators are ignored.

        Raises:
            AssertionError: if either validator reports at least one error.
        """
        # NOTE(review): ignore_files=True presumably skips checks on the
        # recording files themselves -- confirm against ChildProject docs.
        errors, warnings = self.project.validate(ignore_files=True)
        assert len(errors) == 0, 'project validation failed'

        errors, warnings = self.am.validate(threads=self.threads)
        assert len(errors) == 0, 'annotations validation failed'

    def test_age(self) -> None:
        """Check that every recording was made strictly after the child's birth.

        Raises:
            AssertionError: if any recording date is not later than the
                date of birth of the child it is attached to.
        """
        children = self.project.children.copy()
        recordings = self.project.recordings.copy()

        # Attach each child's metadata (including child_dob) to its recordings.
        recordings = recordings.merge(children, how='left', on='child_id')

        def parse_date(value):
            return datetime.strptime(value, '%Y-%m-%d')

        recording_dates = recordings['date_iso'].apply(parse_date)
        birth_dates = recordings['child_dob'].apply(parse_date)

        assert (recording_dates > birth_dates).all(), \
            "some recordings are not dated after the child's date of birth"

    def test_ses(self) -> None:
        """Check that every provided ``ses`` value is an integer from 1 to 5.

        Missing values and the literal string ``'NA'`` are skipped.

        Raises:
            AssertionError: if a remaining value lies outside [1, 5].
        """
        children = self.project.children.copy()

        ses = children['ses'].dropna()
        ses = ses[ses != 'NA'].astype(int)

        assert ses.between(1, 5).all(), "ses should be >= 1 and <= 5"

    def test_language(self) -> None:
        """Check the language and monolingualism columns of the children metadata.

        If ``metadata/confidential/children.csv`` exists under the project
        path, it is left-merged on ``child_id`` before the checks run.

        Raises:
            AssertionError: if a language code is not purely alphabetic, or
                if ``monoling`` contains something other than y/n
                (case-insensitive).
            KeyError: if neither ``languages`` nor ``language`` is present,
                or if ``monoling`` is missing.
        """
        children = self.project.children.copy()

        confidential_children_md_path = os.path.join(
            self.project.path, 'metadata/confidential/children.csv'
        )
        if os.path.exists(confidential_children_md_path):
            children = children.merge(
                pd.read_csv(confidential_children_md_path),
                how='left',
                on='child_id'
            )

        if 'languages' in children.columns:
            print(sorted(children['languages'].unique()))

            # 'languages' holds a comma-separated list of codes; every code
            # must be alphabetic. isalpha must be *called*: the bare method
            # object is always truthy and would let any value through.
            is_valid = children['languages'].apply(
                lambda entry: all(code.isalpha() for code in entry.split(','))
            )
            assert is_valid.all(), "languages should only contain alphabetic codes"
        elif 'language' in children.columns:
            print(sorted(children['language'].unique()))
            assert children['language'].str.isalpha().all(), \
                "language should only contain alphabetic codes"
        else:
            raise KeyError("neither 'languages' or 'language' present in the metadata")

        if 'monoling' not in children.columns:
            raise KeyError("missing 'monoling' field")

        # Reduce with .all(): asserting on the Series itself raises
        # ValueError (ambiguous truth value) instead of checking the data.
        monoling_ok = children['monoling'].str.lower().isin(['y', 'n'])
        assert monoling_ok.all(), "monoling not always y or n"

    def test_sex(self) -> None:
        """Check that ``child_sex`` is always m or f (case-insensitive).

        Raises:
            AssertionError: if any child has another (or missing) value.
        """
        children = self.project.children.copy()

        sexes = children['child_sex'].str.lower()
        assert sexes.isin(['m', 'f']).all(), "children sex not always m or f"