from datetime import datetime
import multiprocessing as mp
import os
import re
import sys

import pandas as pd

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager


class DatasetTester:
    """Sanity checks for the metadata and annotations of a ChildProject dataset.

    Each ``test_*`` method raises ``AssertionError`` (or ``KeyError`` when a
    required metadata column is absent) if the dataset violates the check.
    """

    def __init__(self, path: str, threads: int = 1) -> None:
        """Load the project located at *path* and read its annotation index.

        :param path: path to the root of the dataset.
        :param threads: number of threads used to validate annotations;
            any value below 1 means "use every available CPU".
        """
        self.project = ChildProject(path)
        self.am = AnnotationManager(self.project)
        self.am.read()

        threads = int(threads)
        self.threads = threads if threads >= 1 else mp.cpu_count()

    @staticmethod
    def _is_alpha_code_list(value) -> bool:
        """Return True if *value* is a comma-separated list of alphabetic codes.

        Non-string values (e.g. missing cells) are reported as invalid rather
        than raising on ``.split``.
        """
        if not isinstance(value, str):
            return False
        return all(code.isalpha() for code in value.split(','))

    def test_metadata(self):
        """Check that the project metadata validates without errors."""
        errors, _ = self.project.validate(ignore_files = True)
        assert len(errors) == 0, 'project validation failed'

    def test_annotations(self):
        """Check that the annotations validate without errors."""
        errors, _ = self.am.validate(threads = self.threads)
        assert len(errors) == 0, 'annotations validation failed'

    def test_age(self):
        """Check that every recording is dated after the child's date of birth."""
        children = self.project.children.copy()
        recordings = self.project.recordings.copy()

        recordings = recordings.merge(
            children,
            how = 'left',
            left_on = 'child_id',
            right_on = 'child_id'
        )

        # A left merge leaves child_dob empty for recordings whose child_id is
        # unknown; report that explicitly instead of crashing in strptime.
        for column in ('date_iso', 'child_dob'):
            assert recordings[column].notna().all(), \
                "missing '{}' for some recordings".format(column)
            recordings[column] = recordings[column].apply(
                lambda s: datetime.strptime(s, '%Y-%m-%d')
            )

        is_after_birth = recordings['date_iso'] > recordings['child_dob']
        assert is_after_birth.all(), \
            "some recordings are not dated after the child's date of birth"

    def test_ses(self):
        """Check that every provided socio-economic status lies in 1..5."""
        children = self.project.children.copy()

        # Rows without a value (NaN or the literal 'NA') are not checked.
        children = children.dropna(subset = ['ses'])
        children = children[children['ses'] != 'NA']

        ses = children['ses'].astype(int)
        assert ((ses >= 1) & (ses <= 5)).all(), "ses should be >= 1 and <= 5"

    def test_language(self):
        """Check the language and monolingualism fields of the children.

        The confidential children metadata, when present, is merged in before
        looking for the ``languages`` / ``language`` / ``monoling`` columns.

        :raises KeyError: if neither ``languages`` nor ``language`` is
            present, or if ``monoling`` is missing.
        """
        children = self.project.children.copy()

        confidential_children_md_path = os.path.join(
            self.project.path, 'metadata/confidential/children.csv'
        )
        if os.path.exists(confidential_children_md_path):
            children = children.merge(
                pd.read_csv(confidential_children_md_path),
                how = 'left',
                left_on = 'child_id',
                right_on = 'child_id'
            )

        if 'languages' in children.columns:
            # Each comma-separated entry must be purely alphabetic.
            is_valid = children['languages'].apply(self._is_alpha_code_list)
            assert is_valid.all(), \
                "languages should be comma-separated alphabetic codes"
        elif 'language' in children.columns:
            assert children['language'].str.isalpha().all(), \
                "language should be alphabetic"
        else:
            raise KeyError("neither 'languages' or 'language' present in the metadata")

        if 'monoling' in children.columns:
            assert children['monoling'].str.lower().isin(['y', 'n']).all(), "monoling not always y or n"
        else:
            raise KeyError("missing 'monoling' field")

    def test_sex(self):
        """Check that every child's sex is 'm' or 'f' (case-insensitive)."""
        children = self.project.children.copy()
        children['child_sex'] = children['child_sex'].str.lower()
        assert children['child_sex'].isin(['m', 'f']).all(), "children sex not always m or f"

    def test_normativity(self):
        """Check that the ``normative`` field is always 'y' or 'n'.

        :raises KeyError: if the ``normative`` column is missing.
        """
        children = self.project.children.copy()

        if 'normative' in children.columns:
            assert children['normative'].str.lower().isin(['y', 'n']).all(), "normative not always y or n"
        else:
            raise KeyError("missing 'normative' field")