123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100 |
- from ChildProject.projects import ChildProject
- from ChildProject.annotations import AnnotationManager
- from datetime import datetime
- import multiprocessing as mp
- import os
- import pandas as pd
- import re
- import sys
class DatasetTester:
    """Sanity checks for the metadata and annotations of a ChildProject dataset.

    Each ``test_*`` method raises ``AssertionError`` when a check fails, or
    ``KeyError`` when a column the check requires is absent from the metadata.
    """

    def __init__(self, path: str, threads: int = 1):
        """Load the project located at *path* and read its annotation index.

        Args:
            path: Path handed to ``ChildProject``.
            threads: Number of threads passed to the annotation validation.
                Any value below 1 selects ``multiprocessing.cpu_count()``.
        """
        self.project = ChildProject(path)
        self.am = AnnotationManager(self.project)
        self.am.read()

        threads = int(threads)
        self.threads = threads if threads >= 1 else mp.cpu_count()

    def test_metadata(self) -> None:
        """Fail if project validation (run with ``ignore_files=True``) reports errors."""
        errors, _ = self.project.validate(ignore_files=True)
        assert len(errors) == 0, 'project validation failed'

    def test_annotations(self) -> None:
        """Fail if the annotation manager's validation reports errors."""
        errors, _ = self.am.validate(threads=self.threads)
        assert len(errors) == 0, 'annotations validation failed'

    def test_age(self) -> None:
        """Fail unless every recording date is strictly after the child's birth date.

        Both ``date_iso`` and ``child_dob`` are parsed as ``%Y-%m-%d`` strings;
        a value that is not such a string makes ``strptime`` raise.
        """
        # merge() returns a new frame, so the project's own tables are untouched.
        recordings = self.project.recordings.merge(
            self.project.children,
            how='left',
            on='child_id',
        )

        def parse_day(value):
            return datetime.strptime(value, '%Y-%m-%d')

        recorded_on = recordings['date_iso'].apply(parse_day)
        born_on = recordings['child_dob'].apply(parse_day)

        assert (recorded_on > born_on).all(), \
            "some recordings are not dated after the child's date of birth"

    def test_ses(self) -> None:
        """Fail unless every ``ses`` value that is present lies between 1 and 5.

        Missing values and the literal string ``'NA'`` are skipped.
        """
        # Work on a standalone Series rather than assigning into a filtered
        # slice of the DataFrame.
        ses = self.project.children['ses'].dropna()
        ses = ses[ses != 'NA'].astype(int)

        assert ses.between(1, 5).all(), "ses should be >= 1 and <= 5"

    def test_language(self) -> None:
        """Check the language and ``monoling`` columns of the children metadata.

        If ``metadata/confidential/children.csv`` exists under the project
        path, it is left-joined on ``child_id`` before the checks run.

        Raises:
            AssertionError: a language entry is not purely alphabetic, or
                ``monoling`` holds something other than y/n (case-insensitive).
            KeyError: neither ``languages`` nor ``language`` is present, or
                ``monoling`` is missing.
        """
        children = self.project.children.copy()
        confidential_children_md_path = os.path.join(
            self.project.path, 'metadata/confidential/children.csv'
        )

        if os.path.exists(confidential_children_md_path):
            children = children.merge(
                pd.read_csv(confidential_children_md_path),
                how='left',
                on='child_id',
            )

        if 'languages' in children.columns:
            # `languages` is a comma-separated list; every item must be
            # alphabetic. isalpha must be *called*: the bare bound method is
            # always truthy, which would make this check a no-op.
            is_valid = children['languages'].apply(
                lambda codes: all(code.isalpha() for code in codes.split(','))
            )
            assert is_valid.all(), "languages entries are not all alphabetic"
        elif 'language' in children.columns:
            assert children['language'].str.isalpha().all(), \
                "language is not always alphabetic"
        else:
            raise KeyError("neither 'languages' or 'language' present in the metadata")

        if 'monoling' in children.columns:
            assert children['monoling'].str.lower().isin(['y', 'n']).all(), \
                "monoling not always y or n"
        else:
            raise KeyError("missing 'monoling' field")

    def test_sex(self) -> None:
        """Fail unless every ``child_sex`` value is m or f (case-insensitive)."""
        child_sex = self.project.children['child_sex'].str.lower()
        assert child_sex.isin(['m', 'f']).all(), "children sex not always m or f"

    def test_normativity(self) -> None:
        """Fail unless ``normative`` exists and only holds y or n (case-insensitive).

        Raises:
            KeyError: the ``normative`` column is missing.
        """
        children = self.project.children
        if 'normative' not in children.columns:
            raise KeyError("missing 'normative' field")

        assert children['normative'].str.lower().isin(['y', 'n']).all(), \
            "normative not always y or n"
|