tests.py 3.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293
  1. from ChildProject.projects import ChildProject
  2. from ChildProject.annotations import AnnotationManager
  3. from datetime import datetime
  4. import multiprocessing as mp
  5. import os
  6. import pandas as pd
  7. import re
  8. class DatasetTester:
  def __init__(self, path: str, threads: int = 1):
      """Set up the tester for the dataset located at ``path``.

      Builds a ``ChildProject`` for ``path`` and an ``AnnotationManager``
      on top of it, then calls ``read()`` on the manager.

      Args:
          path: Location of the dataset, passed as-is to ``ChildProject``.
          threads: Requested number of worker threads; coerced with
              ``int()``. Any value below 1 selects ``mp.cpu_count()``.
      """
      project = ChildProject(path)
      self.project = project

      manager = AnnotationManager(project)
      self.am = manager
      manager.read()

      requested = int(threads)
      if requested >= 1:
          self.threads = requested
      else:
          # Zero or a negative value means "use every available CPU".
          self.threads = mp.cpu_count()
  15. def test_validation(self):
  16. errors, warnings = self.project.validate(ignore_files = True)
  17. assert len(errors) == 0, 'project validation failed'
  18. errors, warnings = self.am.validate(threads = self.threads)
  19. assert len(errors) == 0, 'annotations validation failed'
  20. def test_age(self):
  21. children = self.project.children.copy()
  22. recordings = self.project.recordings.copy()
  23. recordings = recordings.merge(
  24. children,
  25. how = 'left',
  26. left_on = 'child_id',
  27. right_on = 'child_id'
  28. )
  29. recordings['date_iso'] = recordings['date_iso'].apply(
  30. lambda s: datetime.strptime(s, '%Y-%m-%d')
  31. )
  32. recordings['child_dob'] = recordings['child_dob'].apply(
  33. lambda s: datetime.strptime(s, '%Y-%m-%d')
  34. )
  35. assert all(recordings.apply(
  36. lambda row: row['date_iso'] > row['child_dob'],
  37. axis = 1
  38. ))
  39. def test_ses(self):
  40. children = self.project.children.copy()
  41. children = children.dropna(subset = ['ses'])
  42. children = children[children['ses'] != 'NA']
  43. children['ses'] = children['ses'].astype(int)
  44. assert (children['ses'].values >= 1).all() and (children['ses'].values <= 5).all(), "ses should be >= 1 and <= 5"
  45. def test_language(self):
  46. children = self.project.children.copy()
  47. confidential_children_md_path = os.path.join(self.project.path, 'metadata/confidential/children.csv')
  48. if os.path.exists(confidential_children_md_path):
  49. children = children.merge(
  50. pd.read_csv(confidential_children_md_path),
  51. how = 'left',
  52. left_on = 'child_id',
  53. right_on = 'child_id'
  54. )
  55. if 'languages' in children.columns:
  56. print(sorted(children['languages'].unique()))
  57. children['languages'] = children['languages'].apply(lambda s: s.split(','))
  58. is_valid = children['languages'].apply(lambda l: all([s.isalpha for s in l]))
  59. assert(is_valid.all())
  60. elif 'language' in children.columns:
  61. print(sorted(children['language'].unique()))
  62. assert(children['language'].str.isalpha().all())
  63. else:
  64. raise KeyError("neither 'languages' or 'language' present in the metadata")
  65. if 'monoling' in children.columns:
  66. assert children['monoling'].str.lower().isin(['y', 'n']), "monoling not always y or n"
  67. else:
  68. raise KeyError("missing 'monoling' field")
  69. def test_sex(self):
  70. children = self.project.children.copy()
  71. children['child_sex'] = children['child_sex'].str.lower()
  72. assert children['child_sex'].isin(['m', 'f']).all(), "children sex not always m or f"