"""
This file tests the different modules in import_data.
The pip package pytest must be installed.
"""
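# Typical invocation (assuming this file lives in the tests/ directory that the
# relative paths below are resolved against, run from the repository root):
#   pytest tests/test_importData.py -v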
import os
import sys
import pytest
import shutil
import pandas as pd

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

fpath = os.path.join(os.path.dirname(__file__), '..', 'import_data')
sys.path.append(fpath)

#from import_data.utils import get_raw_filename
#from utils import get_raw_filename
try:
    sys.modules.pop('utils')  # drop any previously imported 'utils' module so the local one is picked up
except KeyError:
    pass
import utils as id_utils
import prepare_data_set as pds
import import_recordings as ir
import import_annotations as ia
import custom_converters as cc
CHILDREN_FILE = os.path.join('dataset-test','metadata','children.csv')

DATASET_PATH = os.path.join('tests','existing_dataset') #dataset already existing, used for tests that don't change it
RECS_PATH = os.path.join(DATASET_PATH,'metadata','recordings.csv')
CHILDREN_PATH = os.path.join(DATASET_PATH,'metadata','children.csv')
ANN_PATH = os.path.join(DATASET_PATH,'metadata','annotations.csv')

NEW_DATASET_PATH = os.path.join('tests','new_dataset') #dataset created by tests from scratch
NON_EXISTING_PATH = 'non_existing'

TEST_DATASET_PATH = os.path.join('tests','test_dataset') #dataset used by tests that modify it
TEST_RECS_META = os.path.join(TEST_DATASET_PATH,'metadata','recordings.csv')
TEST_RECS_PATH = os.path.join(TEST_DATASET_PATH,'recordings','raw')
TEST_CHI_META = os.path.join(TEST_DATASET_PATH,'metadata','children.csv')
TEST_ANN_META = os.path.join(TEST_DATASET_PATH,'metadata','annotations.csv')

INPUT_RECS = os.path.join('tests','data','recs')


def set_up_dataset():
    if os.path.exists(TEST_DATASET_PATH):
        shutil.rmtree(TEST_DATASET_PATH)
    shutil.copytree(DATASET_PATH, TEST_DATASET_PATH)
################### utils ################

@pytest.mark.parametrize('file,result',
    [(CHILDREN_FILE, "children"),])
def test_get_raw_filename(file, result):
    assert id_utils.get_raw_filename(file) == result
    #assert get_raw_filename(file) == result


@pytest.mark.parametrize('path,ext,full_path,result',
    [(DATASET_PATH, ["csv","rttm"], True,
      sorted([os.path.join(DATASET_PATH,'annotations/vtc/raw/VTC_20220103.rttm'),
              os.path.join(DATASET_PATH,'annotations/vtc/raw/VTC_20220124.rttm'),
              os.path.join(DATASET_PATH,'metadata/annotations.csv'),
              os.path.join(DATASET_PATH,'metadata/children.csv'),
              os.path.join(DATASET_PATH,'metadata/recordings.csv'),
              os.path.join(DATASET_PATH,'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv'),
              os.path.join(DATASET_PATH,'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv'),
              os.path.join(DATASET_PATH,'extra/messages/generated/messages_20220103.csv'),
              os.path.join(DATASET_PATH,'extra/metrics/metrics.csv'),
              ])),
     (DATASET_PATH, ["csv","rttm"], False,
      sorted(['annotations/vtc/raw/VTC_20220103.rttm',
              'annotations/vtc/raw/VTC_20220124.rttm',
              'metadata/annotations.csv',
              'metadata/children.csv',
              'metadata/recordings.csv',
              'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv',
              'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv',
              'extra/messages/generated/messages_20220103.csv',
              'extra/metrics/metrics.csv',
              ])),
     (os.path.join(DATASET_PATH,'metadata'), [], False,
      sorted(['annotations.csv',
              'children.csv',
              'recordings.csv',
              'readme.md',
              ])),
     (NON_EXISTING_PATH, ["csv","rttm"], False,
      []),
     (DATASET_PATH, "invented", True,
      []),
    ])
def test_walk_dir(path, ext, full_path, result):
    r = id_utils.walk_dir(path, ext, full_path)
    print(r)
    assert r == result
##########################################
############# prepare_data_set ###########

#tree when creating a new empty dataset from the NEW_DATASET_PATH var
CREATION_TREE = [(NEW_DATASET_PATH, ['metadata', 'extra', 'annotations', 'recordings'], []),
                 (os.path.join(NEW_DATASET_PATH,'metadata'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'extra'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations'), ['vtc', 'acoustic', 'vcm', 'conversations', 'alice'], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vtc'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vtc/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/acoustic'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/acoustic/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vcm'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vcm/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/conversations'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/conversations/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/alice'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/alice/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'recordings'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'recordings/raw'), [], ['.gitkeep']),
                 ]

def test_create_child_project_directories():
    pds.create_child_project_directories(NEW_DATASET_PATH)
    tree = list(os.walk(NEW_DATASET_PATH))
    shutil.rmtree(NEW_DATASET_PATH)
    assert tree == CREATION_TREE
##########################################
########## import_recordings #############

@pytest.mark.parametrize('path,result',
    [(RECS_PATH, pd.read_csv(RECS_PATH)),
     (NON_EXISTING_PATH, pd.DataFrame(columns = ['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time',
                                                 'recording_device_type', 'recording_filename', 'session_id'])),])
def test_get_recordings(path, result):
    recs = ir._get_recordings(path)
    pd.testing.assert_frame_equal(recs, result)


@pytest.mark.parametrize('path,result',
    [(CHILDREN_PATH, pd.read_csv(CHILDREN_PATH)),
     (NON_EXISTING_PATH, pd.DataFrame(columns = ['experiment', 'child_id', 'child_dob'])),])
def test_get_children(path, result):
    childn = ir._get_children(path)
    pd.testing.assert_frame_equal(childn, result)


INCORRECT_RECS_PATH = os.path.join('test','data','incorrect-recs')

@pytest.mark.parametrize('path,rec,result',
    [(RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav", {'experiment': 'test', 'experiment_stage': 'Audio-1-familia-14T-CP-2020-02', 'child_id': 'chi_14T', 'date_iso': '2022-01-03', 'start_time': '00:00:00', 'recording_device_type': 'lena', 'recording_filename': '14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav', 'session_id': 'chi_14T_20220103', 'duration': 0, 'imported_at': '2022-10-26 14:49:10'}),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103-000000.wav", False),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20221403_000000.wav", False),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103_520000.wav", False),])
def test_build_recording_metadata(path, rec, result):
    metadata = ir._build_recording_metadata(path, rec, 'test', 'lena')
    if result:
        result['imported_at'] = metadata['imported_at'] #copy the generated imported_at value so the comparison stays deterministic
    assert metadata == result
#dependent on _get_recordings - _build_recording_metadata
def test_import_recordings():
    truth = os.path.join('tests','data','truth','new_recs_import.csv')

    set_up_dataset()
    #copy a new rec into the dataset
    shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'), TEST_RECS_PATH)
    #os.path.join(TEST_RECS_PATH,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'))

    ir.import_recordings(TEST_DATASET_PATH,'test','lena')

    pd.testing.assert_frame_equal(pd.read_csv(TEST_RECS_META).drop(columns=['imported_at']), pd.read_csv(truth).drop(columns=['imported_at']), check_like=True)


#dependent on _get_recordings - _get_children - import_recordings
def test_import_children():
    truth = os.path.join('tests','data','truth','new_children_import.csv')

    set_up_dataset()
    #copy a new rec into the dataset
    shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'), TEST_RECS_PATH)
    ir.import_recordings(TEST_DATASET_PATH,'test','lena')

    #now import the child info
    ir.import_children(TEST_DATASET_PATH, 'test')

    pd.testing.assert_frame_equal(pd.read_csv(TEST_CHI_META), pd.read_csv(truth), check_like=True)
#COMMENTED OUT: as of now this is just import_recordings followed by import_children, so it is already covered above
#dependent on import_recordings - import_children
#def test_data_importation():
#    truth_r = os.path.join('tests','data','truth','new_recs_import.csv')
#    truth_c = os.path.join('tests','data','truth','new_children_import.csv')
#
#    #copy a new rec into the dataset
#    shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'), TEST_RECS_PATH)
#
#    ir.test_data_importation(TEST_DATASET_PATH,'test','lena')
#
#    pd.testing.assert_frame_equal(pd.read_csv(TEST_RECS_META).drop(columns=['imported_at']), pd.read_csv(truth_r).drop(columns=['imported_at']), check_like=True)
#    pd.testing.assert_frame_equal(pd.read_csv(TEST_CHI_META), pd.read_csv(truth_c), check_like=True)
#####################################################
################# import_annotations ################

@pytest.mark.parametrize('remove',
    [(False),
     (True),])
def test_filter_missing_annotation_files(remove):
    set_up_dataset()
    load = pd.read_csv(TEST_ANN_META)
    load = load[~load['set'].isin({'alice_vtc'})] #drop the alice_vtc lines: they have no raw files, so they would make the check fail spuriously
    res = load.copy()
    if remove:
        rm1 = 'vtc'
        rm2 = 'acoustic'
        res = res[~res['set'].isin([rm1, rm2])]
        shutil.rmtree(os.path.join(TEST_DATASET_PATH,'annotations',rm1,'raw'))
        os.remove(os.path.join(TEST_DATASET_PATH,'annotations',rm2,'raw','ACOUSTIC_VTC_20220103.csv'))
    res['exists'] = True #the function always appends an 'exists' column, maybe drop it?
    pd.testing.assert_frame_equal(ia._filter_missing_annotation_files(TEST_DATASET_PATH, load), res, check_like=True)
def test_check_importation():
    #this function only generates warnings and consistency checks; a test asserting on its logging output still needs to be added
    pass
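#Possible sketch for that missing test, skipped on purpose: pytest's built-in caplog
#fixture can capture log records, assuming check_importation reports through the
#standard logging module. The function name, signature and expected outcome used
#below are assumptions and will need to be adapted to the real module.
@pytest.mark.skip(reason="sketch only: depends on check_importation's actual name, signature and logging behaviour")
def test_check_importation_logging(caplog):
    set_up_dataset()
    project = ChildProject(TEST_DATASET_PATH)
    am = AnnotationManager(project)
    with caplog.at_level("WARNING"):
        ia.check_importation(project, am)  # assumed name and signature
    #assumption: a freshly copied, consistent test dataset should raise no warnings
    assert caplog.records == []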
@pytest.mark.parametrize('recording,empty',
    [('14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav', False),
     ('does_not_exist', True),])
def test_ia_get_recordings(recording, empty):
    annot_set = 'vtc'
    project = ChildProject(DATASET_PATH)
    project.read()
    if empty:
        res = pd.DataFrame(columns=['recording_filename','set','format','time_seek','range_onset','range_offset'])
        res = res.astype(dtype={'time_seek': 'int', 'range_onset': 'int', 'range_offset': 'int'})
    else:
        res = pd.read_csv(TEST_ANN_META)[['recording_filename','set','format','time_seek','range_onset','range_offset']]
        res = res[res['set'] == annot_set]
    df = ia._get_recordings(project, annot_set, 'vtc_rttm', recording)
    df.drop(columns='child_id', inplace=True)
    pd.testing.assert_frame_equal(res.reset_index(drop=True), df.reset_index(drop=True), check_like=True, check_index_type=False)


@pytest.mark.parametrize('ann_set,name,result',
    [('vtc','','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.rttm'),
     ('acoustic','filename','filename'),])
def test_build_raw_filename(ann_set, name, result):
    recs = pd.read_csv(RECS_PATH)[['recording_filename']]
    annots = pd.read_csv(ANN_PATH)
    annots = annots[annots['set'] == ann_set][['recording_filename','raw_filename','filter']]
    annots['raw_filename'] = result
    df = ia._build_raw_filename(recs, ann_set, name)
    pd.testing.assert_frame_equal(annots.reset_index(drop=True), df.reset_index(drop=True), check_like=True)


#TODO add a test for a non-existing raw file (currently the import is aborted with a printed warning but does not fail)
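#Hedged sketch for the TODO above, skipped on purpose: the expected behaviour
#(warning printed, no exception raised, nothing added to the index) is only
#inferred from the comment and may need adjusting once that behaviour is settled.
@pytest.mark.skip(reason="sketch only: missing-file behaviour still to be specified")
def test_import_annotation_missing_file():
    set_up_dataset()
    p = ChildProject(TEST_DATASET_PATH)
    am = AnnotationManager(p)
    before = len(am.annotations)
    #the raw file name below deliberately does not exist in the test dataset
    ia._import_annotation(p, am, 'vtc', 'DOES_NOT_EXIST.rttm', None, None)
    assert len(am.annotations) == before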
@pytest.mark.parametrize('ann_set,file,rfaf,recording',
    [('vtc','VTC_20220103.rttm',None,None),
     ('acoustic','ACOUSTIC_VTC_20220103.csv','VTC_20220103.rttm',None),])
def test_import_annotation(ann_set, file, rfaf, recording):
    dtypes_forces = {'merged_from': 'str'}
    set_up_dataset()
    p = ChildProject(TEST_DATASET_PATH)
    am = AnnotationManager(p)
    if ann_set in {'vtc'}: am.remove_set('alice_vtc') #remove sets built by merging, otherwise the dataset fails when a set used in a merge is missing
    #TODO make this more general, or force childproject to remove sets used
    am.remove_set(ann_set)

    p_truth = ChildProject(DATASET_PATH)
    am_truth = AnnotationManager(p_truth)
    annots_truth = am_truth.annotations.drop(columns=['imported_at'])
    if ann_set in {'vtc'}: annots_truth = annots_truth[~annots_truth['set'].isin({'alice_vtc'})] #same reason as above

    ia._import_annotation(p, am, ann_set, file, rfaf, recording)

    annots = am.annotations.drop(columns=['imported_at']).sort_values(by='set').astype(dtypes_forces)
    annots_truth = annots_truth.sort_values(by='set').astype(dtypes_forces)
    print(annots)
    print(annots_truth)
    pd.testing.assert_frame_equal(annots_truth.reset_index(drop=True), annots.reset_index(drop=True), check_like=True)


#COMMENTED OUT: as of now this just calls _import_annotation, so it is already covered above
#def test_import_annotations():
#    pass
###################################################
############## custom_converters ##################

single_rec_csv = os.path.join(DATASET_PATH,'annotations','acoustic','converted','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv')
multi_rec_csv = os.path.join('tests','data','csv','multi_rec_vtc.csv')
no_rec_name = os.path.join(DATASET_PATH,'annotations','vtc','converted','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv')

@pytest.mark.parametrize('file,filter,fails,error',
    [(single_rec_csv,'14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav',False,None),
     (single_rec_csv,'',False,None),
     (multi_rec_csv,'VTC_20220124.wav',False,None),
     (NON_EXISTING_PATH,'',True,FileNotFoundError),
     (no_rec_name,'',True,KeyError),
    ])
def test_filteredCsvConverter(file, filter, fails, error):
    if fails:
        with pytest.raises(error):
            df = cc.FilteredCsvConverter.convert(file, filter)
    else:
        df = cc.FilteredCsvConverter.convert(file, filter)
        truth = pd.read_csv(file)
        if filter:
            truth = truth[truth["recording_filename"].str.contains(filter)]
        pd.testing.assert_frame_equal(df, truth, check_like=True)
###################################################