""" This file will test the differents modules in import_data You must have the pip package pytest installed """ import os import sys import pytest import shutil import pandas as pd from ChildProject.projects import ChildProject from ChildProject.annotations import AnnotationManager fpath = os.path.join(os.path.dirname(__file__),'..', 'import_data') sys.path.append(fpath) #from import_data.utils import get_raw_filename #from utils import get_raw_filename try: sys.modules.pop('utils') #need to do this to avoid utils conflict of importation except: pass import utils as id_utils import prepare_data_set as pds import import_recordings as ir import import_annotations as ia import custom_converters as cc CHILDREN_FILE = os.path.join('dataset-test','metadata','children.csv') DATASET_PATH = os.path.join('tests','existing_dataset') #dataset already existing, used for tests that don't change it RECS_PATH = os.path.join(DATASET_PATH,'metadata','recordings.csv') CHILDREN_PATH = os.path.join(DATASET_PATH,'metadata','children.csv') ANN_PATH = os.path.join(DATASET_PATH,'metadata','annotations.csv') NEW_DATASET_PATH = os.path.join('tests','new_dataset') #dataset created by test from scratch NON_EXISTING_PATH = 'non_existing' TEST_DATASET_PATH = os.path.join('tests','test_dataset') #dataset to use for tests changing it TEST_RECS_META = os.path.join(TEST_DATASET_PATH,'metadata','recordings.csv') TEST_RECS_PATH = os.path.join(TEST_DATASET_PATH,'recordings','raw') TEST_CHI_META = os.path.join(TEST_DATASET_PATH,'metadata','children.csv') TEST_ANN_META = os.path.join(TEST_DATASET_PATH,'metadata','annotations.csv') INPUT_RECS = os.path.join('tests','data','recs') def set_up_dataset(): if os.path.exists(TEST_DATASET_PATH): shutil.rmtree(TEST_DATASET_PATH) shutil.copytree(DATASET_PATH, TEST_DATASET_PATH) ################### utils ################ @pytest.mark.parametrize('file,result', [(CHILDREN_FILE, "children"),]) def test_get_raw_filename(file, result): assert id_utils.get_raw_filename(file) == result #assert get_raw_filename(file) == result @pytest.mark.parametrize('path,ext,full_path,result', [(DATASET_PATH, ["csv","rttm"], True, sorted([os.path.join(DATASET_PATH,'annotations/vtc/raw/VTC_20220103.rttm'), os.path.join(DATASET_PATH,'annotations/vtc/raw/VTC_20220124.rttm'), os.path.join(DATASET_PATH,'metadata/annotations.csv'), os.path.join(DATASET_PATH,'metadata/children.csv'), os.path.join(DATASET_PATH,'metadata/recordings.csv'), os.path.join(DATASET_PATH,'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'), os.path.join(DATASET_PATH,'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'), os.path.join(DATASET_PATH,'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'), os.path.join(DATASET_PATH,'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'), os.path.join(DATASET_PATH,'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'), os.path.join(DATASET_PATH,'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'), os.path.join(DATASET_PATH,'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv'), os.path.join(DATASET_PATH,'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv'), os.path.join(DATASET_PATH,'extra/messages/generated/messages_20220103.csv'), os.path.join(DATASET_PATH,'extra/metrics/metrics.csv'), ])), (DATASET_PATH, ["csv","rttm"], False, 
CHILDREN_FILE = os.path.join('dataset-test', 'metadata', 'children.csv')

DATASET_PATH = os.path.join('tests', 'existing_dataset')  # pre-existing dataset, used by tests that do not modify it
RECS_PATH = os.path.join(DATASET_PATH, 'metadata', 'recordings.csv')
CHILDREN_PATH = os.path.join(DATASET_PATH, 'metadata', 'children.csv')
ANN_PATH = os.path.join(DATASET_PATH, 'metadata', 'annotations.csv')

NEW_DATASET_PATH = os.path.join('tests', 'new_dataset')  # dataset created from scratch by the tests

NON_EXISTING_PATH = 'non_existing'

TEST_DATASET_PATH = os.path.join('tests', 'test_dataset')  # dataset used by tests that modify it
TEST_RECS_META = os.path.join(TEST_DATASET_PATH, 'metadata', 'recordings.csv')
TEST_RECS_PATH = os.path.join(TEST_DATASET_PATH, 'recordings', 'raw')
TEST_CHI_META = os.path.join(TEST_DATASET_PATH, 'metadata', 'children.csv')
TEST_ANN_META = os.path.join(TEST_DATASET_PATH, 'metadata', 'annotations.csv')

INPUT_RECS = os.path.join('tests', 'data', 'recs')


def set_up_dataset():
    # restore a pristine copy of the existing dataset for tests that modify it
    if os.path.exists(TEST_DATASET_PATH):
        shutil.rmtree(TEST_DATASET_PATH)
    shutil.copytree(DATASET_PATH, TEST_DATASET_PATH)


################### utils ################

@pytest.mark.parametrize('file,result',
                         [(CHILDREN_FILE, "children"),
                          ])
def test_get_raw_filename(file, result):
    assert id_utils.get_raw_filename(file) == result
    #assert get_raw_filename(file) == result
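# A minimal sketch of the behaviour checked above (an assumed equivalent of
# id_utils.get_raw_filename, not its actual implementation): the helper strips
# both the directory and the extension, e.g.
#
#     os.path.splitext(os.path.basename(CHILDREN_FILE))[0]  # -> 'children'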
@pytest.mark.parametrize('path,ext,full_path,result',
    [(DATASET_PATH, ["csv", "rttm"], True,
      sorted([os.path.join(DATASET_PATH, 'annotations/vtc/raw/VTC_20220103.rttm'),
              os.path.join(DATASET_PATH, 'annotations/vtc/raw/VTC_20220124.rttm'),
              os.path.join(DATASET_PATH, 'metadata/annotations.csv'),
              os.path.join(DATASET_PATH, 'metadata/children.csv'),
              os.path.join(DATASET_PATH, 'metadata/recordings.csv'),
              os.path.join(DATASET_PATH, 'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH, 'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH, 'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH, 'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH, 'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH, 'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH, 'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv'),
              os.path.join(DATASET_PATH, 'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv'),
              os.path.join(DATASET_PATH, 'extra/messages/generated/messages_20220103.csv'),
              os.path.join(DATASET_PATH, 'extra/metrics/metrics.csv'),
              ])),
     (DATASET_PATH, ["csv", "rttm"], False,
      sorted(['annotations/vtc/raw/VTC_20220103.rttm',
              'annotations/vtc/raw/VTC_20220124.rttm',
              'metadata/annotations.csv',
              'metadata/children.csv',
              'metadata/recordings.csv',
              'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv',
              'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv',
              'extra/messages/generated/messages_20220103.csv',
              'extra/metrics/metrics.csv',
              ])),
     (os.path.join(DATASET_PATH, 'metadata'), [], False,
      sorted(['annotations.csv',
              'children.csv',
              'recordings.csv',
              'readme.md',
              ])),
     (NON_EXISTING_PATH, ["csv", "rttm"], False, []),
     (DATASET_PATH, "invented", True, []),
     ])
def test_walk_dir(path, ext, full_path, result):
    r = id_utils.walk_dir(path, ext, full_path)
    print(r)
    assert r == result

##########################################
############# prepare_data_set ###########

# expected directory tree when creating a new empty dataset at NEW_DATASET_PATH
CREATION_TREE = [(NEW_DATASET_PATH, ['metadata', 'extra', 'annotations', 'recordings'], []),
                 (os.path.join(NEW_DATASET_PATH, 'metadata'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH, 'extra'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH, 'annotations'), ['vtc', 'acoustic', 'vcm', 'conversations', 'alice'], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/vtc'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/vtc/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/acoustic'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/acoustic/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/vcm'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/vcm/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/conversations'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/conversations/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/alice'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH, 'annotations/alice/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH, 'recordings'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH, 'recordings/raw'), [], ['.gitkeep']),
                 ]


def test_create_child_project_directories():
    pds.create_child_project_directories(NEW_DATASET_PATH)
    tree = list(os.walk(NEW_DATASET_PATH))
    shutil.rmtree(NEW_DATASET_PATH)
    assert tree == CREATION_TREE

##########################################
########## import_recordings #############

@pytest.mark.parametrize('path,result',
                         [(RECS_PATH, pd.read_csv(RECS_PATH)),
                          (NON_EXISTING_PATH, pd.DataFrame(columns=['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time', 'recording_device_type', 'recording_filename', 'session_id'])),
                          ])
def test_get_recordings(path, result):
    recs = ir._get_recordings(path)
    pd.testing.assert_frame_equal(recs, result)


@pytest.mark.parametrize('path,result',
                         [(CHILDREN_PATH, pd.read_csv(CHILDREN_PATH)),
                          (NON_EXISTING_PATH, pd.DataFrame(columns=['experiment', 'child_id', 'child_dob'])),
                          ])
def test_get_children(path, result):
    childn = ir._get_children(path)
    pd.testing.assert_frame_equal(childn, result)


INCORRECT_RECS_PATH = os.path.join('test', 'data', 'incorrect-recs')


@pytest.mark.parametrize('path,rec,result',
    [(RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav",
      {'experiment': 'test',
       'experiment_stage': 'Audio-1-familia-14T-CP-2020-02',
       'child_id': 'chi_14T',
       'date_iso': '2022-01-03',
       'start_time': '00:00:00',
       'recording_device_type': 'lena',
       'recording_filename': '14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav',
       'session_id': 'chi_14T_20220103',
       'duration': 0,
       'imported_at': '2022-10-26 14:49:10'}),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103-000000.wav", False),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20221403_000000.wav", False),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103_520000.wav", False),
     ])
def test_build_recording_metadata(path, rec, result):
    metadata = ir._build_recording_metadata(path, rec, 'test', 'lena')
    if result:
        result['imported_at'] = metadata['imported_at']  # copy the actual 'imported_at' value so the comparison stays consistent
    assert metadata == result
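# For reference, the parametrized cases above rely on the recording filename convention
# <child>_<experiment_stage>_<YYYYMMDD>_<HHMMSS>.wav (an assumption inferred from the
# expected metadata, not taken from import_recordings itself). The three "incorrect"
# names fail because their date/time part does not parse, roughly:
#
#     from datetime import datetime
#     stem = rec.rsplit('.', 1)[0]                    # drop the extension
#     child, rest = stem.split('_', 1)                # '14T'
#     stage, date, time = rest.rsplit('_', 2)         # 'Audio-1-familia-14T-CP-2020-02', '20220103', '000000'
#     datetime.strptime(date + time, '%Y%m%d%H%M%S')  # rejects month 14, hour 52, or a misplaced separator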
#depends on _get_recordings - _build_recording_metadata
def test_import_recordings():
    truth = os.path.join('tests', 'data', 'truth', 'new_recs_import.csv')
    set_up_dataset()

    #copy a new recording into the dataset
    shutil.copy2(os.path.join(INPUT_RECS, '23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'), TEST_RECS_PATH)
    #os.path.join(TEST_RECS_PATH,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'))

    ir.import_recordings(TEST_DATASET_PATH, 'test', 'lena')

    pd.testing.assert_frame_equal(pd.read_csv(TEST_RECS_META).drop(columns=['imported_at']),
                                  pd.read_csv(truth).drop(columns=['imported_at']),
                                  check_like=True)


#depends on _get_recordings - _get_children - import_recordings
def test_import_children():
    truth = os.path.join('tests', 'data', 'truth', 'new_children_import.csv')
    set_up_dataset()

    #copy a new recording into the dataset
    shutil.copy2(os.path.join(INPUT_RECS, '23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'), TEST_RECS_PATH)
    ir.import_recordings(TEST_DATASET_PATH, 'test', 'lena')

    #now import the child info
    ir.import_children(TEST_DATASET_PATH, 'test')

    pd.testing.assert_frame_equal(pd.read_csv(TEST_CHI_META), pd.read_csv(truth), check_like=True)


#COMMENTED OUT: as of now, this is just import_recordings followed by import_children, so it is already covered
#depends on import_recordings - import_children
#def test_data_importation():
#    truth_r = os.path.join('tests','data','truth','new_recs_import.csv')
#    truth_c = os.path.join('tests','data','truth','new_children_import.csv')
#
#    #copy a new rec into the dataset
#    shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'),TEST_RECS_PATH)
#
#    ir.test_data_importation(TEST_DATASET_PATH,'test','lena')
#
#    pd.testing.assert_frame_equal(pd.read_csv(TEST_RECS_META).drop(columns=['imported_at']),pd.read_csv(truth_r).drop(columns=['imported_at']),check_like=True)
#    pd.testing.assert_frame_equal(pd.read_csv(TEST_CHI_META),pd.read_csv(truth_c),check_like=True)

#####################################################
################# import_annotations ################

@pytest.mark.parametrize('remove',
                         [(False),
                          (True),
                          ])
def test_filter_missing_annotation_files(remove):
    set_up_dataset()
    load = pd.read_csv(TEST_ANN_META)
    load = load[~load['set'].isin({'alice_vtc'})]  # drop the alice_vtc lines: they have no raw files, so they would make the check fail
    res = load.copy()
    if remove:
        rm1 = 'vtc'
        rm2 = 'acoustic'
        res = res[~res['set'].isin([rm1, rm2])]
        shutil.rmtree(os.path.join(TEST_DATASET_PATH, 'annotations', rm1, 'raw'))
        os.remove(os.path.join(TEST_DATASET_PATH, 'annotations', rm2, 'raw', 'ACOUSTIC_VTC_20220103.csv'))
    res['exists'] = True  # the function always appends an 'exists' column, maybe drop it?

    pd.testing.assert_frame_equal(ia._filter_missing_annotation_files(TEST_DATASET_PATH, load), res, check_like=True)


def test_check_importation():
    #this function only emits warnings and checks; a test on the logging output will have to be added
    pass


@pytest.mark.parametrize('recording,empty',
                         [('14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav', False),
                          ('does_not_exist', True),
                          ])
def test_ia_get_recordings(recording, empty):
    annot_set = 'vtc'
    project = ChildProject(DATASET_PATH)
    project.read()
    if empty:
        res = pd.DataFrame(columns=['recording_filename', 'set', 'format', 'time_seek', 'range_onset', 'range_offset'])
        res = res.astype(dtype={'time_seek': 'int', 'range_onset': 'int', 'range_offset': 'int'})
    else:
        res = pd.read_csv(TEST_ANN_META)[['recording_filename', 'set', 'format', 'time_seek', 'range_onset', 'range_offset']]
        res = res[res['set'] == annot_set]

    df = ia._get_recordings(project, annot_set, 'vtc_rttm', recording)
    df.drop(columns='child_id', inplace=True)

    pd.testing.assert_frame_equal(res.reset_index(drop=True), df.reset_index(drop=True), check_like=True, check_index_type=False)


@pytest.mark.parametrize('ann_set,name,result',
                         [('vtc', '', '14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.rttm'),
                          ('acoustic', 'filename', 'filename'),
                          ])
def test_build_raw_filename(ann_set, name, result):
    recs = pd.read_csv(RECS_PATH)[['recording_filename']]
    annots = pd.read_csv(ANN_PATH)
    annots = annots[annots['set'] == ann_set][['recording_filename', 'raw_filename', 'filter']]
    annots['raw_filename'] = result

    df = ia._build_raw_filename(recs, ann_set, name)

    pd.testing.assert_frame_equal(annots.reset_index(drop=True), df.reset_index(drop=True), check_like=True)


#TODO add a test for a non-existing file (currently the import is aborted with a printed warning but does not fail)
@pytest.mark.parametrize('ann_set,file,rfaf,recording',
                         [('vtc', 'VTC_20220103.rttm', None, None),
                          ('acoustic', 'ACOUSTIC_VTC_20220103.csv', 'VTC_20220103.rttm', None),
                          ])
def test_import_annotation(ann_set, file, rfaf, recording):
    dtypes_forces = {'merged_from': 'str'}
    set_up_dataset()
    p = ChildProject(TEST_DATASET_PATH)
    am = AnnotationManager(p)
    if ann_set in {'vtc'}:
        #remove sets that were merged, because the dataset will fail with missing sets used in merges
        #TODO make this more general, or force childproject to remove the sets used
        am.remove_set('alice_vtc')
    am.remove_set(ann_set)

    p_truth = ChildProject(DATASET_PATH)
    am_truth = AnnotationManager(p_truth)
    annots_truth = am_truth.annotations.drop(columns=['imported_at'])
    if ann_set in {'vtc'}:
        annots_truth = annots_truth[~annots_truth['set'].isin({'alice_vtc'})]  # same reason

    ia._import_annotation(p, am, ann_set, file, rfaf, recording)

    annots = am.annotations.drop(columns=['imported_at']).sort_values(by='set').astype(dtypes_forces)
    annots_truth = annots_truth.sort_values(by='set').astype(dtypes_forces)
    print(annots)
    print(annots_truth)

    pd.testing.assert_frame_equal(annots_truth.reset_index(drop=True), annots.reset_index(drop=True), check_like=True)


#COMMENTED OUT: as of now, it just calls _import_annotation
#def test_import_annotations():
#    pass

###################################################
############## custom_converters ##################

single_rec_csv = os.path.join(DATASET_PATH, 'annotations', 'acoustic', 'converted', '14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv')
multi_rec_csv = os.path.join('tests', 'data', 'csv', 'multi_rec_vtc.csv')
no_rec_name = os.path.join(DATASET_PATH, 'annotations', 'vtc', 'converted', '14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv')
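# FilteredCsvConverter.convert(file, filter) is expected to return the rows of the csv
# whose recording_filename matches the filter, i.e. the same truth computed in the test
# below; a rough sketch of that expectation, not the converter's actual implementation:
#
#     df = pd.read_csv(file)
#     if filter:
#         df = df[df['recording_filename'].str.contains(filter)]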
@pytest.mark.parametrize('file,filter,fails,error',
                         [(single_rec_csv, '14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav', False, None),
                          (single_rec_csv, '', False, None),
                          (multi_rec_csv, 'VTC_20220124.wav', False, None),
                          (NON_EXISTING_PATH, '', True, FileNotFoundError),
                          (no_rec_name, '', True, KeyError),
                          ])
def test_filteredCsvConverter(file, filter, fails, error):
    if fails:
        with pytest.raises(error):
            df = cc.FilteredCsvConverter.convert(file, filter)
    else:
        df = cc.FilteredCsvConverter.convert(file, filter)
        truth = pd.read_csv(file)
        if filter:
            truth = truth[truth["recording_filename"].str.contains(filter)]
        pd.testing.assert_frame_equal(df, truth, check_like=True)

###################################################