123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344 |
"""
This file tests the different modules in import_data.
You must have the pip package pytest installed.
"""
- import os
- import sys
- import pytest
- import shutil
- import pandas as pd
- from ChildProject.projects import ChildProject
- from ChildProject.annotations import AnnotationManager
- fpath = os.path.join(os.path.dirname(__file__),'..', 'import_data')
- sys.path.append(fpath)
- #from import_data.utils import get_raw_filename
- #from utils import get_raw_filename
# Drop any previously imported 'utils' module so the import below resolves to
# the local import_data/utils.py rather than an already-cached, conflicting one.
# pop() with a default is a no-op when 'utils' was never imported, which avoids
# the bare `except:` that would also have swallowed SystemExit/KeyboardInterrupt.
sys.modules.pop('utils', None)
import utils as id_utils
- import prepare_data_set as pds
- import import_recordings as ir
- import import_annotations as ia
- import custom_converters as cc
# children metadata file inside a (relative) dataset layout
CHILDREN_FILE = os.path.join('dataset-test','metadata','children.csv')
DATASET_PATH = os.path.join('tests','existing_dataset') #dataset already existing, used for tests that don't change it
# metadata csv files of the read-only reference dataset
RECS_PATH = os.path.join(DATASET_PATH,'metadata','recordings.csv')
CHILDREN_PATH = os.path.join(DATASET_PATH,'metadata','children.csv')
ANN_PATH = os.path.join(DATASET_PATH,'metadata','annotations.csv')
NEW_DATASET_PATH = os.path.join('tests','new_dataset') #dataset created by test from scratch
NON_EXISTING_PATH = 'non_existing'
TEST_DATASET_PATH = os.path.join('tests','test_dataset') #dataset to use for tests changing it
# metadata and raw-recordings locations inside the mutable scratch dataset
TEST_RECS_META = os.path.join(TEST_DATASET_PATH,'metadata','recordings.csv')
TEST_RECS_PATH = os.path.join(TEST_DATASET_PATH,'recordings','raw')
TEST_CHI_META = os.path.join(TEST_DATASET_PATH,'metadata','children.csv')
TEST_ANN_META = os.path.join(TEST_DATASET_PATH,'metadata','annotations.csv')
# source wav files to copy into datasets during import tests
INPUT_RECS = os.path.join('tests','data','recs')
def set_up_dataset():
    """Give destructive tests a fresh, disposable copy of the reference dataset.

    Deletes any leftover TEST_DATASET_PATH from a previous run, then re-copies
    DATASET_PATH so the test may freely mutate the copy.
    """
    if os.path.exists(TEST_DATASET_PATH):
        shutil.rmtree(TEST_DATASET_PATH)
    shutil.copytree(DATASET_PATH, TEST_DATASET_PATH)
-
- ################### utils ################
@pytest.mark.parametrize('file,result',
                         [(CHILDREN_FILE, "children"),])
def test_get_raw_filename(file, result):
    """get_raw_filename must return the bare file name, without directory or extension."""
    assert id_utils.get_raw_filename(file) == result
-
@pytest.mark.parametrize('path,ext,full_path,result',
                         [(DATASET_PATH, ["csv","rttm"], True,
                           sorted([os.path.join(DATASET_PATH,'annotations/vtc/raw/VTC_20220103.rttm'),
                                   os.path.join(DATASET_PATH,'annotations/vtc/raw/VTC_20220124.rttm'),
                                   os.path.join(DATASET_PATH,'metadata/annotations.csv'),
                                   os.path.join(DATASET_PATH,'metadata/children.csv'),
                                   os.path.join(DATASET_PATH,'metadata/recordings.csv'),
                                   os.path.join(DATASET_PATH,'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
                                   os.path.join(DATASET_PATH,'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
                                   os.path.join(DATASET_PATH,'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
                                   os.path.join(DATASET_PATH,'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
                                   os.path.join(DATASET_PATH,'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
                                   os.path.join(DATASET_PATH,'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
                                   os.path.join(DATASET_PATH,'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv'),
                                   os.path.join(DATASET_PATH,'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv'),
                                   os.path.join(DATASET_PATH,'extra/messages/generated/messages_20220103.csv'),
                                   os.path.join(DATASET_PATH,'extra/metrics/metrics.csv'),
                                   ])),
                          (DATASET_PATH, ["csv","rttm"], False,
                           sorted(['annotations/vtc/raw/VTC_20220103.rttm',
                                   'annotations/vtc/raw/VTC_20220124.rttm',
                                   'metadata/annotations.csv',
                                   'metadata/children.csv',
                                   'metadata/recordings.csv',
                                   'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
                                   'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
                                   'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
                                   'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
                                   'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
                                   'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
                                   'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv',
                                   'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv',
                                   'extra/messages/generated/messages_20220103.csv',
                                   'extra/metrics/metrics.csv',
                                   ])),
                          (os.path.join(DATASET_PATH,'metadata'), [], False,
                           sorted(['annotations.csv',
                                   'children.csv',
                                   'recordings.csv',
                                   'readme.md',
                                   ])),
                          (NON_EXISTING_PATH, ["csv","rttm"], False,
                           []),
                          (DATASET_PATH, "invented", True,
                           []),
                          ])
def test_walk_dir(path, ext, full_path, result):
    """walk_dir must list files recursively, filtered by extension.

    Covers: full vs relative paths, no extension filter (everything matched),
    a non-existing directory (empty result) and an unmatched extension.
    """
    assert id_utils.walk_dir(path, ext, full_path) == result
-
- ##########################################
-
- ############# prepare_data_set ###########
- #tree when creating a new empty dataset from the NEW_DATASET_PATH var
# Expected os.walk() output when creating a new empty dataset at NEW_DATASET_PATH:
# every directory of the ChildProject skeleton, each leaf holding a '.gitkeep'.
# NOTE(review): os.walk directory ordering is filesystem-dependent — this list
# assumes creation order is preserved; confirm on the CI filesystem.
CREATION_TREE = [(NEW_DATASET_PATH, ['metadata', 'extra', 'annotations', 'recordings'], []),
                 (os.path.join(NEW_DATASET_PATH,'metadata'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'extra'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations'), ['vtc', 'acoustic', 'vcm', 'conversations', 'alice'], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vtc'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vtc/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/acoustic'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/acoustic/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vcm'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vcm/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/conversations'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/conversations/raw'),[], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/alice'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/alice/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'recordings'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'recordings/raw'), [], ['.gitkeep']),
                 ]
def test_create_child_project_directories():
    """Creating a dataset from scratch must produce the expected directory skeleton."""
    pds.create_child_project_directories(NEW_DATASET_PATH)
    try:
        tree = list(os.walk(NEW_DATASET_PATH))
    finally:
        # clean up even if os.walk fails, so reruns start from a blank state
        shutil.rmtree(NEW_DATASET_PATH)

    assert tree == CREATION_TREE
-
- ##########################################
-
- ########## import_recordings #############
@pytest.mark.parametrize('path,result',
                         [(RECS_PATH, pd.read_csv(RECS_PATH)),
                          (NON_EXISTING_PATH, pd.DataFrame(columns = ['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time',
                                                                     'recording_device_type', 'recording_filename', 'session_id'])),])
def test_get_recordings(path,result):
    """An existing recordings csv is loaded as-is; a missing one yields an empty frame with the standard columns."""
    pd.testing.assert_frame_equal(ir._get_recordings(path), result)
-
-
-
@pytest.mark.parametrize('path,result',
                         [(CHILDREN_PATH, pd.read_csv(CHILDREN_PATH)),
                          (NON_EXISTING_PATH, pd.DataFrame(columns = ['experiment', 'child_id', 'child_dob'])),])
def test_get_children(path,result):
    """An existing children csv is loaded as-is; a missing one yields an empty frame with the standard columns."""
    pd.testing.assert_frame_equal(ir._get_children(path), result)
# NOTE(review): every other fixture path in this file uses 'tests' — confirm
# that 'test' is intentional here and not a typo.
INCORRECT_RECS_PATH = os.path.join('test','data','incorrect-recs')
@pytest.mark.parametrize('path,rec,result',
                         [(RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav", {'experiment': 'test', 'experiment_stage': 'Audio-1-familia-14T-CP-2020-02', 'child_id': 'chi_14T', 'date_iso': '2022-01-03', 'start_time': '00:00:00', 'recording_device_type': 'lena', 'recording_filename': '14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav', 'session_id': 'chi_14T_20220103', 'duration': 0, 'imported_at': '2022-10-26 14:49:10'}),
                          (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103-000000.wav", False),
                          (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20221403_000000.wav", False),
                          (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103_520000.wav", False),])
def test_build_recording_metadata(path, rec, result):
    """A well-formed filename yields the full metadata dict; malformed names
    (wrong separator, impossible month, impossible hour) yield False."""
    metadata = ir._build_recording_metadata(path,rec,'test', 'lena')
    if result:
        result['imported_at'] = metadata['imported_at'] #forced to get the imported at value for consistency
    assert metadata == result
-
-
-
#dependent on _get_recordings - _build_recording_metadata
def test_import_recordings():
    """Importing a newly dropped wav must append its row to recordings.csv."""
    truth = os.path.join('tests','data','truth','new_recs_import.csv')
    set_up_dataset()

    #copy a new rec into the dataset
    shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'),TEST_RECS_PATH)
    ir.import_recordings(TEST_DATASET_PATH,'test','lena')

    # 'imported_at' is generated at import time, so it is excluded from the comparison
    pd.testing.assert_frame_equal(pd.read_csv(TEST_RECS_META).drop(columns=['imported_at']),
                                  pd.read_csv(truth).drop(columns=['imported_at']),
                                  check_like=True)
-
-
#dependent on _get_recordings - _get_children - import_recordings
def test_import_children():
    """After importing a new recording, importing children must add its child to children.csv."""
    expected = os.path.join('tests','data','truth','new_children_import.csv')
    set_up_dataset()

    # drop a fresh recording into the scratch dataset and register it
    new_rec = os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav')
    shutil.copy2(new_rec, TEST_RECS_PATH)
    ir.import_recordings(TEST_DATASET_PATH,'test','lena')

    # now derive the children metadata from the imported recordings
    ir.import_children(TEST_DATASET_PATH, 'test')

    pd.testing.assert_frame_equal(pd.read_csv(TEST_CHI_META), pd.read_csv(expected), check_like=True)
-
- #COMMENTED OUT : as of now, is just import_recordings followed by import_children, so already tested
- #dependent on import_recordings - import_children
- #def test_data_importation():
- # truth_r = os.path.join('tests','data','truth','new_recs_import.csv')
- # truth_c = os.path.join('tests','data','truth','new_children_import.csv')
- #
- # #copy a new rec into the dataset
- # shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'),TEST_RECS_PATH)
- #
- # ir.test_data_importation(TEST_DATASET_PATH,'test','lena')
- #
- # pd.testing.assert_frame_equal(pd.read_csv(TEST_RECS_META).drop(columns=['imported_at']),pd.read_csv(truth_r).drop(columns=['imported_at']),check_like=True)
- # pd.testing.assert_frame_equal(pd.read_csv(TEST_CHI_META),pd.read_csv(truth_c),check_like=True)
-
- #####################################################
-
- ################# import_annotations ################
-
@pytest.mark.parametrize('remove',
                         [(False),
                          (True),])
def test_filter_missing_annotation_files(remove):
    """Rows whose raw annotation file is gone must be filtered out.

    With remove=True, a whole raw set directory and one raw file are deleted
    and the matching index rows are expected to disappear from the result.
    """
    set_up_dataset()
    load = pd.read_csv(TEST_ANN_META)
    # remove alice_vtc lines as they don't have raw files with them, so this would give a bad check
    load = load[~load['set'].isin({'alice_vtc'})]
    res = load.copy()
    if remove:
        rm1 = 'vtc'
        rm2 = 'acoustic'
        res = res[~res['set'].isin([rm1,rm2])]
        shutil.rmtree(os.path.join(TEST_DATASET_PATH,'annotations',rm1,'raw'))
        os.remove(os.path.join(TEST_DATASET_PATH,'annotations',rm2,'raw','ACOUSTIC_VTC_20220103.csv'))

    res['exists'] = True #the function always appends a 'exists' column, maybe drop it?
    pd.testing.assert_frame_equal(ia._filter_missing_annotation_files(TEST_DATASET_PATH,load), res, check_like=True)
def test_check_importation():
    """Placeholder: the checked function only generates warnings and checks.

    TODO: assert on the logging output (e.g. with pytest's caplog fixture).
    """
    pass
@pytest.mark.parametrize('recording,empty',
                         [('14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav',False),
                          ('does_not_exist',True),])
def test_ia_get_recordings(recording,empty):
    """_get_recordings must return the set's rows for a known recording and an
    empty (but correctly typed) frame for an unknown one."""
    annot_set = 'vtc'
    project = ChildProject(DATASET_PATH)
    project.read()
    if empty:
        # empty expectation: same columns, with the int dtypes the function is expected to produce
        res = pd.DataFrame(columns=['recording_filename','set','format','time_seek','range_onset', 'range_offset'])
        res = res.astype(dtype={'time_seek': 'int' , 'range_onset': 'int','range_offset': 'int'})
    else:
        # expectation taken from the reference annotations index, restricted to the vtc set
        res = pd.read_csv(TEST_ANN_META)[['recording_filename','set','format','time_seek','range_onset', 'range_offset']]
        res = res[res['set'] == annot_set]

    df = ia._get_recordings(project, annot_set,'vtc_rttm',recording)
    # child_id is added by the function but not part of what this test checks
    df.drop(columns='child_id', inplace=True)

    pd.testing.assert_frame_equal(res.reset_index(drop=True),df.reset_index(drop=True),check_like=True, check_index_type=False)
@pytest.mark.parametrize('ann_set,name,result',
                         [('vtc','','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.rttm'),
                          ('acoustic','filename','filename'),])
def test_build_raw_filename(ann_set,name,result):
    """_build_raw_filename must derive the raw filename from the recording name
    when no explicit name is given, and keep an explicit name unchanged."""
    recs = pd.read_csv(RECS_PATH)[['recording_filename']]
    annots = pd.read_csv(ANN_PATH)
    annots = annots[annots['set'] == ann_set][['recording_filename','raw_filename','filter']]

    # overwrite the reference raw_filename with the value expected for this case
    annots['raw_filename'] = result

    df = ia._build_raw_filename(recs, ann_set, name)

    pd.testing.assert_frame_equal(annots.reset_index(drop=True),df.reset_index(drop=True), check_like=True)
-
-
- #TODO add test for file non existing(as currently the import is aborted and prints a warning but does not fail)
- @pytest.mark.parametrize('ann_set,file,rfaf,recording',
- [('vtc','VTC_20220103.rttm',None,None),
- ('acoustic','ACOUSTIC_VTC_20220103.csv','VTC_20220103.rttm',None),])
- def test_import_annotation(ann_set,file,rfaf,recording):
- dtypes_forces = {'merged_from': 'str'}
-
- set_up_dataset()
- p = ChildProject(TEST_DATASET_PATH)
- am = AnnotationManager(p)
- if ann_set in {'vtc'} : am.remove_set('alice_vtc') #remove sets that were merged because dataset will fail with missing sets used in merges
- #TODO make this more general, or force childproject to remove sets used
- am.remove_set(ann_set)
-
- p_truth = ChildProject(DATASET_PATH)
- am_truth = AnnotationManager(p_truth)
- annots_truth = am_truth.annotations.drop(columns=['imported_at'])
- if ann_set in {'vtc'} : annots_truth = annots_truth[~annots_truth['set'].isin({'alice_vtc'})] #same reason
-
- ia._import_annotation(p,am, ann_set, file,rfaf,recording)
- annots = am.annotations.drop(columns=['imported_at']).sort_values(by='set').astype(dtypes_forces)
- annots_truth = annots_truth.sort_values(by='set').astype(dtypes_forces)
- print(annots)
- print(annots_truth)
-
- pd.testing.assert_frame_equal(annots_truth.reset_index(drop=True),annots.reset_index(drop=True),check_like=True)
- #COMMENTED OUT : as of now, just calls _import_annotation
- #def test_import_annotations():
- # pass
- ###################################################
-
- ############## custom_converters ##################
-
# converted csv holding annotations for a single recording
single_rec_csv = os.path.join(DATASET_PATH,'annotations','acoustic','converted','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv')
# csv holding annotations for several recordings, to exercise filtering
multi_rec_csv = os.path.join('tests','data', 'csv','multi_rec_vtc.csv')
# csv lacking a 'recording_filename' column, to exercise the KeyError path
no_rec_name = os.path.join(DATASET_PATH,'annotations','vtc','converted','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv')
@pytest.mark.parametrize('file,name_filter,fails,error',
                         [(single_rec_csv,'14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav',False,None),
                          (single_rec_csv,'',False,None),
                          (multi_rec_csv,'VTC_20220124.wav',False,None),
                          (NON_EXISTING_PATH,'',True,FileNotFoundError),
                          (no_rec_name,'',True,KeyError),
                          ])
def test_filteredCsvConverter(file, name_filter, fails, error):
    """FilteredCsvConverter.convert loads a csv and keeps only rows matching the recording filter.

    Failing inputs (missing file, missing 'recording_filename' column) must
    raise the given exception type.  The parameter was renamed from 'filter'
    to avoid shadowing the builtin.
    """
    if fails:
        with pytest.raises(error):
            cc.FilteredCsvConverter.convert(file, name_filter)
        return  # 'df' is never assigned on this path, so the comparison below must not run

    df = cc.FilteredCsvConverter.convert(file, name_filter)

    truth = pd.read_csv(file)
    if name_filter:
        truth = truth[truth["recording_filename"].str.contains(name_filter)]

    pd.testing.assert_frame_equal(df, truth, check_like=True)
-
- ###################################################
-
|