"""
This file tests the different modules in import_data.
The pip package pytest must be installed.
"""
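# Typical invocation (assuming this file lives in the tests/ directory that the
# relative paths below are resolved against, run from the repository root):
#   pytest tests/test_importData.py -v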
import os
import sys
import pytest
import shutil
import pandas as pd

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

fpath = os.path.join(os.path.dirname(__file__), '..', 'import_data')
sys.path.append(fpath)

#from import_data.utils import get_raw_filename
#from utils import get_raw_filename
try:
    sys.modules.pop('utils')  # drop any previously imported 'utils' module so the local one is picked up
except KeyError:
    pass
import utils as id_utils
import prepare_data_set as pds
import import_recordings as ir
import import_annotations as ia
import custom_converters as cc
CHILDREN_FILE = os.path.join('dataset-test','metadata','children.csv')

DATASET_PATH = os.path.join('tests','existing_dataset') #dataset already existing, used for tests that don't change it
RECS_PATH = os.path.join(DATASET_PATH,'metadata','recordings.csv')
CHILDREN_PATH = os.path.join(DATASET_PATH,'metadata','children.csv')
ANN_PATH = os.path.join(DATASET_PATH,'metadata','annotations.csv')

NEW_DATASET_PATH = os.path.join('tests','new_dataset') #dataset created by tests from scratch
NON_EXISTING_PATH = 'non_existing'

TEST_DATASET_PATH = os.path.join('tests','test_dataset') #dataset used by tests that modify it
TEST_RECS_META = os.path.join(TEST_DATASET_PATH,'metadata','recordings.csv')
TEST_RECS_PATH = os.path.join(TEST_DATASET_PATH,'recordings','raw')
TEST_CHI_META = os.path.join(TEST_DATASET_PATH,'metadata','children.csv')
TEST_ANN_META = os.path.join(TEST_DATASET_PATH,'metadata','annotations.csv')

INPUT_RECS = os.path.join('tests','data','recs')


def set_up_dataset():
    if os.path.exists(TEST_DATASET_PATH):
        shutil.rmtree(TEST_DATASET_PATH)
    shutil.copytree(DATASET_PATH, TEST_DATASET_PATH)
################### utils ################

@pytest.mark.parametrize('file,result',
    [(CHILDREN_FILE, "children"),])
def test_get_raw_filename(file, result):
    assert id_utils.get_raw_filename(file) == result
    #assert get_raw_filename(file) == result


@pytest.mark.parametrize('path,ext,full_path,result',
    [(DATASET_PATH, ["csv","rttm"], True,
      sorted([os.path.join(DATASET_PATH,'annotations/vtc/raw/VTC_20220103.rttm'),
              os.path.join(DATASET_PATH,'annotations/vtc/raw/VTC_20220124.rttm'),
              os.path.join(DATASET_PATH,'metadata/annotations.csv'),
              os.path.join(DATASET_PATH,'metadata/children.csv'),
              os.path.join(DATASET_PATH,'metadata/recordings.csv'),
              os.path.join(DATASET_PATH,'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv'),
              os.path.join(DATASET_PATH,'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv'),
              os.path.join(DATASET_PATH,'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv'),
              os.path.join(DATASET_PATH,'extra/messages/generated/messages_20220103.csv'),
              os.path.join(DATASET_PATH,'extra/metrics/metrics.csv'),
              ])),
     (DATASET_PATH, ["csv","rttm"], False,
      sorted(['annotations/vtc/raw/VTC_20220103.rttm',
              'annotations/vtc/raw/VTC_20220124.rttm',
              'metadata/annotations.csv',
              'metadata/children.csv',
              'metadata/recordings.csv',
              'annotations/vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/alice/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/vcm/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/conversations/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/acoustic/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/alice_vtc/converted/14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv',
              'annotations/acoustic/raw/ACOUSTIC_VTC_20220103.csv',
              'annotations/conversations/raw/CONVERSATIONS_VTC_20220103.csv',
              'extra/messages/generated/messages_20220103.csv',
              'extra/metrics/metrics.csv',
              ])),
     (os.path.join(DATASET_PATH,'metadata'), [], False,
      sorted(['annotations.csv',
              'children.csv',
              'recordings.csv',
              'readme.md',
              ])),
     (NON_EXISTING_PATH, ["csv","rttm"], False,
      []),
     (DATASET_PATH, "invented", True,
      []),
    ])
def test_walk_dir(path, ext, full_path, result):
    r = id_utils.walk_dir(path, ext, full_path)
    print(r)
    assert r == result
##########################################
############# prepare_data_set ###########

#tree when creating a new empty dataset from the NEW_DATASET_PATH var
CREATION_TREE = [(NEW_DATASET_PATH, ['metadata', 'extra', 'annotations', 'recordings'], []),
                 (os.path.join(NEW_DATASET_PATH,'metadata'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'extra'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations'), ['vtc', 'acoustic', 'vcm', 'conversations', 'alice'], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vtc'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vtc/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/acoustic'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/acoustic/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vcm'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/vcm/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/conversations'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/conversations/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'annotations/alice'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'annotations/alice/raw'), [], ['.gitkeep']),
                 (os.path.join(NEW_DATASET_PATH,'recordings'), ['raw'], []),
                 (os.path.join(NEW_DATASET_PATH,'recordings/raw'), [], ['.gitkeep']),
                 ]

def test_create_child_project_directories():
    pds.create_child_project_directories(NEW_DATASET_PATH)
    tree = list(os.walk(NEW_DATASET_PATH))
    shutil.rmtree(NEW_DATASET_PATH)
    assert tree == CREATION_TREE
##########################################
########## import_recordings #############

@pytest.mark.parametrize('path,result',
    [(RECS_PATH, pd.read_csv(RECS_PATH)),
     (NON_EXISTING_PATH, pd.DataFrame(columns = ['experiment', 'experiment_stage', 'child_id', 'date_iso', 'start_time',
                                                 'recording_device_type', 'recording_filename', 'session_id'])),])
def test_get_recordings(path, result):
    recs = ir._get_recordings(path)
    pd.testing.assert_frame_equal(recs, result)


@pytest.mark.parametrize('path,result',
    [(CHILDREN_PATH, pd.read_csv(CHILDREN_PATH)),
     (NON_EXISTING_PATH, pd.DataFrame(columns = ['experiment', 'child_id', 'child_dob'])),])
def test_get_children(path, result):
    childn = ir._get_children(path)
    pd.testing.assert_frame_equal(childn, result)


INCORRECT_RECS_PATH = os.path.join('test','data','incorrect-recs')

@pytest.mark.parametrize('path,rec,result',
    [(RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav", {'experiment': 'test', 'experiment_stage': 'Audio-1-familia-14T-CP-2020-02', 'child_id': 'chi_14T', 'date_iso': '2022-01-03', 'start_time': '00:00:00', 'recording_device_type': 'lena', 'recording_filename': '14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav', 'session_id': 'chi_14T_20220103', 'duration': 0, 'imported_at': '2022-10-26 14:49:10'}),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103-000000.wav", False),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20221403_000000.wav", False),
     (INCORRECT_RECS_PATH, "14T_Audio-1-familia-14T-CP-2020-02_20220103_520000.wav", False),])
def test_build_recording_metadata(path, rec, result):
    metadata = ir._build_recording_metadata(path, rec, 'test', 'lena')
    if result:
        result['imported_at'] = metadata['imported_at'] #copy the generated imported_at value so the comparison stays deterministic
    assert metadata == result
#dependent on _get_recordings - _build_recording_metadata
def test_import_recordings():
    truth = os.path.join('tests','data','truth','new_recs_import.csv')

    set_up_dataset()
    #copy a new rec into the dataset
    shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'), TEST_RECS_PATH)
    #os.path.join(TEST_RECS_PATH,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'))

    ir.import_recordings(TEST_DATASET_PATH,'test','lena')

    pd.testing.assert_frame_equal(pd.read_csv(TEST_RECS_META).drop(columns=['imported_at']), pd.read_csv(truth).drop(columns=['imported_at']), check_like=True)


#dependent on _get_recordings - _get_children - import_recordings
def test_import_children():
    truth = os.path.join('tests','data','truth','new_children_import.csv')

    set_up_dataset()
    #copy a new rec into the dataset
    shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'), TEST_RECS_PATH)
    ir.import_recordings(TEST_DATASET_PATH,'test','lena')

    #now import the child info
    ir.import_children(TEST_DATASET_PATH, 'test')

    pd.testing.assert_frame_equal(pd.read_csv(TEST_CHI_META), pd.read_csv(truth), check_like=True)
#COMMENTED OUT: as of now this is just import_recordings followed by import_children, so it is already covered above
#dependent on import_recordings - import_children
#def test_data_importation():
#    truth_r = os.path.join('tests','data','truth','new_recs_import.csv')
#    truth_c = os.path.join('tests','data','truth','new_children_import.csv')
#
#    #copy a new rec into the dataset
#    shutil.copy2(os.path.join(INPUT_RECS,'23T_Audio-3-familia-23T-lectura-01_20220124_000000.wav'), TEST_RECS_PATH)
#
#    ir.test_data_importation(TEST_DATASET_PATH,'test','lena')
#
#    pd.testing.assert_frame_equal(pd.read_csv(TEST_RECS_META).drop(columns=['imported_at']), pd.read_csv(truth_r).drop(columns=['imported_at']), check_like=True)
#    pd.testing.assert_frame_equal(pd.read_csv(TEST_CHI_META), pd.read_csv(truth_c), check_like=True)
#####################################################
################# import_annotations ################

@pytest.mark.parametrize('remove',
    [(False),
     (True),])
def test_filter_missing_annotation_files(remove):
    set_up_dataset()
    load = pd.read_csv(TEST_ANN_META)
    load = load[~load['set'].isin({'alice_vtc'})] #drop the alice_vtc lines: they have no raw files, so they would make the check fail spuriously
    res = load.copy()
    if remove:
        rm1 = 'vtc'
        rm2 = 'acoustic'
        res = res[~res['set'].isin([rm1, rm2])]
        shutil.rmtree(os.path.join(TEST_DATASET_PATH,'annotations',rm1,'raw'))
        os.remove(os.path.join(TEST_DATASET_PATH,'annotations',rm2,'raw','ACOUSTIC_VTC_20220103.csv'))
    res['exists'] = True #the function always appends an 'exists' column, maybe drop it?
    pd.testing.assert_frame_equal(ia._filter_missing_annotation_files(TEST_DATASET_PATH, load), res, check_like=True)
def test_check_importation():
    #this function only generates warnings and consistency checks; a test asserting on its logging output still needs to be added
    pass
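#Possible sketch for that missing test, skipped on purpose: pytest's built-in caplog
#fixture can capture log records, assuming check_importation reports through the
#standard logging module. The function name, signature and expected outcome used
#below are assumptions and will need to be adapted to the real module.
@pytest.mark.skip(reason="sketch only: depends on check_importation's actual name, signature and logging behaviour")
def test_check_importation_logging(caplog):
    set_up_dataset()
    project = ChildProject(TEST_DATASET_PATH)
    am = AnnotationManager(project)
    with caplog.at_level("WARNING"):
        ia.check_importation(project, am)  # assumed name and signature
    #assumption: a freshly copied, consistent test dataset should raise no warnings
    assert caplog.records == []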
@pytest.mark.parametrize('recording,empty',
    [('14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav', False),
     ('does_not_exist', True),])
def test_ia_get_recordings(recording, empty):
    annot_set = 'vtc'
    project = ChildProject(DATASET_PATH)
    project.read()
    if empty:
        res = pd.DataFrame(columns=['recording_filename','set','format','time_seek','range_onset','range_offset'])
        res = res.astype(dtype={'time_seek': 'int', 'range_onset': 'int', 'range_offset': 'int'})
    else:
        res = pd.read_csv(TEST_ANN_META)[['recording_filename','set','format','time_seek','range_onset','range_offset']]
        res = res[res['set'] == annot_set]
    df = ia._get_recordings(project, annot_set, 'vtc_rttm', recording)
    df.drop(columns='child_id', inplace=True)
    pd.testing.assert_frame_equal(res.reset_index(drop=True), df.reset_index(drop=True), check_like=True, check_index_type=False)


@pytest.mark.parametrize('ann_set,name,result',
    [('vtc','','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.rttm'),
     ('acoustic','filename','filename'),])
def test_build_raw_filename(ann_set, name, result):
    recs = pd.read_csv(RECS_PATH)[['recording_filename']]
    annots = pd.read_csv(ANN_PATH)
    annots = annots[annots['set'] == ann_set][['recording_filename','raw_filename','filter']]
    annots['raw_filename'] = result
    df = ia._build_raw_filename(recs, ann_set, name)
    pd.testing.assert_frame_equal(annots.reset_index(drop=True), df.reset_index(drop=True), check_like=True)


#TODO add a test for a non-existing raw file (currently the import is aborted with a printed warning but does not fail)
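#Hedged sketch for the TODO above, skipped on purpose: the expected behaviour
#(warning printed, no exception raised, nothing added to the index) is only
#inferred from the comment and may need adjusting once that behaviour is settled.
@pytest.mark.skip(reason="sketch only: missing-file behaviour still to be specified")
def test_import_annotation_missing_file():
    set_up_dataset()
    p = ChildProject(TEST_DATASET_PATH)
    am = AnnotationManager(p)
    before = len(am.annotations)
    #the raw file name below deliberately does not exist in the test dataset
    ia._import_annotation(p, am, 'vtc', 'DOES_NOT_EXIST.rttm', None, None)
    assert len(am.annotations) == before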
@pytest.mark.parametrize('ann_set,file,rfaf,recording',
    [('vtc','VTC_20220103.rttm',None,None),
     ('acoustic','ACOUSTIC_VTC_20220103.csv','VTC_20220103.rttm',None),])
def test_import_annotation(ann_set, file, rfaf, recording):
    dtypes_forces = {'merged_from': 'str'}
    set_up_dataset()
    p = ChildProject(TEST_DATASET_PATH)
    am = AnnotationManager(p)
    if ann_set in {'vtc'}: am.remove_set('alice_vtc') #remove sets built by merging, otherwise the dataset fails when a set used in a merge is missing
    #TODO make this more general, or force childproject to remove sets used
    am.remove_set(ann_set)

    p_truth = ChildProject(DATASET_PATH)
    am_truth = AnnotationManager(p_truth)
    annots_truth = am_truth.annotations.drop(columns=['imported_at'])
    if ann_set in {'vtc'}: annots_truth = annots_truth[~annots_truth['set'].isin({'alice_vtc'})] #same reason as above

    ia._import_annotation(p, am, ann_set, file, rfaf, recording)

    annots = am.annotations.drop(columns=['imported_at']).sort_values(by='set').astype(dtypes_forces)
    annots_truth = annots_truth.sort_values(by='set').astype(dtypes_forces)
    print(annots)
    print(annots_truth)
    pd.testing.assert_frame_equal(annots_truth.reset_index(drop=True), annots.reset_index(drop=True), check_like=True)


#COMMENTED OUT: as of now this just calls _import_annotation, so it is already covered above
#def test_import_annotations():
#    pass
###################################################
############## custom_converters ##################

single_rec_csv = os.path.join(DATASET_PATH,'annotations','acoustic','converted','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv')
multi_rec_csv = os.path.join('tests','data','csv','multi_rec_vtc.csv')
no_rec_name = os.path.join(DATASET_PATH,'annotations','vtc','converted','14T_Audio-1-familia-14T-CP-2020-02_20220103_000000_0_60000.csv')

@pytest.mark.parametrize('file,filter,fails,error',
    [(single_rec_csv,'14T_Audio-1-familia-14T-CP-2020-02_20220103_000000.wav',False,None),
     (single_rec_csv,'',False,None),
     (multi_rec_csv,'VTC_20220124.wav',False,None),
     (NON_EXISTING_PATH,'',True,FileNotFoundError),
     (no_rec_name,'',True,KeyError),
    ])
def test_filteredCsvConverter(file, filter, fails, error):
    if fails:
        with pytest.raises(error):
            df = cc.FilteredCsvConverter.convert(file, filter)
    else:
        df = cc.FilteredCsvConverter.convert(file, filter)
        truth = pd.read_csv(file)
        if filter:
            truth = truth[truth["recording_filename"].str.contains(filter)]
        pd.testing.assert_frame_equal(df, truth, check_like=True)
###################################################