#!/usr/bin/env python
# -*- coding: utf8 -*-
#
#   Author: William N. Havard
#
"""Sanity-check that recordings, CHA annotations and metadata agree.

Run from the project's root.  The script verifies that every WAV file under
``./recordings/raw`` has a matching CHA file under ``./annotations/cha/raw``
and a matching row in ``./metadata/recordings.csv`` (and vice-versa), matching
files by their base name without extension.
"""

import os
from collections import Counter

import pandas as pd


def _raw_filename(path):
    """Return the base name of *path* with its last extension removed."""
    return os.path.splitext(os.path.basename(path))[0]


def _list_duplicated(list_):
    """Return the items occurring more than once in *list_*.

    Each duplicated item is reported once, in order of first appearance.
    Items must be hashable.
    """
    # Counter gives a single O(n) pass instead of calling list.count per item.
    return [item for item, count in Counter(list_).items() if count > 1]


def walk_dir(path, ext='', return_full_path=True):
    """Recursively list the files under *path* whose name ends with *ext*.

    :param path: directory to walk.
    :param ext: required file-name suffix; the default ``''`` matches every
        file.
    :param return_full_path: when true, each entry is prefixed with *path*;
        otherwise entries are relative to *path*.
    :return: sorted list of matching file paths.
    """
    files = []
    for current_dir, _dirs, filenames in os.walk(path):
        # relpath never yields a leading separator, so os.path.join below
        # cannot discard `path` (which happened with commonprefix/replace).
        sub_dir = os.path.relpath(current_dir, path)
        if sub_dir == os.curdir:
            # Files directly inside `path` get no './' prefix.
            sub_dir = ''
        for filename in filenames:
            if not filename.endswith(ext):
                continue
            if return_full_path:
                files.append(os.path.join(path, sub_dir, filename))
            else:
                files.append(os.path.join(sub_dir, filename))
    return sorted(files)


def _require_none_missing(missing, message):
    """Raise ``ValueError(message + listing)`` if *missing* is not empty."""
    if missing:
        raise ValueError('{}\n{}'.format(message,
                                         '\n\t- '.join(sorted(missing))))


def main():
    """Validate the recordings / annotations / metadata layout.

    Prints a confirmation message when every check passes.

    :raises IOError: if one of the expected directories does not exist
        (i.e. the script is not run from the project's root).
    :raises ValueError: if there are duplicated WAV/CHA base names, or if the
        WAV files, CHA files and metadata rows do not map one-to-one.
    """
    RECORDINGS = './recordings/raw'
    ANNOTATIONS_CHA = './annotations/cha/raw'
    METADATA = './metadata'

    # Explicit raises instead of `assert`: asserts are stripped under -O,
    # which would turn this validation script into a no-op.
    if not all(map(os.path.exists, [RECORDINGS, ANNOTATIONS_CHA, METADATA])):
        raise IOError("Run me from the project's root!")

    wav_files = [_raw_filename(f) for f in
                 walk_dir(RECORDINGS, ext='.wav', return_full_path=False)]
    cha_files = [_raw_filename(f) for f in
                 walk_dir(ANNOTATIONS_CHA, ext='.cha', return_full_path=False)]

    metadata = pd.read_csv(os.path.join(METADATA, 'recordings.csv'))
    metadata['recording_filename_raw'] = \
        metadata['recording_filename'].apply(_raw_filename)

    # Check there are no duplicate WAV or CHA files
    for label, names in (('WAV', wav_files), ('CHA', cha_files)):
        duplicated = _list_duplicated(names)
        if duplicated:
            raise ValueError(
                'Duplicate {} files! ({})'.format(label, duplicated))

    wav_set = set(wav_files)
    cha_set = set(cha_files)
    metadata_set = set(metadata['recording_filename_raw'])

    # Check that each WAV file is mapped to a CHA file and vice-versa
    _require_none_missing(wav_set - cha_set,
                          'Missing CHA files for some WAV files!')
    _require_none_missing(cha_set - wav_set,
                          'Missing WAV files for some CHA files!')

    # Check that each WAV file appears in the metadata and vice-versa.
    # Each message lists the names that are actually missing: metadata
    # entries without a WAV file, then WAV files without a metadata entry.
    _require_none_missing(
        metadata_set - wav_set,
        'Missing WAV files that are mentioned in the metadata!')
    _require_none_missing(
        wav_set - metadata_set,
        'Missing `recording_filename_raw` in metadata corresponding to WAV '
        'files found in ./recordings!')

    print('Everything is looking good. Congrats!')


if __name__ == '__main__':
    main()