1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465 |
- #!/usr/bin/env python
- # -*- coding: utf8 -*-
- #
- # Author: William N. Havard
- #
- import os
- import pandas as pd
- def _raw_filename(path):
- return os.path.splitext(os.path.basename(path))[0]
- def _list_duplicated(list_):
- return list(set([x for x in list_ if list_.count(x) > 1]))
def walk_dir(path, ext='', return_full_path=True):
    """Recursively list the files under *path* whose names end with *ext*.

    Args:
        path: Directory to walk.
        ext: Suffix a file name must end with to be kept. The default ``''``
            matches every file.
        return_full_path: When true, each entry is the walked directory joined
            with the file name (so it starts with *path*); otherwise entries
            are expressed relative to *path*.

    Returns:
        The matching file paths as a sorted list.
    """
    files = []
    for current_dir, _subdirs, file_names in os.walk(path):
        for file_name in file_names:
            if not file_name.endswith(ext):
                continue
            # ``current_dir`` already starts with ``path``. Joining it directly
            # avoids the old string-replace trick, whose result began with a
            # separator for sub-directories and made os.path.join discard
            # ``path`` altogether (yielding e.g. '/sub/file.wav').
            full_path = os.path.join(current_dir, file_name)
            if return_full_path:
                files.append(full_path)
            else:
                files.append(os.path.relpath(full_path, path))
    return sorted(files)
def _require_subset(subset, superset, message):
    """Raise ``ValueError`` listing the items of *subset* absent from *superset*."""
    missing = sorted(set(subset) - set(superset))
    if missing:
        # One bullet per missing item, including the first one.
        raise ValueError(message + ''.join('\n\t- {}'.format(item) for item in missing))


def main():
    """Check that recordings, CHA annotations and metadata agree with each other.

    Compares the extension-less file names of the ``.wav`` files found under
    ``./recordings/raw``, the ``.cha`` files found under
    ``./annotations/cha/raw`` and the ``recording_filename`` column of
    ``./metadata/recordings.csv``. Prints a confirmation when every check
    passes.

    Raises:
        IOError: If one of the expected directories does not exist, i.e. the
            script is not run from the project's root.
        ValueError: If WAV or CHA file names are duplicated, or if the WAV
            files, CHA files and metadata entries do not match one-to-one.
    """
    RECORDINGS = './recordings/raw'
    ANNOTATIONS_CHA = './annotations/cha/raw'
    METADATA = './metadata'

    # Explicit raises instead of ``assert``: assertions are removed under
    # ``python -O`` and would surface as AssertionError rather than the
    # intended exception type.
    if not all(map(os.path.exists, [RECORDINGS, ANNOTATIONS_CHA, METADATA])):
        raise IOError("Run me from the project's root!")

    wav_files = list(map(_raw_filename, walk_dir(RECORDINGS, ext='.wav', return_full_path=False)))
    cha_files = list(map(_raw_filename, walk_dir(ANNOTATIONS_CHA, ext='.cha', return_full_path=False)))

    metadata = pd.read_csv(os.path.join(METADATA, 'recordings.csv'))
    metadata['recording_filename_raw'] = metadata['recording_filename'].apply(_raw_filename)
    metadata_files = set(metadata['recording_filename_raw'])

    # Check there are no duplicate WAV or CHA files
    for label, names in (('WAV', wav_files), ('CHA', cha_files)):
        duplicated = _list_duplicated(names)
        if duplicated:
            raise ValueError('Duplicate {} files! ({})'.format(label, duplicated))

    # Check that each WAV file is mapped to a CHA file and vice-versa
    _require_subset(wav_files, cha_files,
                    'Missing CHA files for some WAV files!')
    _require_subset(cha_files, wav_files,
                    'Missing WAV files for some CHA files!')

    # Check that each WAV file appears in the metadata and vice-versa.
    # The reported items are the ones that actually violate the check
    # (the original messages listed the opposite set difference).
    _require_subset(metadata_files, wav_files,
                    'Missing WAV files that are mentioned in the metadata!')
    _require_subset(wav_files, metadata_files,
                    'Missing `recording_filename_raw` in metadata corresponding to WAV files found in ./recordings!')

    print('Everything is looking good. Congrats!')
# Run the consistency checks only when executed as a script, not on import.
if __name__ == '__main__':
    main()
|