prevalidate_wav_cha.py 2.8 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. #!/usr/bin/env python
  2. # -*- coding: utf8 -*-
  3. #
  4. # Author: William N. Havard
  5. #
  6. import os
  7. import pandas as pd
  8. def _raw_filename(path):
  9. return os.path.splitext(os.path.basename(path))[0]
  10. def _list_duplicated(list_):
  11. return list(set([x for x in list_ if list_.count(x) > 1]))
  12. def walk_dir(path, ext='', return_full_path=True):
  13. files = []
  14. for p, d, f in os.walk(path):
  15. for ff in f:
  16. if ff.endswith(ext):
  17. path_suffix = p.replace(os.path.commonprefix([path, p]), '')
  18. if return_full_path:
  19. files.append(os.path.join(path, path_suffix, ff))
  20. else:
  21. files.append(os.path.join(path_suffix, ff))
  22. return sorted(files)
  23. def main():
  24. RECORDINGS = './recordings/raw'
  25. ANNOTATIONS_CHA = './annotations/cha/raw'
  26. METADATA = './metadata'
  27. assert all(map(os.path.exists, [RECORDINGS, ANNOTATIONS_CHA, METADATA])), IOError("Run me from the project's root!")
  28. wav_files = list(map(_raw_filename, walk_dir(RECORDINGS, ext='.wav', return_full_path=False)))
  29. cha_files = list(map(_raw_filename, walk_dir(ANNOTATIONS_CHA, ext='.cha', return_full_path=False)))
  30. metadata = pd.read_csv(os.path.join(METADATA, 'recordings.csv'))
  31. metadata['recording_filename_raw'] = metadata['recording_filename'].apply(_raw_filename)
  32. # Check there are no duplicate WAV or CHA files
  33. assert not len(_list_duplicated(wav_files)), \
  34. ValueError('Duplicate WAV files! ({})'.format(_list_duplicated(wav_files)))
  35. assert not len(_list_duplicated(cha_files)), \
  36. ValueError('Duplicate CHA files! ({})'.format(_list_duplicated(cha_files)))
  37. # Check that each WAV files is mapped to a CHA file and vice-versa
  38. assert set(wav_files).issubset(set(cha_files)), \
  39. ValueError('Missing CHA files for some WAV files!\n{}'.
  40. format('\n\t- '.join(sorted(set(wav_files)-set(cha_files)))))
  41. assert set(cha_files).issubset(set(wav_files)), \
  42. ValueError('Missing WAV files for some CHA files!\n{}'.
  43. format('\n\t- '.join(sorted(set(cha_files)-set(wav_files)))))
  44. # Check that each WAV files appears in the metadata and vice-versa
  45. assert set(metadata['recording_filename_raw']).issubset(set(wav_files)), \
  46. ValueError('Missing WAV files that are mentioned in the metadata!\n{}'.
  47. format('\n\t- '.join(sorted(set(wav_files)-set(metadata['recording_filename_raw'])))))
  48. assert set(wav_files).issubset(set(metadata['recording_filename_raw'])), \
  49. ValueError('Missing `recording_filename_raw` in metadata corresponding to WAV files found in ./recordings!\n{}'.
  50. format('\n\t- '.join(sorted(set(metadata['recording_filename_raw'])-set(wav_files)))))
  51. print('Everything is looking good. Congrats!')
  52. if __name__ == '__main__':
  53. main()