# utils_annotations.py
#!/usr/bin/env python
# -*- coding: utf8 -*-
# -----------------------------------------------------------------------------
# File: utils_annotations.py (as part of project URUMETRICS)
# Created: 01/06/2022 17:15
# Last Modified: 01/06/2022 17:15
# -----------------------------------------------------------------------------
# Author: William N. Havard
# Postdoctoral Researcher
#
# Mail : william.havard@ens.fr / william.havard@gmail.com
#
# Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
#
# ------------------------------------------------------------------------------
# Description:
# •
# -----------------------------------------------------------------------------
  19. import librosa
  20. import numpy as np
  21. import pandas as pd
  22. import utils_audio
  23. from math import ceil, floor
  24. from utils_audio import get_audio_slice, read_audio
  25. def _compute_file_acoustic_annotation(audio_path, audio_segments, target_sr):
  26. """
  27. Compute the acoustic annotations for the segments audio_segment of the file audio_path with sr target_sr
  28. :param audio_path: path to the audio file to be read
  29. :type audio_path: str
  30. :param audio_segments: dataframe of segments for which we want to compute annotations
  31. :type audio_segments: pd.DataFrame
  32. :param target_sr: target sampling rate of the recording
  33. :type target_sr: int
  34. :return: annotations
  35. :rtype: pd.DataFrame
  36. """
  37. audio_time_series, sampling_rate = read_audio(audio_path, target_sr=target_sr)
  38. # Computes the start frame and end frame of the given segments given is on/offset in seconds
  39. audio_segments['frame_onset'] = audio_segments['segment_onset'].apply(
  40. lambda onset: floor(onset / 1000 * sampling_rate))
  41. audio_segments['frame_offset'] = audio_segments['segment_offset'].apply(
  42. lambda offset: ceil(offset / 1000 * sampling_rate))
  43. # Find better solution if more acoustic annotations are added in the future (concat dfs)
  44. annotations = _annotation_pitch(audio_segments, audio_time_series, target_sr)
  45. annotations.drop(columns=['frame_onset',
  46. 'frame_offset'],
  47. inplace=True)
  48. return annotations
  49. def _annotation_pitch(audio_segments, audio_time_series, sampling_rate):
  50. """
  51. Extract pitch related information for the audio segments audio_segments of the time series audio_time_series
  52. with sr sampling_rate
  53. :param audio_segments: dataframe of segments for which we want to compute annotations
  54. :type audio_segments: pd.DataFrame
  55. :param audio_time_series: audio time series
  56. :type audio_time_series: np.matrix
  57. :param sampling_rate: sampling rate
  58. :type sampling_rate: int
  59. :return: pitch annotations
  60. :rtype: pd.DataFrame
  61. """
  62. pitch = pd.DataFrame.from_records(audio_segments.apply(lambda row:
  63. get_pitch(
  64. get_audio_slice(
  65. audio_time_series, row['frame_onset'],
  66. row['frame_offset']
  67. ),
  68. sampling_rate, func=utils_audio.f2st
  69. ), axis=1).tolist())
  70. # Drop raw pitch values
  71. pitch.drop(list(pitch.filter(regex='raw_')), axis=1, inplace=True)
  72. pitch.index = audio_segments.index
  73. audio_segments = pd.concat([audio_segments, pitch], axis=1)
  74. return audio_segments
  75. def get_pitch(audio_time_series, sampling_rate, func=None):
  76. """
  77. Returns pitch-related annotations.
  78. Regarding pitch range, we use the 5-th percentile as the bottom of the range, and the 95-th percentile as the top.
  79. (see https://www.ibm.com/docs/en/wvs/6.1.1?topic=guide-introduction-pitch-its-use-ssml or
  80. https://languagelog.ldc.upenn.edu/nll/?p=40788 who also use the same methodology)
  81. :param audio_time_series: real-valued vector
  82. :type audio_time_series: np.array
  83. :param sampling_rate: sampling rate
  84. :type sampling_rate: int
  85. :param func: transformation function to apply to the fundamental frequency
  86. :type func: callable
  87. :return: raw pitch, mean pitch, median pitch, 5-th percentile, 95-th percentile, pitch range
  88. :rtype: dict
  89. """
  90. f0 = librosa.yin(audio_time_series,
  91. fmin=60,
  92. fmax=500,
  93. sr=sampling_rate) # pyin does not work, why?
  94. pitch = func(f0) if callable(func) else f0
  95. mean_pitch, median_pitch, p5_pitch, p95_pitch = pitch.mean(), np.quantile(pitch, .5), \
  96. np.percentile(pitch, 5), np.percentile(pitch, 95)
  97. pitch_type = "f0" if not callable(func) else func.__name__
  98. return {"raw_pitch_{}".format(pitch_type): f0,
  99. "mean_pitch_{}".format(pitch_type): mean_pitch,
  100. "median_pitch_{}".format(pitch_type): median_pitch,
  101. "p5_pitch_{}".format(pitch_type): p5_pitch,
  102. "p95_pitch_{}".format(pitch_type): p95_pitch,
  103. "pitch_range_{}".format(pitch_type): p95_pitch - p5_pitch}