123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224 |
- #!/usr/bin/env python
- # -*- coding: utf8 -*-
- # -----------------------------------------------------------------------------
- # File: metrics.py (as part of project URUMETRICS)
- # Created: 28/07/2022 13:58
- # Last Modified: 28/07/2022 13:58
- # -----------------------------------------------------------------------------
- # Author: William N. Havard
- # Postdoctoral Researcher
- #
- # Mail : william.havard@ens.fr / william.havard@gmail.com
- #
- # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
- #
- # ------------------------------------------------------------------------------
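- # Description:
- #     • Defines the UruguayMetrics pipeline and a command-line entry point that
- #       extracts the metrics of the recordings not yet listed in
- #       extra/metrics/metrics.csv and appends them to that file.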
- # -----------------------------------------------------------------------------
- import logging
- import os
- import sys
- from typing import List, Union
- import numpy as np
- import pandas as pd
- from ChildProject.annotations import AnnotationManager
- from ChildProject.pipelines.metrics import Metrics
- from ChildProject.projects import ChildProject
- from metrics_functions import avg_pr_pm_speaker, avg_sr_pm_speaker, avg_wr_pm_speaker, \
- mean_mean_pitch_speaker, mean_pitch_range_speaker, \
- mlup_speaker, mlus_speaker, mluw_speaker, \
- chi_adu_turn_transitions
# Module-level logger; its level is only configured (logging.basicConfig) when
# this file is run as a script.
logger = logging.getLogger(__name__)
class UruguayMetrics(Metrics):
    """Metrics pipeline assembling the list of metrics used by URUMETRICS.

    On construction the VTC and ALICE annotation sets are merged into the
    ``alice_vtc`` set (requested with ``skip_existing=True``), then the list
    of metrics is built from the annotation sets that are present in the
    annotation index. When an expected set is missing from the index a
    warning is logged and the metrics relying on it are left out.

    :param project: ChildProject instance the metrics are extracted from.
    :param vtc: name of the VTC annotation set.
    :param alice: name of the ALICE annotation set.
    :param vcm: name of the VCM annotation set.
    :param acoustic: name of the acoustic annotation set.
    :param recordings: forwarded unchanged to ``Metrics.__init__``.
    :param from_time: forwarded unchanged to ``Metrics.__init__``.
    :param to_time: forwarded unchanged to ``Metrics.__init__``.
    :param rec_cols: forwarded unchanged to ``Metrics.__init__``.
    :param child_cols: forwarded unchanged to ``Metrics.__init__``.
    :param period: forwarded unchanged to ``Metrics.__init__``.
    :param by: forwarded unchanged to ``Metrics.__init__``.
    :param threads: forwarded unchanged to ``Metrics.__init__``.
    """

    SUBCOMMAND = "uruguay"

    def __init__(
        self,
        project: ChildProject,
        vtc: str = "vtc",
        alice: str = "alice",
        vcm: str = "vcm",
        acoustic: str = "acoustic",
        recordings: Union[str, List[str], pd.DataFrame] = None,
        from_time: str = None,
        to_time: str = None,
        rec_cols: str = None,
        child_cols: str = None,
        period: str = None,
        by: str = "recording_filename",
        threads: int = 1,
    ):
        self.vtc = vtc
        self.alice = alice
        self.vcm = vcm
        self.acoustic = acoustic
        self.alice_vtc = 'alice_vtc'
        self.conversations = 'conversations'

        # Creating merged alice_vtc set. The set names given by the caller are
        # used (not the literal defaults) so that custom names are honoured.
        am = AnnotationManager(project)
        am.read()
        am.merge_sets(
            left_set=self.vtc,
            right_set=self.alice,
            left_columns=["speaker_type"],
            right_columns=["phonemes", "syllables", "words"],
            output_set=self.alice_vtc,
            full_set_merge=False,
            skip_existing=True,
        )

        # (annotation set, ((callable, speaker), ...)) in the order the metrics
        # are listed. A sequence of pairs is used rather than a dict so that two
        # parameters sharing the same set name do not overwrite each other.
        metric_specs = (
            # VTC
            (self.vtc, (
                ("voc_speaker_ph", 'FEM'),
                ("voc_speaker_ph", 'CHI'),
                ("voc_dur_speaker_ph", 'FEM'),
                ("voc_dur_speaker_ph", 'CHI'),
                ("avg_voc_dur_speaker", 'FEM'),
                ("avg_voc_dur_speaker", 'CHI'),
            )),
            # VCM
            (self.vcm, (
                ("cry_voc_speaker_ph", 'CHI'),
                ("cry_voc_dur_speaker_ph", 'CHI'),
                ("avg_cry_voc_dur_speaker", 'CHI'),
                ("can_voc_speaker_ph", 'CHI'),
                ("can_voc_dur_speaker_ph", 'CHI'),
                ("avg_can_voc_dur_speaker", 'CHI'),
                ("non_can_voc_speaker_ph", 'CHI'),
                ("non_can_voc_dur_speaker_ph", 'CHI'),
                ("avg_non_can_voc_dur_speaker", 'CHI'),
                ("lp_n", pd.NA),
                ("lp_dur", pd.NA),
                ("cp_n", pd.NA),
                ("cp_dur", pd.NA),
            )),
            # ALICE+VTC
            (self.alice_vtc, (
                (avg_pr_pm_speaker, 'FEM'),
                (avg_sr_pm_speaker, 'FEM'),
                (avg_wr_pm_speaker, 'FEM'),
                ("wc_speaker_ph", 'FEM'),
                ("sc_speaker_ph", 'FEM'),
                ("pc_speaker_ph", 'FEM'),
                (mluw_speaker, 'FEM'),
                (mlus_speaker, 'FEM'),
                (mlup_speaker, 'FEM'),
            )),
            # Acoustic
            (self.acoustic, (
                (mean_mean_pitch_speaker, 'FEM'),
                (mean_mean_pitch_speaker, 'CHI'),
                (mean_pitch_range_speaker, 'FEM'),
                (mean_pitch_range_speaker, 'CHI'),
            )),
            # Turns
            (self.conversations, (
                (chi_adu_turn_transitions, None),
            )),
        )

        # Sets currently listed in the annotation index (read after the merge).
        available_sets = set(am.annotations["set"])

        # Rows are kept as plain Python lists: strings, callables, None and
        # pd.NA are stored as-is, without going through NumPy dtype promotion.
        rows = []
        for expected_set, set_metrics in metric_specs:
            if expected_set not in available_sets:
                logger.warning("The expected set '%s' was not found in the index.", expected_set)
                continue
            rows.extend([metric, expected_set, speaker] for metric, speaker in set_metrics)

        METRICS = pd.DataFrame(rows, columns=["callable", "set", "speaker"])

        super().__init__(project, METRICS, by=by, recordings=recordings,
                         period=period, from_time=from_time, to_time=to_time,
                         rec_cols=rec_cols, child_cols=child_cols, threads=threads)
def get_metrics(project_path, metrics_file):
    """Extract the metrics of the recordings not yet listed in ``metrics_file``.

    The project is read, the recordings whose ``recording_filename`` is absent
    from the existing metrics file (all of them when the file does not exist)
    are processed with ``UruguayMetrics`` and the result, appended to the
    existing metrics if any, is written back to ``metrics_file``. Nothing is
    written when there is no new recording.

    :param project_path: path to the ChildProject data set.
    :param metrics_file: path of the CSV file the metrics are stored in.
    :return: None
    """
    project = ChildProject(project_path)
    project.read()

    # os.path.dirname() returns '' for a bare file name and os.makedirs('')
    # raises, so the directory is only created when there is one to create.
    metrics_path = os.path.dirname(metrics_file)
    if metrics_path and not os.path.exists(metrics_path):
        # exist_ok guards against the directory appearing between the check
        # and the creation.
        os.makedirs(metrics_path, exist_ok=True)
        logger.info('Created {}'.format(metrics_path))

    project_recordings = set(project.recordings['recording_filename'])
    if os.path.exists(metrics_file):
        existing_metrics = pd.read_csv(metrics_file)
        new_recordings = project_recordings - set(existing_metrics['recording_filename'])
    else:
        existing_metrics = None
        new_recordings = project_recordings

    if not new_recordings:
        return

    # Sorted so that the list handed to the pipeline does not depend on the
    # (per-run) iteration order of a set of strings.
    urumetrics = UruguayMetrics(project=project, recordings=sorted(new_recordings))
    metrics = urumetrics.extract()

    if existing_metrics is not None:
        metrics = pd.concat([existing_metrics, metrics])

    metrics.to_csv(metrics_file, index=False)
def main(project_path):
    """Extract the metrics of the data set located at ``project_path``.

    The metrics are stored in ``<project_path>/extra/metrics/metrics.csv``.

    :param project_path: path to the root of the data set; its ``extra``
        directory must already exist.
    :raises ValueError: if ``<project_path>/extra`` does not exist.
    :return: None
    """
    # Check if running the script from the root of the data set.
    # An explicit raise is used instead of ``assert`` so that the check is not
    # stripped when Python runs with -O, and so that the ValueError is really
    # the exception raised.
    expected_metrics_path = os.path.join(project_path, 'extra')
    if not os.path.exists(expected_metrics_path):
        raise ValueError('Expected metrics ({}) path not found. Are you sure to be running this '
                         'command from the root of the data set?'.format(expected_metrics_path))

    metrics_file = os.path.join(expected_metrics_path, 'metrics', 'metrics.csv')
    get_metrics(project_path, metrics_file)
    logger.info('Saved to {}.'.format(metrics_file))
def _parse_args(argv):
    """Parse the command-line arguments of the script.

    :param argv: list of command-line arguments, program name excluded.
    :return: dict mapping ``project_path`` to the value given with
        ``--project-path`` (``''`` when the option is absent).
    """
    import argparse

    # The description states what this script does (metrics extraction).
    parser = argparse.ArgumentParser(
        description='Extract the URUMETRICS metrics of a ChildProject data set.')
    parser.add_argument('--project-path', required=False, type=str, default='',
                        help="Path to a ChildProject/datalad project (useful for debugging purposes).")

    return vars(parser.parse_args(argv))
# Script entry point: parse the command line, run main() and turn any failure
# into a non-zero exit status.
if __name__ == '__main__':
    import sys

    args = _parse_args(sys.argv[1:])

    logging.basicConfig(level=logging.INFO)

    try:
        main(**args)
    except Exception:
        # Top-level boundary: log the error with its traceback instead of
        # printing only the message, then report the failure to the caller.
        logger.exception('Metrics extraction failed.')
        sys.exit(1)
    else:
        sys.exit(0)
|