123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184 |
- #!/usr/bin/env python3
- # Original file: https://gin.g-node.org/EL1000/metrics
- import argparse
- import datetime
- from functools import reduce
- import os
- import warnings
- import datalad.api
- import pandas as pd
- from ChildProject.projects import ChildProject
- from ChildProject.annotations import AnnotationManager
- from ChildProject.pipelines.metrics import LenaMetrics, AclewMetrics
def date_is_valid(date, fmt):
    """Return True if *date* parses as a date under format *fmt*, else False.

    Parameters
    ----------
    date : str
        Candidate date string (e.g. "2020-01-31").
    fmt : str
        ``strptime`` format (e.g. "%Y-%m-%d").

    Catches only the exceptions ``strptime`` raises for bad input —
    ValueError for malformed strings and TypeError for non-string values
    (e.g. NaN from pandas) — instead of a bare ``except:`` that would
    also swallow KeyboardInterrupt/SystemExit.
    """
    try:
        datetime.datetime.strptime(date, fmt)
    except (ValueError, TypeError):
        return False
    return True
def compute_metrics(args):
    """Compute and export metrics for one or several experiments.

    Parameters
    ----------
    args : argparse.Namespace
        CLI arguments: ``pipeline`` (one of "aclew", "lena", "children";
        "period" is offered by the CLI but falls through to ValueError
        here), ``experiments`` (dataset names under DATASETS/; empty list
        means every datalad subdataset), ``threads``, and ``output``
        (destination CSV path).

    Raises
    ------
    ValueError
        If ``args.pipeline`` is not a recognized pipeline name.

    Side effect: writes the concatenated results as CSV to ``args.output``.
    """
    # An explicit --experiments list wins; otherwise enumerate every
    # datalad subdataset of DATASETS/ and use its basename as the name.
    if len(args.experiments):
        experiments = args.experiments
    else:
        datasets = datalad.api.subdatasets(path='DATASETS/')
        experiments = [os.path.basename(dataset["path"]) for dataset in datasets]
    print(
        "pipeline '\033[1m{}\033[0m' will run on experiments '\033[1m{}\033[0m'".format(
            args.pipeline, ",".join(experiments)
        )
    )
    data = []      # one DataFrame per experiment, concatenated at the end
    columns = []   # per-experiment children column sets ("children" pipeline only)
    for experiment in experiments:
        project = ChildProject(os.path.join("DATASETS", experiment), enforce_dtypes=True)
        am = AnnotationManager(project)
        if args.pipeline == "aclew":
            # ACLEW metrics require a VTC annotation set; skip datasets without one.
            if "vtc" not in am.annotations["set"].tolist():
                print(f"skipping {experiment} (no VTC annotation)")
                continue
            metrics = AclewMetrics(
                project,
                vtc="vtc",
                alice="alice",
                vcm="vcm",
                by="session_id",
                threads=args.threads,
            ).extract()
        elif args.pipeline == "lena":
            metrics = LenaMetrics(
                project, set="its", types=["OLN"], by="session_id", threads=args.threads
            ).extract()
        elif args.pipeline == "children":
            # "children" exports per-child metadata rather than metrics;
            # the common-column reduction happens after the loop.
            data.append(project.children.assign(experiment=experiment))
            columns.append(project.children.columns)
            continue
        else:
            raise ValueError("undefined pipeline '{}'".format(args.pipeline))
        metrics = metrics.assign(experiment=experiment)
        if not len(metrics):
            print(
                "warning: experiment '{}' did not return any metrics for pipeline '{}'".format(
                    experiment, args.pipeline
                )
            )
            continue
        # compute ages: left-join recording dates and child birth dates, then
        # derive an age in truncated months; invalid dates yield "NA".
        metrics = metrics.merge(
            project.recordings[["session_id", "date_iso"]].drop_duplicates(
                "session_id", keep="first"
            ),
            how="left",
            left_on="session_id",
            right_on="session_id",
        )
        metrics = metrics.merge(
            project.children[["child_id", "child_dob"]],
            how="left",
            left_on="child_id",
            right_on="child_id",
        )
        metrics["age"] = (
            metrics[["date_iso", "child_dob"]]
            .apply(
                lambda r: (
                    datetime.datetime.strptime(r["date_iso"], "%Y-%m-%d")
                    - datetime.datetime.strptime(r["child_dob"], "%Y-%m-%d")
                )
                if (
                    date_is_valid(r["child_dob"], "%Y-%m-%d")
                    and date_is_valid(r["date_iso"], "%Y-%m-%d")
                )
                else None,
                axis=1,
            )
            # 365.25 / 12 ≈ average days per month. NOTE(review): a zero-day
            # timedelta is falsy, so a same-day recording also maps to None/"NA"
            # here — confirm that is intended.
            .apply(lambda dt: dt.days / (365.25 / 12) if dt else None)
            .apply(lambda a: int(a) if not pd.isnull(a) else "NA")
        )
        recordings = project.recordings
        # Datasets without a session_offset column are treated as
        # single-part sessions starting at offset 0.
        if "session_offset" not in recordings.columns:
            recordings = recordings.assign(session_offset=0)
        # compute missing audio: expected span of the session (offset of the
        # last part + that part's duration) minus the audio actually present.
        metrics = metrics.merge(
            recordings[["session_id", "session_offset", "duration"]]
            .sort_values("session_offset")
            .groupby("session_id")
            .agg(
                last_offset=("session_offset", lambda x: x.iloc[-1]),
                last_duration=("duration", lambda x: x.iloc[-1]),
                total=("duration", "sum"),
            )
            .reset_index(),
            how="left",
            left_on="session_id",
            right_on="session_id",
        )
        metrics["missing_audio"] = (
            metrics["last_offset"] + metrics["last_duration"] - metrics["total"]
        )
        metrics.drop(columns=["last_offset", "last_duration", "total"], inplace=True)
        data.append(metrics)
    if args.pipeline != "children":
        pd.concat(data).set_index(["experiment", "session_id", "child_id"]).to_csv(
            args.output
        )
    else:
        # Keep only the columns present in every experiment's children
        # table, always including "normative" and "ses".
        data = pd.concat(data)
        columns = reduce(lambda x, y: x & set(y), columns, columns[0]) | {
            "normative",
            "ses",
        }
        data = data[columns]
        data.set_index("child_id").to_csv(args.output)
def main(args):
    """Entry point: run the metrics computation with the parsed CLI args."""
    compute_metrics(args)
- def _parse_args(argv):
- warnings.filterwarnings("ignore")
- parser = argparse.ArgumentParser(description="compute metrics")
- parser.add_argument(
- "pipeline", help="pipeline to run", choices=["aclew", "lena", "children", "period"]
- )
- parser.add_argument("output", help="output file")
- parser.add_argument("--experiments", nargs="+", default=[])
- parser.add_argument("--threads", default=0, type=int)
- parser.add_argument("--period", default=None, type=str)
- args = parser.parse_args(argv)
- return args
if __name__ == '__main__':
    import sys

    # Bug fix: argparse returns a Namespace, which is not a mapping and
    # cannot be unpacked with `**` — `main(**args)` raised TypeError.
    # `main` takes the namespace itself as its single argument.
    args = _parse_args(sys.argv[1:])
    main(args)
|