|
@@ -6,6 +6,7 @@ import numpy as np
|
|
|
from ChildProject.projects import ChildProject
|
|
|
from ChildProject.annotations import AnnotationManager
|
|
|
from ChildProject.metrics import segments_to_grid, segments_to_annotation
|
|
|
+from ChildProject.pipelines.metrics import AclewMetrics
|
|
|
from ChildProject.utils import TimeInterval
|
|
|
|
|
|
from cmdstanpy import CmdStanModel
|
|
@@ -42,14 +43,15 @@ parser = argparse.ArgumentParser()
|
|
|
parser.add_argument("--corpora", default=["input/bergelson"], nargs="+")
|
|
|
parser.add_argument("--duration", type=int, help="duration in hours", default=8)
|
|
|
parser.add_argument("--run")
|
|
|
-parser.add_argument("--chains", type=int, default=4)
|
|
|
-parser.add_argument("--warmup", type=int, default=500)
|
|
|
-parser.add_argument("--samples", type=int, default=2500)
|
|
|
+parser.add_argument("--chains", type=int, default=1)
|
|
|
+parser.add_argument("--warmup", type=int, default=250)
|
|
|
+parser.add_argument("--samples", type=int, default=1000)
|
|
|
parser.add_argument("--threads-per-chain", type=int, default=4)
|
|
|
-parser.add_argument("--models", nargs='+')
|
|
|
+parser.add_argument("--models", default=["analysis"], nargs='+')
|
|
|
+parser.add_argument("--validation", type=float, default=0)
|
|
|
args = parser.parse_args()
|
|
|
|
|
|
-
|
|
|
+speakers = ["CHI", "OCH", "FEM", "MAL"]
|
|
|
|
|
|
def extrude(self, removed, mode: str = "intersection"):
|
|
|
if isinstance(removed, Segment):
|
|
@@ -68,7 +70,6 @@ def extrude(self, removed, mode: str = "intersection"):
|
|
|
def compute_counts(parameters):
|
|
|
corpus = parameters["corpus"]
|
|
|
annotator = parameters["annotator"]
|
|
|
- speakers = ["CHI", "OCH", "FEM", "MAL"]
|
|
|
|
|
|
project = ChildProject(parameters["path"])
|
|
|
am = AnnotationManager(project)
|
|
@@ -177,10 +178,44 @@ def compute_counts(parameters):
|
|
|
corpus=corpus,
|
|
|
)
|
|
|
|
|
|
+def rates(parameters):
|
|
|
+ corpus = parameters["corpus"]
|
|
|
+ annotator = parameters["annotator"]
|
|
|
+ speakers = ["CHI", "OCH", "FEM", "MAL"]
|
|
|
+
|
|
|
+ project = ChildProject(parameters["path"])
|
|
|
+ am = AnnotationManager(project)
|
|
|
+ am.read()
|
|
|
+
|
|
|
+ pipeline = AclewMetrics(
|
|
|
+ project,
|
|
|
+ vtc=annotator,
|
|
|
+ alice=None,
|
|
|
+ vcm=None,
|
|
|
+ from_time="10:00:00",
|
|
|
+ to_time="18:00:00",
|
|
|
+ by="child_id",
|
|
|
+ )
|
|
|
+ metrics = pipeline.extract()
|
|
|
+ metrics = pd.DataFrame(metrics).assign(corpus=corpus,annotator=annotator)
|
|
|
+ metrics["duration"] = metrics[f"duration_{annotator}"]/1000/3600
|
|
|
+ metrics = metrics[metrics["duration"] > 0.01]
|
|
|
+
|
|
|
+ speakers = ['CHI', 'OCH', 'FEM', 'MAL']
|
|
|
+
|
|
|
+ # metrics.dropna(subset={f"voc_{speaker.lower()}_ph" for speaker in speakers}&set(metrics.columns), inplace=True)
|
|
|
+
|
|
|
+ for i, speaker in enumerate(speakers):
|
|
|
+ # if f"voc_{speaker.lower()}_ph" not in metrics.columns:
|
|
|
+ # metrics[f"speech_rate_{i}"] = pd.NA
|
|
|
+ # else:
|
|
|
+ metrics[f"speech_rate_{i}"] = (metrics[f"voc_{speaker.lower()}_ph"]*(metrics["duration"])).fillna(0).astype(int)
|
|
|
+
|
|
|
+ return metrics
|
|
|
|
|
|
def run_model(data, run, model_name):
|
|
|
model = CmdStanModel(
|
|
|
- stan_file=f"analyses/aggregates_{model_name}.stan",
|
|
|
+ stan_file=f"code/models/{model_name}.stan",
|
|
|
cpp_options={"STAN_THREADS": "TRUE"},
|
|
|
compile="force",
|
|
|
)
|
|
@@ -248,6 +283,17 @@ def compile_recordings(corpus):
|
|
|
|
|
|
duration = args.duration * 3600 * 1000
|
|
|
|
|
|
+ _annotations["path"] = _annotations.apply(
|
|
|
+ lambda r: opj(
|
|
|
+ project.path, "annotations", r["set"], "converted", r["annotation_filename"]
|
|
|
+ ),
|
|
|
+ axis=1,
|
|
|
+ )
|
|
|
+
|
|
|
+ missing_annotations = _annotations[~_annotations["path"].map(exists)]
|
|
|
+ if len(missing_annotations):
|
|
|
+ datalad.api.get(list(missing_annotations["path"].unique()))
|
|
|
+
|
|
|
segments = am.get_segments(_annotations)
|
|
|
segments["segment_onset"] -= segments["segment_onset"].min()
|
|
|
segments = segments[segments["segment_onset"] >= 0]
|
|
@@ -257,23 +303,17 @@ def compile_recordings(corpus):
|
|
|
continue
|
|
|
|
|
|
segments = segments[segments["speaker_type"].isin(["CHI", "OCH", "FEM", "MAL"])]
|
|
|
- # segments["segment_onset"] += segments["session_offset"]
|
|
|
-
|
|
|
- adu_onsets = segments[segments["speaker_type"].isin(["FEM", "MAL"])][
|
|
|
- "segment_onset"
|
|
|
- ].values
|
|
|
- chi_onsets = segments[segments["speaker_type"] == "CHI"]["segment_onset"].values
|
|
|
-
|
|
|
- recs.append(
|
|
|
- {
|
|
|
- "recording": recording_filename,
|
|
|
- "adu": len(adu_onsets),
|
|
|
- "chi": len(chi_onsets),
|
|
|
- "age": age,
|
|
|
- "children": f"{corpus}_{child_id}",
|
|
|
- "corpus": project.children["experiment"].iloc[0]
|
|
|
- }
|
|
|
- )
|
|
|
+
|
|
|
+ rec = {
|
|
|
+ f"vtc_{i}": len(segments[segments["speaker_type"] == speaker])
|
|
|
+ for i, speaker in enumerate(speakers)
|
|
|
+ }
|
|
|
+ rec["recording"] = recording_filename
|
|
|
+ rec["children"] = f"{corpus}_{child_id}"
|
|
|
+ rec["corpus"] = basename(corpus)
|
|
|
+ rec["age"] = age
|
|
|
+
|
|
|
+ recs.append(rec)
|
|
|
|
|
|
recs = pd.DataFrame(recs)
|
|
|
return recs
|
|
@@ -285,7 +325,7 @@ if __name__ == "__main__":
|
|
|
annotators = pd.read_csv("input/annotators.csv")
|
|
|
annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))
|
|
|
|
|
|
- with mp.Pool(processes=8) as pool:
|
|
|
+ with mp.Pool(processes=args.chains*args.threads_per_chain) as pool:
|
|
|
data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))
|
|
|
|
|
|
data = data.sample(frac=1)
|
|
@@ -298,27 +338,17 @@ if __name__ == "__main__":
|
|
|
|
|
|
# speech rates at the child level
|
|
|
annotators = annotators[~annotators['annotator'].str.startswith('eaf_2021')]
|
|
|
- with mp.Pool(processes=8) as pool:
|
|
|
+ with mp.Pool(processes=args.chains*args.threads_per_chain) as pool:
|
|
|
speech_rates = pd.concat(pool.map(rates, annotators.to_dict(orient="records")))
|
|
|
|
|
|
speech_rates.reset_index(inplace=True)
|
|
|
speech_rates = speech_rates.groupby(["corpus", "child_id"]).sample(1)
|
|
|
speech_rate_matrix = np.transpose([speech_rates[f"speech_rate_{i}"].values for i in range(4)])
|
|
|
- voc_duration_matrix = np.transpose([speech_rates[f"voc_duration_{i}"].values for i in range(4)])
|
|
|
|
|
|
speech_rates.to_csv("rates.csv")
|
|
|
|
|
|
-
|
|
|
print(vtc.shape)
|
|
|
|
|
|
- fixed_rates = pd.read_csv("output/speech_dist.csv")
|
|
|
-
|
|
|
- training_set = data.groupby("corpus").agg(
|
|
|
- duration=("duration", "sum"), children=("child", lambda x: x.nunique())
|
|
|
- )
|
|
|
- training_set["duration"] /= 3600
|
|
|
- training_set.to_csv("output/training_set.csv")
|
|
|
-
|
|
|
data["corpus"] = data["corpus"].astype("category")
|
|
|
corpora = data["corpus"].cat.codes.values
|
|
|
corpora_codes = dict(enumerate(data["corpus"].cat.categories))
|
|
@@ -330,54 +360,34 @@ if __name__ == "__main__":
|
|
|
"n_groups": data["child"].nunique(),
|
|
|
"n_corpora": data["corpus"].nunique(),
|
|
|
"n_validation": max(1, int(truth.shape[0] * args.validation)),
|
|
|
- "n_sim": args.simulated_children,
|
|
|
"group": 1 + data["child"].astype("category").cat.codes.values,
|
|
|
- "corpus": 1 + corpora,
|
|
|
- "selected_corpus": (
|
|
|
- 1 + corpora_codes[args.apply_bias_from]
|
|
|
- if args.apply_bias_from in corpora_codes
|
|
|
- else 0
|
|
|
- ),
|
|
|
+ "conf_corpus": 1 + corpora,
|
|
|
"truth": truth.astype(int),
|
|
|
"vtc": vtc.astype(int),
|
|
|
+ "speech_rates": speech_rate_matrix.astype(int),
|
|
|
"group_corpus": 1+speech_rates["corpus"].map(corpora_codes).astype(int).values,
|
|
|
- "duration": speech_rates["duration"].values,
|
|
|
+ "durations": speech_rates["duration"].values,
|
|
|
"n_rates": len(speech_rates)
|
|
|
}
|
|
|
|
|
|
n_recs = len(recs)
|
|
|
+
|
|
|
+ children_corpus = recs.groupby("children").agg(corpus=("corpus", "first")).sort_index()
|
|
|
+ children_corpus = 1+children_corpus.corpus.map(corpora_codes).astype(int).values
|
|
|
+
|
|
|
analysis_data = {
|
|
|
"n_recs": n_recs,
|
|
|
"n_children": len(recs["children"].unique()),
|
|
|
"children": recs["children"],
|
|
|
- "adu": recs["adu"],
|
|
|
- "chi": recs["chi"],
|
|
|
+ "vocs": np.transpose([recs[f"vtc_{i}"].values for i in range(4)]),
|
|
|
"age": recs["age"],
|
|
|
- "corpus": recs["corpus"]
|
|
|
+ "corpus": children_corpus,
|
|
|
+ "duration": args.duration,
|
|
|
}
|
|
|
|
|
|
-data = analysis_data.update(confusion_data)
|
|
|
-
|
|
|
-output = {}
|
|
|
-
|
|
|
-for model_name in args.models:
|
|
|
- output[model_name] = run_model(data, args.run, model_name)
|
|
|
-
|
|
|
-for model in output:
|
|
|
- print(model)
|
|
|
- print("total_evidence:", output[model]["total_evidence"])
|
|
|
- print("evidence:", output[model]["evidence"])
|
|
|
- print("priors:", output[model]["priors"])
|
|
|
-
|
|
|
- samples = output[model]["samples"]
|
|
|
-
|
|
|
- if "beta_a" in samples:
|
|
|
- plt.clf()
|
|
|
- plt.hist(samples["beta_a"], bins=np.linspace(-0.5,0.5,100), histtype="step")
|
|
|
- plt.savefig(f"output/{args.run}_{model}_beta_a.png", bbox_inches="tight")
|
|
|
+ data = {**analysis_data, **confusion_data}
|
|
|
|
|
|
- if "lambda" in samples:
|
|
|
- plt.clf()
|
|
|
- plt.hist(samples["lambda"], bins=np.linspace(-0.5,0.5,100), histtype="step")
|
|
|
- plt.savefig(f"output/{args.run}_{model}_lambda.png", bbox_inches="tight")
|
|
|
+ output = {}
|
|
|
|
|
|
+ for model_name in args.models:
|
|
|
+ output[model_name] = run_model(data, args.run, model_name)
|