# Plot child-level correlations between speaker types for VTC and LENA (raw vs. calibrated).
import argparse
import pickle
from email.errors import MalformedHeaderDefect
from logging import logMultiprocessing
from os.path import join as opj

import matplotlib
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
# Select the PGF backend and have all figure text typeset by LaTeX
# (pdflatex) in a serif Times New Roman face. "pgf.rcfonts": False stops
# matplotlib from injecting its own font setup into the LaTeX preamble.
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "pdflatex",
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,
        "pgf.rcfonts": False,
    }
)
# Command-line interface: two optional string flags, parsed at import time.
# NOTE(review): nothing in the visible part of this script reads ``args.vtc``
# or ``args.lena`` -- confirm whether these options are still needed.
parser = argparse.ArgumentParser()
parser.add_argument("--vtc")
parser.add_argument("--lena")
args = parser.parse_args()
# Speaker-type labels, indexed by position when building plot annotations.
speakers = ["CHI", "OCH", "FEM", "MAL"]

# Colour palette used for the plots (index 0 is used for VTC, 1 for LENA).
cb_colors = [
    "#377eb8",
    "#ff7f00",
    "#f781bf",
    "#4daf4a",
    "#a65628",
    "#984ea3",
    "#999999",
    "#e41a1c",
    "#dede00",
]

# Corpus name -> integer code.
corpora = {
    "bergelson": 0,
    "cougar": 1,
    "fausey-trio": 2,
    "lucid": 3,
    "warlaumont": 4,
    "winnipeg": 5,
}

# Inverse lookup: integer code -> corpus name.
corpora_names = {code: name for name, code in corpora.items()}
# Keys of the four fitted models compared in every panel.
_MODELS = ("vtc_raw", "lena_raw", "vtc_calibrated", "lena_calibrated")


def _sample_correlations(mu, beta_a, beta_b, mask, no_sibs, a, b):
    """Return one Pearson r per sample between speaker columns a-1 and b-1.

    For each index ``k`` along the first axis of ``mu``, the two child-level
    vectors ``mu[k, mask, a-1]`` and ``mu[k, mask, b-1]`` are multiplied by
    ``exp(no_sibs * beta[k])`` before being correlated.
    """
    return [
        np.corrcoef(
            mu[k, mask, a - 1] * np.exp(no_sibs * beta_a[k]),
            mu[k, mask, b - 1] * np.exp(no_sibs * beta_b[k]),
        )[0, 1]
        for k in range(mu.shape[0])
    ]


def _right_text(ax, y, content, color="black"):
    """Write right-aligned text at axes-fraction position (0.95, y)."""
    ax.text(1 - 0.05, y, content, ha="right", transform=ax.transAxes, color=color)


def correlations(data, samples, output_dir="output"):
    """Plot the distribution of between-speaker correlations for VTC and LENA.

    A 2x2 grid is created; the top-left panel is switched off and the three
    remaining panels show the speaker pairs (FEM, MAL), (OCH, FEM) and
    (OCH, MAL). Each panel contains:

    * step histograms of the per-sample correlation for the calibrated VTC
      and LENA models,
    * a dashed line (mean) and shaded band (2.5%-97.5% quantiles) for the
      uncalibrated VTC and LENA models,
    * text annotations giving the mean r and the 95% interval.

    The figure is written to ``<output_dir>/correlations_child_level_all``
    as ``.eps``, ``.png`` (dpi=720) and ``.pdf``.

    Args:
        data: Mapping whose ``"vtc_raw"`` entry provides ``"n_children"``,
            ``"n_recs"`` and the positionally indexable (``.iloc``) columns
            ``"children"`` (1-based child ids) and ``"siblings"``.
        samples: Mapping with one entry per model in ``_MODELS``; each entry
            provides ``"mu_child_level"`` (indexed ``[sample, child,
            speaker]``), ``"beta_sib_och"`` and ``"beta_sib_adu"`` (indexed
            by sample).
        output_dir: Directory receiving the figure files. Defaults to
            ``"output"``, the location that was previously hard-coded.
    """
    mu = {x: samples[x]["mu_child_level"] for x in _MODELS}
    beta_och = {x: samples[x]["beta_sib_och"] for x in _MODELS}
    beta_adu = {x: samples[x]["beta_sib_adu"] for x in _MODELS}

    # Per-child sibling count, taken from the recording-level VTC table.
    # Children that never appear in a recording keep the initial value 0.
    reference = data["vtc_raw"]
    children = reference["children"]
    siblings = reference["siblings"]
    sibs = np.zeros(reference["n_children"])
    for k in range(reference["n_recs"]):
        sibs[children.iloc[k] - 1] = siblings.iloc[k]

    # NOTE(review): presumably negative counts encode "unknown" -- confirm.
    has_sibs_data = sibs >= 0
    # Indicator of "no siblings" among the retained children; it does not
    # depend on the panel or the sample, so compute it once.
    no_sibs = sibs[has_sibs_data] == 0

    # Histogram bins are identical for every panel.
    bins = np.linspace(-1, 1, 100)

    # (display name, key prefix, colour,
    #  y positions of: header, calibrated r, 95% CI, uncalibrated r)
    algos = (
        ("VTC", "vtc", cb_colors[0], (0.95, 0.9, 0.85, 0.8)),
        ("LENA", "lena", cb_colors[1], (0.7, 0.65, 0.6, 0.55)),
    )

    fig, axes = plt.subplots(
        nrows=2, ncols=2, sharex=True, sharey=True, figsize=([6.4, 6.4])
    )

    for i in range(2):
        for j in range(2):
            ax = axes[i, j]

            # Only three distinct speaker pairs exist; blank the spare panel.
            if i < 1 - j:
                ax.axis("off")
                continue

            # Indices into ``speakers``; a-1 / b-1 index the last axis of mu.
            a = 1 - i + 1
            b = j + 2

            mu_r = {}
            low = {}
            high = {}
            r = {}
            for x in _MODELS:
                # Index 1 is OCH and uses its own sibling coefficient; the
                # other speakers use the adult coefficient divided by 10.
                # NOTE(review): the /10 presumably mirrors a scaling in the
                # fitted model -- confirm.
                beta_a = beta_och[x] if a == 1 else beta_adu[x] / 10
                beta_b = beta_och[x] if b == 1 else beta_adu[x] / 10

                mu_r[x] = _sample_correlations(
                    mu[x], beta_a, beta_b, has_sibs_data, no_sibs, a, b
                )
                low[x] = np.quantile(mu_r[x], q=0.05 / 2)
                high[x] = np.quantile(mu_r[x], q=1 - 0.05 / 2)
                r[x] = np.mean(mu_r[x])

            # Uncalibrated models: mean as dashed line, 95% interval as band.
            for _, prefix, color, _ in algos:
                raw = f"{prefix}_raw"
                ax.axvline(r[raw], color=color, lw=1, ls="dashed")
                ax.axvspan(low[raw], high[raw], color=color, alpha=0.2)

            # Calibrated models: histogram plus header / r / CI annotations.
            pair = f"\\mathrm{{{speakers[a]}}},\\mathrm{{{speakers[b]}}}"
            for name, prefix, color, (y_head, y_r, y_ci, _) in algos:
                cal = f"{prefix}_calibrated"
                ax.hist(
                    mu_r[cal], bins=bins, histtype="step", density=True, color=color
                )
                _right_text(
                    ax, y_head, f"\\scriptsize\\textbf{{{name}}}:", color=color
                )
                _right_text(ax, y_r, f"\\scriptsize $r({pair})={r[cal]:.2f}$")
                _right_text(
                    ax,
                    y_ci,
                    f"\\scriptsize $\\mathrm{{CI}}_{{95\\%}}$"
                    f"[{low[cal]:.2f}, {high[cal]:.2f}]",
                )

            # Mean r of the uncalibrated models, listed under each CI line.
            for _, prefix, _, (_, _, _, y_raw) in algos:
                raw = f"{prefix}_raw"
                _right_text(
                    ax,
                    y_raw,
                    f"\\scriptsize $r(\\mathrm{{uncalib.}})={r[raw]:.2f}$",
                )

            # Densities are only compared visually; hide the y scale.
            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_xlim(-0.5, 0.5)

    plt.subplots_adjust(wspace=0.05, hspace=0.05)

    stem = f"{output_dir}/correlations_child_level_all"
    fig.savefig(f"{stem}.eps", bbox_inches="tight")
    fig.savefig(f"{stem}.png", bbox_inches="tight", dpi=720)
    fig.savefig(f"{stem}.pdf", bbox_inches="tight")
# Pickled input tables, one per annotation source / model.
pickle_paths = {
    "truth": "output/aggregates_lena_age24_human.pickle",
    "lena_raw": "output/aggregates_lena_age24_algo.pickle",
    "vtc_raw": "output/aggregates_vtc_age24_algo.pickle",
    "lena_calibrated": "output/aggregates_lena_age24_dev_siblings_binomial_hurdle_fast.pickle",
    "vtc_calibrated": "output/aggregates_vtc_age24_dev_siblings_binomial_hurdle_fast.pickle",
}

# NOTE: pickle.load executes arbitrary code from the file; only point these
# paths at files you trust.
data = {}
for key, path in pickle_paths.items():
    with open(path, "rb") as f:
        data[key] = pickle.load(f)

# Sample archives (.npz) matching the tables above, keyed identically.
samples = {
    "truth": np.load("output/aggregates_lena_age24_human.npz"),
    "lena_raw": np.load("output/aggregates_lena_age24_algo.npz"),
    "vtc_raw": np.load("output/aggregates_vtc_age24_algo.npz"),
    "lena_calibrated": np.load("output/aggregates_lena_age24_dev_siblings_binomial_hurdle_fast.npz"),
    "vtc_calibrated": np.load("output/aggregates_vtc_age24_dev_siblings_binomial_hurdle_fast.npz"),
}

# Human-readable names for each source/model key.
labels = {
    "prior": "Prior",
    "truth": "Manual annotations",
    "lena_raw": "LENA (uncalibrated)",
    "vtc_raw": "VTC (uncalibrated)",
    "lena_calibrated": "LENA (calibrated)",
    "vtc_calibrated": "VTC (calibrated)",
}

correlations(data, samples)