# LAAC-LSCP / speaker-confusion-model
							from email.errors import MalformedHeaderDefect
from logging import logMultiprocessing
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
import matplotlib

# rcParams overrides: text is typeset by LaTeX (pdflatex) in a serif
# Times New Roman face; "pgf.rcfonts" is switched off.
_PGF_RC_OVERRIDES = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "font.serif": "Times New Roman",
    "text.usetex": True,
    "pgf.rcfonts": False,
}

# Select the PGF backend first, then apply the overrides.
matplotlib.use("pgf")
matplotlib.rcParams.update(_PGF_RC_OVERRIDES)


import pickle
from os.path import join as opj

import argparse

# Command-line interface: two optional string options, --vtc and --lena.
# NOTE(review): `args` is parsed but never read in the code visible here.
parser = argparse.ArgumentParser()
for option in ("--vtc", "--lena"):
    parser.add_argument(option)
args = parser.parse_args()

# Location of the pickled aggregates for each condition.
_PICKLE_PATHS = {
    "truth": "output/aggregates_lena_age24_human.pickle",
    "lena_raw": "output/aggregates_lena_age24_algo.pickle",
    "vtc_raw": "output/aggregates_vtc_age24_algo.pickle",
    "lena_calibrated": "output/aggregates_lena_age24_dev_siblings_binomial_hurdle_fast.pickle",
    "vtc_calibrated": "output/aggregates_vtc_age24_dev_siblings_binomial_hurdle_fast.pickle",
}

# `data` maps each condition name to the object unpickled from its file.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
data = {}
for condition, path in _PICKLE_PATHS.items():
    with open(path, "rb") as handle:
        data[condition] = pickle.load(handle)

# Location of the .npz archive for each condition.
# Alternative calibrated inputs (currently disabled):
#   "lena_calibrated": "output/aggregates_lena_dev_siblings_effect.npz"
#   "vtc_calibrated": "output/aggregates_vtc_dev_siblings_effect.npz"
_SAMPLE_PATHS = {
    "truth": "output/aggregates_lena_age24_human.npz",
    "lena_raw": "output/aggregates_lena_age24_algo.npz",
    "vtc_raw": "output/aggregates_vtc_age24_algo.npz",
    "lena_calibrated": "output/aggregates_lena_age24_dev_siblings_binomial_hurdle_fast.npz",
    "vtc_calibrated": "output/aggregates_vtc_age24_dev_siblings_binomial_hurdle_fast.npz",
}

# `samples` maps each condition name to the object returned by np.load.
samples = {name: np.load(path) for name, path in _SAMPLE_PATHS.items()}

# Human-readable display name for each condition key.
labels = dict(
    prior="Prior",
    truth="Manual annotations",
    lena_raw="LENA (uncalibrated)",
    vtc_raw="VTC (uncalibrated)",
    lena_calibrated="LENA (calibrated)",
    vtc_calibrated="VTC (calibrated)",
)

# Speaker classes; a position in this list is also used as a column index
# into the vocalization arrays when plotting.
speakers = ["CHI", "OCH", "FEM", "MAL"]

# Plot colour palette (hex RGB).
cb_colors = [
    "#377eb8",  # blue
    "#ff7f00",  # orange
    "#f781bf",  # pink
    "#4daf4a",  # green
    "#a65628",  # brown
    "#984ea3",  # purple
    "#999999",  # grey
    "#e41a1c",  # red
    "#dede00",  # yellow
]

# Corpus name -> integer index, numbered in the order listed here.
corpora = {
    name: index
    for index, name in enumerate(
        ["bergelson", "cougar", "fausey-trio", "lucid", "warlaumont", "winnipeg"]
    )
}

# Reverse lookup: integer index -> corpus name.
corpora_names = {index: name for name, index in corpora.items()}

def correlations(data, samples):
    """Plot speaker-pair correlation panels for VTC and LENA and save them.

    A 3x3 grid of axes is created and its upper-left triangle is hidden, so
    that each remaining panel corresponds to one pair of entries of
    ``speakers``:

                        FEM,MAL
                OCH,FEM OCH,MAL
        CHI,OCH CHI,FEM CHI,MAL

    In every panel, and for each of the two algorithms:

    * a dashed vertical line marks the correlation between the two speaker
      columns of the ``"vocs"`` array (annotated as ``r(uncalib.)``);
    * a step histogram shows the same correlation computed separately for
      every entry along the first axis of the ``"truth_vocs"`` array;
    * text annotations give the mean of those correlations and their 2.5%
      and 97.5% quantiles.

    Args:
        data: mapping providing ``data["vtc_calibrated"]["vocs"]`` and
            ``data["lena_calibrated"]["vocs"]``, 2-D arrays indexed as
            ``[:, speaker]``.
        samples: mapping providing ``samples["vtc_calibrated"]["truth_vocs"]``
            and ``samples["lena_calibrated"]["truth_vocs"]``, 3-D arrays
            indexed as ``[sample, :, speaker]``.

    Returns:
        None. The figure is written to ``output/correlations_all.eps`` and
        ``output/correlations_all.png``.
    """
    vtc = data["vtc_calibrated"]["vocs"]
    lena = data["lena_calibrated"]["vocs"]
    truth_vtc = samples["vtc_calibrated"]["truth_vocs"]
    truth_lena = samples["lena_calibrated"]["truth_vocs"]

    def pair_r(counts, a, b):
        """Correlation coefficient between columns ``a`` and ``b`` of a 2-D array."""
        return np.corrcoef(counts[:, a], counts[:, b])[0, 1]

    def interval_and_mean(values):
        """Return the 2.5% quantile, the 97.5% quantile and the mean of ``values``."""
        return (
            np.quantile(values, q=0.05 / 2),
            np.quantile(values, q=1 - 0.05 / 2),
            np.mean(values),
        )

    def annotate(ax, y, text, color="black"):
        """Write right-aligned text near the right edge of ``ax`` (axes coordinates)."""
        ax.text(1 - 0.05, y, text, ha="right", transform=ax.transAxes, color=color)

    fig, axes = plt.subplots(
        nrows=3, ncols=3, sharex=True, sharey=True, figsize=[9.6, 9.6]
    )

    # The same histogram bins are used in every panel.
    bins = np.linspace(-1, 1, 100)

    for row, axes_row in enumerate(axes):
        for col, ax in enumerate(axes_row):
            # Panels above the anti-diagonal do not map to a speaker pair.
            if row + col < 2:
                ax.axis("off")
                continue

            a, b = 2 - row, col + 1
            pair = f"\\mathrm{{{speakers[a]}}},\\mathrm{{{speakers[b]}}}"

            # One entry per algorithm; `heights` holds the vertical positions
            # of the heading, mean, interval and r(uncalib.) annotations.
            series = [
                {
                    "heading": "\\scriptsize\\textbf{VTC}:",
                    "color": cb_colors[0],
                    "observed_r": pair_r(vtc, a, b),
                    "draw_rs": [pair_r(draw, a, b) for draw in truth_vtc],
                    "heights": (0.95, 0.9, 0.85, 0.8),
                },
                {
                    "heading": "\\scriptsize\\textbf{LENA}:",
                    "color": cb_colors[1],
                    "observed_r": pair_r(lena, a, b),
                    "draw_rs": [pair_r(draw, a, b) for draw in truth_lena],
                    "heights": (0.7, 0.65, 0.6, 0.55),
                },
            ]

            # Dashed reference lines for both algorithms are drawn first.
            for entry in series:
                ax.axvline(entry["observed_r"], color=entry["color"], lw=1, ls="dashed")

            # Histogram of per-sample correlations with its summary text.
            for entry in series:
                y_heading, y_mean, y_interval, _ = entry["heights"]
                low, high, mean_r = interval_and_mean(entry["draw_rs"])

                ax.hist(
                    entry["draw_rs"],
                    bins=bins,
                    histtype="step",
                    density=True,
                    color=entry["color"],
                )
                annotate(ax, y_heading, entry["heading"], color=entry["color"])
                annotate(ax, y_mean, f"\\scriptsize $r({pair})={mean_r:.2f}$")
                annotate(
                    ax,
                    y_interval,
                    f"\\scriptsize $\\mathrm{{CI}}_{{95\\%}}$[{low:.2f}, {high:.2f}]",
                )

            # Value of the dashed line, written below each summary.
            for entry in series:
                observed_r = entry["observed_r"]
                annotate(
                    ax,
                    entry["heights"][3],
                    f"\\scriptsize $r(\\mathrm{{uncalib.}})={observed_r:.2f}$",
                )

            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_xlim(-0.25, 0.8)

    plt.subplots_adjust(wspace=0.05, hspace=0.05)
    fig.savefig("output/correlations_all.eps", bbox_inches="tight")
    fig.savefig("output/correlations_all.png", bbox_inches="tight", dpi=720)


# Build and save the correlation figure from the inputs loaded above.
correlations(data=data, samples=samples)