123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137 |
- from email.errors import MalformedHeaderDefect
- from logging import logMultiprocessing
- import pandas as pd
- import numpy as np
- from matplotlib import pyplot as plt
- import matplotlib
# Switch matplotlib to the PGF backend and typeset all text with LaTeX
# (pdflatex) in a Times New Roman serif face. "pgf.rcfonts" is disabled so
# the fonts are not set up from these rc parameters.
matplotlib.use("pgf")
pgf_settings = {
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "font.serif": "Times New Roman",
    "text.usetex": True,
    "pgf.rcfonts": False,
}
matplotlib.rcParams.update(pgf_settings)
- import pickle
- from os.path import join as opj
- import argparse
# Command-line interface. Each option is the basename (without extension) of
# a pair of files: output/<name>.pickle and output/<name>.npz. Both options
# are required: without a value the paths below would silently become
# "output/None.pickle" / "output/None.npz".
parser = argparse.ArgumentParser()
parser.add_argument(
    "--vtc",
    required=True,
    help="basename of output/<name>.pickle and output/<name>.npz for VTC",
)
parser.add_argument(
    "--lena",
    required=True,
    help="basename of output/<name>.pickle and output/<name>.npz for LENA",
)
args = parser.parse_args()

# Pickled input data, keyed by algorithm ("lena" first, then "vtc").
# NOTE(security): pickle.load can execute arbitrary code; only point this
# script at pickle files that were produced by a trusted run.
data = {}
for key, stem in (("lena", args.lena), ("vtc", args.vtc)):
    with open(opj("output", f"{stem}.pickle"), "rb") as f:
        data[key] = pickle.load(f)

# Matching .npz archives, keyed by algorithm.
samples = {
    "vtc": np.load(opj("output", f"{args.vtc}.npz")),
    "lena": np.load(opj("output", f"{args.lena}.npz")),
}
# Display name of each algorithm key.
labels = dict(lena="LENA", vtc="VTC")

# Speaker categories, in column order.
speakers = ["CHI", "OCH", "FEM", "MAL"]

# Hex colors used for plotting.
cb_colors = [
    "#377eb8", "#ff7f00", "#f781bf",
    "#4daf4a", "#a65628", "#984ea3",
    "#999999", "#e41a1c", "#dede00",
]

# Corpus name -> zero-based index, and the inverse lookup.
corpora = {
    name: index
    for index, name in enumerate(
        (
            "bergelson",
            "cougar",
            "fausey-trio",
            "lucid",
            "warlaumont",
            "winnipeg",
        )
    )
}
corpora_names = {index: name for name, index in corpora.items()}
def truth_vocs_predictions(data, samples, algo):
    """Plot the estimated true vocalization counts against an algorithm's counts.

    Draws a 2x2 grid of log-log panels sharing both axes, one panel per entry
    of ``speakers``. For every corpus, each selected row is drawn as a point
    at (algorithm count, mean of ``truth_vocs`` over its first axis) with an
    error bar spanning the 2.5%-97.5% quantiles over that same axis. A black
    identity line is added to every panel. The figure is saved as
    ``output/vocs_predictions_{algo}`` with the ``.eps``, ``.png`` (300 dpi)
    and ``.pdf`` extensions, then closed.

    Args:
        data: Mapping providing ``"corpus"`` (corpus ids, looked up at
            ``k - 1`` for each ``k`` in ``"children"``), ``"children"`` and
            ``"vocs"`` (2-D array indexed ``[row, speaker]``). Corpus ids are
            used as ``id - 1`` to index ``cb_colors`` and ``corpora_names``.
        samples: Mapping whose ``"truth_vocs"`` entry is a 3-D array indexed
            ``[sample, row, speaker]`` (presumably posterior draws; confirm
            against the code that writes the ``.npz`` file).
        algo: Algorithm key; used in the output file names and, through the
            module-level ``labels`` mapping, as the x-axis label.
    """
    # Look the arrays up once: ``samples`` may be a lazily loaded ``.npz``
    # archive, in which case every ``samples[...]`` access re-reads the file.
    counts = data["vocs"]
    truth_draws = samples["truth_vocs"]

    # The row mask of a corpus does not depend on the panel: build each once.
    corpus_of = data["corpus"]
    children = data["children"]
    masks = {
        corpus: np.array([corpus_of[k - 1] == corpus for k in children])
        for corpus in set(corpus_of)
    }

    alpha = 0.05  # error bars cover the central 1 - alpha interval

    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    for row in range(2):
        for col in range(2):
            ax = axes[row, col]
            i = row + 2 * col  # speaker index; panels are filled column-wise

            for corpus, mask in masks.items():
                vocs = counts[mask, i]
                draws = truth_draws[:, mask, i]
                truth = draws.mean(axis=0)
                lower = np.quantile(draws, axis=0, q=alpha / 2)
                upper = np.quantile(draws, axis=0, q=1 - alpha / 2)
                color = cb_colors[corpus - 1]

                ax.scatter(vocs, truth, s=0.5, color=color)
                ax.errorbar(
                    vocs,
                    truth,
                    yerr=(truth - lower, upper - truth),
                    lw=0.1,
                    ls="none",
                    color=color,
                    label=corpora_names[corpus - 1],
                    alpha=0.25,
                )

            # Axes are shared with no spacing, so only the bottom-left panel
            # carries axis labels.
            if row == 1 and col == 0:
                # Label the x-axis after the algorithm actually plotted
                # (previously hard-coded to "VTC", even for LENA).
                ax.set_xlabel(labels.get(algo, algo))
                ax.set_ylabel("estimated truth")

            ax.set_xscale("log")
            ax.set_yscale("log")
            ax.set_xlim(50, 4500)
            ax.set_ylim(20, 10000)
            # Identity line: points on it have estimate == algorithm count.
            ax.axline((100, 100), (1000, 1000), color="black")
            # Negative pad places the title inside the panel.
            ax.set_title(speakers[i], y=1, pad=-14)

    fig.subplots_adjust(wspace=0, hspace=0)
    fig.savefig(f"output/vocs_predictions_{algo}.eps", bbox_inches="tight")
    fig.savefig(f"output/vocs_predictions_{algo}.png", bbox_inches="tight", dpi=300)
    fig.savefig(f"output/vocs_predictions_{algo}.pdf", bbox_inches="tight")
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
# Per-algorithm summaries: the raw "vocs" array and the mean of "truth_vocs"
# over its first axis.
vocs = {algo: data[algo]["vocs"] for algo in data}
truth = {algo: samples[algo]["truth_vocs"].mean(axis=0) for algo in data}

# For every speaker column, print the correlation coefficient between the
# LENA and VTC values, first for the estimated truth and then for the counts.
for i, speaker in enumerate(speakers):
    print(speaker)
    truth_r = np.corrcoef(truth["lena"][:, i], truth["vtc"][:, i])[0, 1]
    print(f"truth: {truth_r:.2f}")
    vocs_r = np.corrcoef(vocs["lena"][:, i], vocs["vtc"][:, i])[0, 1]
    print(f"vocs: {vocs_r:.2f}")

# One figure per algorithm, LENA first.
for algo in ("lena", "vtc"):
    truth_vocs_predictions(data[algo], samples[algo], algo)
|