123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389 |
- from email.errors import MalformedHeaderDefect
- from logging import logMultiprocessing
- import pandas as pd
- import numpy as np
- from matplotlib import pyplot as plt
- import matplotlib
# Render every figure through the pgf backend, with text typeset by LaTeX
# (pdflatex) in a serif "Times New Roman" face.
matplotlib.use("pgf")
matplotlib.rcParams.update({
    "pgf.texsystem": "pdflatex",
    "font.family": "serif",
    "font.serif": "Times New Roman",
    "text.usetex": True,
    "pgf.rcfonts": False,
})
- import pickle
- import argparse
# Command-line interface: --run and --model are interpolated into the names
# of the aggregate files this script loads.
parser = argparse.ArgumentParser()
parser.add_argument("--run")
parser.add_argument("--model")
args = parser.parse_args()
# Speaker labels, in the column order used to index the speaker axis of the
# arrays plotted by the functions in this file.
speakers = ["CHI", "OCH", "FEM", "MAL"]

# Color palette used for all figures; corpora are colored by their index.
cb_colors = [
    "#377eb8",
    "#ff7f00",
    "#f781bf",
    "#4daf4a",
    "#a65628",
    "#984ea3",
    "#999999",
    "#e41a1c",
    "#dede00",
]

# Corpus name -> zero-based corpus index.
corpora = {
    "bergelson": 0,
    "cougar": 1,
    "fausey-trio": 2,
    "lucid": 3,
    "warlaumont": 4,
    "winnipeg": 5,
}

# Reverse lookup: zero-based corpus index -> corpus name.
corpora_names = {index: name for name, index in corpora.items()}
def normality_test(data, log=False):
    """Plot per-speaker histograms of ``data["vocs"]`` with a normality test.

    For every entry of ``speakers``, the matching column of ``data["vocs"]``
    (optionally log10-transformed) is passed to ``scipy.stats.normaltest`` and
    drawn as a step histogram; the legend entry shows the speaker label and
    the test's p-value. The figure is saved to ``output/normality.eps``, or
    ``output/normality_log.eps`` when ``log`` is true.

    Args:
        data: Mapping whose ``"vocs"`` entry is indexed as ``[:, speaker]``.
        log: If true, test and plot ``log10`` of the values, using bins
            spanning 2 to 4 instead of 0 to 5000.
    """
    from scipy.stats import normaltest

    bins = np.linspace(2, 4, 20) if log else np.linspace(0, 5000, 40)
    vocs = data["vocs"]

    fig, ax = plt.subplots()
    # Iterate over the speaker labels directly so the column count cannot
    # drift away from the length of ``speakers``.
    for i, speaker in enumerate(speakers):
        values = np.log10(vocs[:, i]) if log else vocs[:, i]
        res = normaltest(values)
        ax.hist(
            values,
            bins=bins,
            label=f"{speaker}, p={res.pvalue:.4f}",
            histtype="step",
        )

    fig.legend()
    suffix = "_log" if log else ""
    fig.savefig(f"output/normality{suffix}.eps")
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)
def truth_vocs_predictions(data, samples):
    """Scatter ``data["vocs"]`` against the sampled ``truth_vocs`` per speaker.

    Draws a 2x2 grid of log-log panels, one per entry of ``speakers``. In
    each panel the x value is the matching column of ``data["vocs"]`` (axis
    labelled "VTC") and the y value is the mean over axis 0 of
    ``samples["truth_vocs"][:, :, speaker]`` (axis labelled "est. truth"),
    with error bars from the 5% to the 95% quantile over the same axis.
    Points are colored by corpus and a black identity line is drawn. The
    figure is saved as ``output/vocs_predictions.eps`` and ``.png``.

    Args:
        data: Mapping providing ``"vocs"``, ``"corpus"`` and ``"children"``.
        samples: Mapping providing ``"truth_vocs"``, indexed as
            ``[sample, row, speaker]``.
    """
    vocs = data["vocs"]
    # Fetch once: if ``samples`` is a lazily loading ``.npz`` archive, every
    # key access reads the array from disk again.
    truth_samples = samples["truth_vocs"]

    # Zero-based corpus index for every entry of data["children"]. The "- 1"
    # offsets suggest both child and corpus ids are 1-based -- TODO confirm.
    corpus_index = [data["corpus"][k - 1] - 1 for k in data["children"]]
    colors = [cb_colors[c] for c in corpus_index]
    labels = [corpora_names[c] for c in corpus_index]

    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    for row in range(2):
        for col in range(2):
            ax = axes[row, col]
            i = row + 2 * col  # speakers are laid out column by column

            draws = truth_samples[:, :, i]
            truth = draws.mean(axis=0)
            lower = np.quantile(draws, axis=0, q=0.1 / 2)
            upper = np.quantile(draws, axis=0, q=1 - 0.1 / 2)

            ax.scatter(vocs[:, i], truth, s=0.5, color=colors)
            ax.errorbar(
                vocs[:, i],
                truth,
                yerr=(truth - lower, upper - truth),
                lw=0.1,
                ls="none",
                color=colors,
                label=labels,
                alpha=0.5,
            )

            # Axis labels only on the bottom-left panel; axes are shared.
            if row == 1 and col == 0:
                ax.set_xlabel("VTC")
                ax.set_ylabel("est. truth")

            ax.set_xscale("log")
            ax.set_yscale("log")
            ax.set_ylim(20, 10000)
            ax.axline((100, 100), (1000, 1000), color="black")
            ax.set_title(speakers[i], y=1, pad=-14)

    fig.subplots_adjust(wspace=0, hspace=0)
    fig.savefig("output/vocs_predictions.eps", bbox_inches="tight")
    fig.savefig("output/vocs_predictions.png", bbox_inches="tight", dpi=300)
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)
def age_distrib(data):
    """Plot one step histogram of ``data["age"]`` per corpus.

    Bins are unit-wide, running from the minimum to the maximum age; the x
    axis is labelled "age (months)". Each corpus present in the data gets its
    own colour and legend entry. The figure is saved to
    ``output/age_distrib.eps``.

    Args:
        data: Mapping providing ``"age"``, ``"corpus"`` and ``"children"``.
    """
    age = data["age"]
    # Zero-based corpus index for every entry of data["children"]. The "- 1"
    # offsets suggest both child and corpus ids are 1-based -- TODO confirm.
    corpus = np.array([data["corpus"][k - 1] - 1 for k in data["children"]])
    bins = np.arange(age.min(), age.max() + 1)

    fig, ax = plt.subplots()
    for c in np.unique(corpus):
        ax.hist(
            age[corpus == c],
            bins=bins,
            color=cb_colors[c],
            histtype="step",
            label=corpora_names[c],
        )

    ax.set_xlabel("age (months)")
    ax.legend()
    fig.savefig("output/age_distrib.eps", bbox_inches="tight")
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)
def correlations(data, samples):
    """Plot the distribution of pairwise speaker correlations.

    For each pair of speakers, the Pearson correlation between the two
    matching columns of ``samples["truth_vocs"]`` is computed separately for
    every sample (axis 0) and shown as a density histogram, annotated with
    its mean and 95% interval. The correlation between the same two columns
    of ``data["vocs"]`` is drawn as a vertical line and annotated as
    ``R(VTC)``. The figure is saved to ``output/correlations.eps``.

    Panels are arranged in a 3x3 grid with the unused cells switched off::

                            FEM,MAL
                  OCH,FEM   OCH,MAL
        CHI,OCH   CHI,FEM   CHI,MAL

    Args:
        data: Mapping providing ``"vocs"``, indexed as ``[:, speaker]``.
        samples: Mapping providing ``"truth_vocs"``, indexed as
            ``[sample, row, speaker]``.
    """
    # NOTE: an additional reference correlation computed from
    # data["speech_rates"] (restricted by data["group_corpus"]) used to be
    # drawn here; it was disabled in the original code and is not reproduced.
    vtc = data["vocs"]
    truth = samples["truth_vocs"]
    bins = np.linspace(-1, 1, 100)  # identical for every panel

    fig, axes = plt.subplots(nrows=3, ncols=3, sharex=True, sharey=True)
    for i in range(3):
        for j in range(3):
            ax = axes[i, j]
            if i < 2 - j:
                ax.axis("off")
                continue

            a = 2 - i
            b = j + 1

            vtc_r = np.corrcoef(vtc[:, a], vtc[:, b])[0, 1]
            truth_r = [np.corrcoef(draw[:, a], draw[:, b])[0, 1] for draw in truth]

            low = np.quantile(truth_r, q=0.05 / 2)
            high = np.quantile(truth_r, q=1 - 0.05 / 2)
            r = np.mean(truth_r)

            ax.axvline(vtc_r, color=cb_colors[0], lw=1)
            ax.hist(truth_r, bins=bins, histtype="step", density=True, color="black")
            ax.text(
                0.05,
                0.9,
                f"\\tiny $R(\\mathrm{{{speakers[a]}}},\\mathrm{{{speakers[b]}}})={r:.2f}$",
                ha="left",
                transform=ax.transAxes,
                color="black",
            )
            ax.text(
                0.05,
                0.8,
                f"\\tiny $\\mathrm{{CI}}_{{95\\%}}$[{low:.2f}, {high:.2f}]",
                ha="left",
                transform=ax.transAxes,
                color="black",
            )
            ax.text(
                0.05,
                0.7,
                f"\\tiny $R(\\mathrm{{VTC}})={vtc_r:.2f}$",
                ha="left",
                transform=ax.transAxes,
                color=cb_colors[0],
            )

            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_xlim(-0.8, 0.8)

    fig.subplots_adjust(wspace=0.1, hspace=0.1)
    fig.savefig("output/correlations.eps", bbox_inches="tight")
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)
def child_level_correlations(data, samples):
    """Plot the distribution of pairwise correlations of ``mu_child_level``.

    For each speaker pair, the Pearson correlation across children between
    two columns of ``samples["mu_child_level"]`` is computed for every sample
    (axis 0) and shown as a density histogram, annotated with its mean and
    95% interval. When the first speaker is ``speakers[1]``, its column is
    first multiplied by ``exp(beta_sib_och)`` for children whose sibling
    value is positive. The figure is saved to
    ``output/child_level_correlations.eps``.

    Panels are arranged in a 2x2 grid with the unused cell switched off::

                  FEM,MAL
        OCH,FEM   OCH,MAL

    Args:
        data: Mapping providing ``"n_children"``, ``"n_recs"`` and the
            ``"children"`` / ``"siblings"`` series (accessed with ``.iloc``).
        samples: Mapping providing ``"mu_child_level"``, indexed as
            ``[sample, child, column]``, and ``"beta_sib_och"``, indexed by
            sample.
    """
    mu = samples["mu_child_level"]
    beta = samples["beta_sib_och"]
    n_samples = mu.shape[0]

    # Per-child sibling value, filled in from the per-recording series.
    # Children that never appear keep the initial value 0.
    children = data["children"]
    siblings = data["siblings"]
    sibs = np.zeros(data["n_children"])
    for k in range(data["n_recs"]):
        sibs[children.iloc[k] - 1] = siblings.iloc[k]

    # Negative sibling values are excluded -- presumably they mark missing
    # data; verify against the code that builds ``data``.
    has_sibs_data = sibs >= 0
    has_siblings = sibs[has_sibs_data] > 0

    bins = np.linspace(-1, 1, 100)  # identical for every panel

    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    for i in range(2):
        for j in range(2):
            ax = axes[i, j]
            if i < 1 - j:
                ax.axis("off")
                continue

            # a and b index ``speakers``; the matching ``mu`` columns are
            # a - 1 and b - 1.
            a = 1 - i + 1
            b = j + 2

            # b is always >= 2, so only ``a`` can select speakers[1]; that is
            # the only column the sibling effect is applied to.
            if a == 1:
                mu_r = [
                    np.corrcoef(
                        mu[k, has_sibs_data, a - 1] * np.exp(has_siblings * beta[k]),
                        mu[k, has_sibs_data, b - 1],
                    )[0, 1]
                    for k in range(n_samples)
                ]
            else:
                mu_r = [
                    np.corrcoef(draw[:, a - 1], draw[:, b - 1])[0, 1] for draw in mu
                ]

            low = np.quantile(mu_r, q=0.05 / 2)
            high = np.quantile(mu_r, q=1 - 0.05 / 2)
            r = np.mean(mu_r)

            ax.hist(mu_r, bins=bins, histtype="step", density=True, color="black")
            ax.text(
                0.05,
                0.9,
                f"$R(\\mathrm{{{speakers[a]}}},\\mathrm{{{speakers[b]}}})={r:.2f}$",
                ha="left",
                transform=ax.transAxes,
                color="black",
            )
            ax.text(
                0.05,
                0.8,
                f"$\\mathrm{{CI}}_{{95\\%}}$[{low:.2f}, {high:.2f}]",
                ha="left",
                transform=ax.transAxes,
                color="black",
            )

            ax.set_yticks([])
            ax.set_yticklabels([])
            ax.set_xlim(-0.7, 0.7)

    fig.subplots_adjust(wspace=0.1, hspace=0.1)
    fig.savefig("output/child_level_correlations.eps", bbox_inches="tight")
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)
def dev_random_effect(data, samples):
    """Plot per-child ``child_dev_age`` estimates, sorted by their mean.

    For every child (axis 1 of ``samples["child_dev_age"]``) the mean over
    samples is plotted on the x axis against the child's rank on the y axis,
    with horizontal error bars spanning the 2.5% to 97.5% quantiles. Points
    are colored by corpus. The figure is saved to
    ``output/dev_random_effect.eps``.

    Args:
        data: Mapping providing ``"corpus"``, indexed by zero-based child
            position; the "- 1" offset suggests its values are 1-based corpus
            ids -- TODO confirm.
        samples: Mapping providing ``"child_dev_age"``, indexed as
            ``[sample, child]``.
    """
    beta = samples["child_dev_age"]
    n_children = beta.shape[1]

    mu_beta = beta.mean(axis=0)
    low_beta = np.quantile(beta, axis=0, q=0.05 / 2)
    high_beta = np.quantile(beta, axis=0, q=1 - 0.05 / 2)
    order = np.argsort(mu_beta)

    # Named ``child_corpora`` so the module-level ``corpora`` dict is not
    # shadowed.
    child_corpora = data["corpus"][np.arange(n_children)] - 1
    child_corpora = child_corpora[order]
    colors = [cb_colors[corpus] for corpus in child_corpora]
    labels = [corpora_names[corpus] for corpus in child_corpora]

    ranks = np.arange(n_children)
    center = mu_beta[order]

    fig, ax = plt.subplots()
    ax.scatter(center, ranks, s=1, color=colors)
    ax.errorbar(
        center,
        ranks,
        xerr=(center - low_beta[order], high_beta[order] - center),
        ls="none",
        color=colors,
        label=labels,
        lw=1,
    )
    ax.set_xlabel("$\\beta_c$")
    ax.set_ylabel("child number")
    fig.savefig("output/dev_random_effect.eps", bbox_inches="tight")
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)
def input_effect(data, samples):
    """Plot the density histogram of ``samples["beta_dev"]``.

    The histogram uses 25 bin edges between -2.5 and 2.5 and is annotated
    with the sample mean and the 2.5%-97.5% quantile interval. The figure is
    saved to ``output/input_effect.eps``.

    Args:
        data: Unused; kept so all plotting functions share one call shape.
        samples: Mapping providing ``"beta_dev"``.
    """
    # Fetch once: if ``samples`` is a lazily loading ``.npz`` archive, every
    # key access reads the array from disk again.
    delta = samples["beta_dev"]

    m = np.mean(delta)
    low = np.quantile(delta, q=0.05 / 2)
    high = np.quantile(delta, q=1 - 0.05 / 2)
    bins = np.linspace(-2.5, 2.5, 25)

    fig, ax = plt.subplots()
    ax.hist(delta, bins=bins, histtype="step", density=True)
    ax.text(
        0.05,
        0.9,
        f"$\\mu(\\delta)={m:.1f}$",
        ha="left",
        transform=ax.transAxes,
    )
    ax.text(
        0.05,
        0.8,
        f"$\\mathrm{{CI}}_{{95\\%}}$[{low:.1f}, {high:.1f}]",
        ha="left",
        transform=ax.transAxes,
    )
    ax.set_xlabel("$\\delta$")
    fig.savefig("output/input_effect.eps", bbox_inches="tight")
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)
def dev_total_rate(data, samples):
    """Plot the per-child total of ``child_dev_age`` plus the scaled offset.

    For every sample and child the plotted quantity is::

        child_dev_age + beta_dev * (adu_child - adu_pop)

    where ``adu_child`` is the sum of columns 1 and 2 of ``mu_child_level``
    and ``adu_pop`` is the sum of columns 2 and 3 of ``mu_pop_level``.
    Children are sorted by the mean over samples, which is plotted against
    rank with horizontal error bars spanning the 2.5% to 97.5% quantiles and
    colored by corpus. The figure is saved to ``output/dev_total_rate.eps``.

    Args:
        data: Mapping providing ``"corpus"``, indexed by zero-based child
            position; the "- 1" offset suggests its values are 1-based corpus
            ids -- TODO confirm.
        samples: Mapping providing ``"child_dev_age"`` (``[sample, child]``),
            ``"beta_dev"`` (``[sample]``), ``"mu_child_level"``
            (``[sample, child, column]``) and ``"mu_pop_level"``
            (``[sample, column]``).
    """
    beta = samples["child_dev_age"]
    delta = samples["beta_dev"]
    # Fetch each array once: if ``samples`` is a lazily loading ``.npz``
    # archive, every key access reads the array from disk again.
    mu_child = samples["mu_child_level"]
    mu_pop = samples["mu_pop_level"]

    # The column offsets differ between the two arrays (1, 2 vs 2, 3) --
    # presumably mu_child_level has no column for the first speaker; verify
    # against the model definition.
    adu = mu_child[:, :, 1] + mu_child[:, :, 2]
    adu_pop = mu_pop[:, 2] + mu_pop[:, 3]
    total = beta + delta[:, np.newaxis] * (adu - adu_pop[:, np.newaxis])

    n_children = total.shape[1]
    mu = total.mean(axis=0)
    low = np.quantile(total, axis=0, q=0.05 / 2)
    high = np.quantile(total, axis=0, q=1 - 0.05 / 2)
    order = np.argsort(mu)

    # Named ``child_corpora`` so the module-level ``corpora`` dict is not
    # shadowed.
    child_corpora = data["corpus"][np.arange(n_children)] - 1
    child_corpora = child_corpora[order]
    colors = [cb_colors[corpus] for corpus in child_corpora]
    labels = [corpora_names[corpus] for corpus in child_corpora]

    ranks = np.arange(n_children)
    center = mu[order]

    fig, ax = plt.subplots()
    ax.scatter(center, ranks, s=1, color=colors)
    ax.errorbar(
        center,
        ranks,
        xerr=(center - low[order], high[order] - center),
        ls="none",
        color=colors,
        label=labels,
        lw=1,
    )
    # Every backslash is escaped: a bare "\m" is an invalid escape sequence.
    # The resulting label text is unchanged.
    ax.set_xlabel(
        "$\\beta_c+\\delta (\\mu^{\\mathrm{child}}_{c,\\mathrm{ADU}}"
        "-\\mu^{\\mathrm{pop}}_{\\mathrm{ADU}})$"
    )
    ax.set_ylabel("child number")
    fig.savefig("output/dev_total_rate.eps", bbox_inches="tight")
    # pyplot keeps a reference to every figure it creates until it is closed.
    plt.close(fig)
# Load the aggregates selected by --run / --model and produce every figure.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# run this against aggregates produced by a trusted pipeline.
with open(f"output/aggregates_{args.run}_{args.model}.pickle", "rb") as f:
    data = pickle.load(f)

# Use the archive as a context manager so its file handle is released once
# all figures have been written.
with np.load(f"output/aggregates_{args.run}_{args.model}.npz") as samples:
    dev_random_effect(data, samples)
    dev_total_rate(data, samples)
    input_effect(data, samples)
    correlations(data, samples)
    child_level_correlations(data, samples)
    normality_test(data, log=True)
    normality_test(data, log=False)
    truth_vocs_predictions(data, samples)
    age_distrib(data)
|