- #!/usr/bin/env python3
- import pandas as pd
- import pickle
- import numpy as np
- from scipy.special import logit, expit
- import argparse
- import matplotlib
- import matplotlib.pyplot as plt
- from scipy.stats import gamma
- from os.path import basename
# Render all figures through LaTeX via the pgf backend.  The backend must be
# selected before any figure is created.
matplotlib.use("pgf")
matplotlib.rcParams.update(
    {
        "pgf.texsystem": "pdflatex",  # compile pgf output with pdflatex
        "font.family": "serif",
        "font.serif": "Times New Roman",
        "text.usetex": True,  # typeset every text element with LaTeX
        "pgf.rcfonts": False,  # keep the fonts configured above, not pgf defaults
    }
)
def set_size(width, fraction=1, ratio=None):
    """Compute figure dimensions in inches for a LaTeX document.

    Parameters
    ----------
    width : float
        Document text width in TeX points (pt).
    fraction : float, optional
        Fraction of the text width the figure should span.
    ratio : float, optional
        Height/width ratio; defaults to the inverse golden ratio.

    Returns
    -------
    tuple of float
        Figure (width, height) in inches.
    """
    # One TeX point is 1/72.27 inch.
    golden = (5 ** 0.5 - 1) / 2
    height_over_width = golden if ratio is None else ratio
    width_in = width * fraction / 72.27
    return width_in, width_in * height_over_width
# Command-line interface: a pickled data set, fitted posterior samples
# (.npz), and an output path, plus the index of the corpus to plot the
# per-speaker distributions for (defaults to the last corpus).
cli = argparse.ArgumentParser(description="speech_rate")
cli.add_argument("data")
cli.add_argument("fit")
cli.add_argument("output")
cli.add_argument("--selected-corpus", type=int, default=-1)
args = cli.parse_args()
# Load the observed data (a pickled dict of counts and dimensions) and the
# posterior samples (.npz archive of arrays).
# NOTE(review): pickle.load on an untrusted file can execute arbitrary
# code — assumes args.data comes from a trusted upstream pipeline step.
with open(args.data, "rb") as fp:
    data = pickle.load(fp)
fit = np.load(args.fit)
# Number of posterior draws; speech_rate_mu is indexed
# (sample, speaker type, corpus) — see its use below.
n_samples = fit["speech_rate_mu"].shape[0]
# Corpus display labels.  The real corpus names used to be read from the
# training-set CSV (kept commented for reference); placeholders are used now.
# corpora = pd.read_csv("output/training_set.csv")
# corpora = corpora["corpus"].map(basename).tolist()
corpora = ["1", "2", "3", "4"]
# Speaker categories — presumably key child, other child, female adult,
# male adult (TODO confirm against the annotation scheme).
speakers = ["CHI", "OCH", "FEM", "MAL"]
n_groups = data["n_groups"]
n_corpora = data["n_corpora"]
# One color per speaker category, in the same order as `speakers`.
colors = ['#377eb8', '#ff7f00', '#4daf4a', '#f781bf']
# Posterior summaries of the latent mean vocalization rate per corpus and
# speaker type.  fit["speech_rate_mu"] has shape
# (n_samples, n_speaker_types, n_corpora); the 1000 factor converts it to
# voc/h (assumed unit conversion — TODO confirm with the model code).
n_speaker_types = len(speakers)
mu = np.zeros((n_corpora, n_speaker_types))       # posterior mean
mu_low = np.zeros((n_corpora, n_speaker_types))   # 2.5% quantile
mu_high = np.zeros((n_corpora, n_speaker_types))  # 97.5% quantile

# Per-corpus "input" rate: each posterior draw summed over speaker types.
inputs = np.zeros((n_corpora, n_samples))
# NOTE(review): `input` shadows the builtin; the name is kept because the
# plotting code below refers to it.
input = np.zeros(n_corpora)
input_low = np.zeros(n_corpora)
input_high = np.zeros(n_corpora)

for c in range(n_corpora):
    for i in range(n_speaker_types):
        mus = 1000 * fit["speech_rate_mu"][:, i, c]
        mu[c, i] = np.mean(mus)
        mu_low[c, i] = np.quantile(mus, q=0.05 / 2)
        mu_high[c, i] = np.quantile(mus, q=1 - 0.05 / 2)
        inputs[c, :] += mus
    input[c] = inputs[c, :].mean()
    input_low[c] = np.quantile(inputs[c, :], q=0.05 / 2)
    input_high[c] = np.quantile(inputs[c, :], q=1 - 0.05 / 2)
# Exclude corpora that should not appear in the plots, and filter every
# summary array consistently with the corpus label list.
excluded = {"winnipeg", "tsimane2017"}
keep = [name not in excluded for name in corpora]
corpora = [name for name, kept in zip(corpora, keep) if kept]
mu = mu[keep, :]
mu_low = mu_low[keep, :]
mu_high = mu_high[keep, :]
input = input[keep]
input_low = input_low[keep]
input_high = input_high[keep]
print(corpora)
print(mu.shape)
# Figure 1: 2x2 grid, one panel per speaker type, showing the posterior
# mean and 95% credible interval of the latent voc/h in each corpus.
fig, axes = plt.subplots(2, 2, sharex=True, sharey=True)
for row in range (2):
    for col in range(2):
        # NOTE(review): panels are filled column-major (i = row + 2*col),
        # so speakers[1] lands bottom-left — confirm this layout is intended.
        i = row+2*col
        axes[row,col].scatter(np.arange(mu.shape[0]), mu[:,i])
        # Asymmetric error bars: distances from the mean down to the 2.5%
        # quantile and up to the 97.5% quantile.
        axes[row,col].errorbar(np.arange(mu.shape[0]), mu[:,i], (mu[:,i]-mu_low[:,i], mu_high[:,i]-mu[:,i]) ,ls="none")
        axes[row,col].set_title(speakers[i])
        axes[row,col].set_ylim(0,1200)
        axes[row,col].set_xticks(np.arange(mu.shape[0]))
        axes[row,col].set_xticklabels(corpora)
        axes[row,col].xaxis.set_tick_params(rotation=90)

        # Only label the y axis on the left column.
        if col==0:
            axes[row,col].set_ylabel("voc/h")
fig.suptitle("Latent population mean voc/h\n(human annotations)")
# NOTE(review): saved to a hard-coded path rather than args.output — confirm.
fig.savefig("output/quantities.png", bbox_inches="tight")
# Figure 2: two panels comparing, per corpus, the CHI rate ("output",
# speaker index 0) against the rate summed over all speaker types ("input"),
# each with its 95% credible interval.
fig, axes = plt.subplots(1, 2, sharex=True, sharey=True)
for col in range(2):
    if col == 0:
        # "output": the key child's own vocalization rate (CHI).
        axes[col].scatter(np.arange(mu.shape[0]), mu[:,0])
        axes[col].errorbar(np.arange(mu.shape[0]), mu[:,0], (mu[:,0]-mu_low[:,0], mu_high[:,0]-mu[:,0]) ,ls="none")
    else:
        # "input": total rate summed over all speaker types.
        axes[col].scatter(np.arange(len(input)), input)
        axes[col].errorbar(np.arange(len(input)), input, (input-input_low, input_high-input) ,ls="none")
    axes[col].set_title("output" if col == 0 else "input")
    axes[col].set_ylim(0,2000)
    axes[col].set_xticks(np.arange(mu.shape[0]))
    axes[col].set_xticklabels(corpora)
    axes[col].xaxis.set_tick_params(rotation=90)

    # Only label the y axis on the leftmost panel.
    if col==0:
        axes[col].set_ylabel("voc/h")
fig.suptitle("Latent population mean voc/h\n(human annotations)")
# NOTE(review): hard-coded output path, like the first figure — confirm.
fig.savefig("output/input_output.png", bbox_inches="tight")
# Figure 3: posterior voc/h gamma distribution for each speaker type in the
# selected corpus, with a 90% band across posterior draws of the density.
fig, ax = plt.subplots(1, 1)
for i in range(4):
    alphas = fit["speech_rate_alpha"][:, i, args.selected_corpus]
    # Gamma scale chosen so that mean = alpha * scale = 1000 * mu (voc/h).
    scale = 1000 * fit["speech_rate_mu"][:, i, args.selected_corpus] / alphas
    x = np.linspace(0, 500, 200, True)
    # Evaluate the density on the whole grid at once via broadcasting:
    # rows index the grid, columns index posterior draws.  loc defaults to
    # 0, matching the original per-point calls with a zero loc array.
    pdf = gamma.pdf(x[:, None], alphas[None, :], scale=scale[None, :])
    pdf_low = np.quantile(pdf, q=0.05, axis=1)
    pdf_high = np.quantile(pdf, q=0.95, axis=1)
    pdf_mean = np.mean(pdf, axis=1)
    ax.plot(x, pdf_mean, color=colors[i], label=speakers[i])
    ax.fill_between(x, pdf_low, pdf_high, color=colors[i], alpha=0.2)

ax.set_xlim(0, 500)
ax.set_xlabel("voc/h")
ax.legend()

corpus = corpora[args.selected_corpus]
fig.suptitle(f"voc/h distribution for each kind of speaker ({corpus})")
fig.savefig(f"output/speech_distribution_{corpus}.png", bbox_inches="tight")
plt.show()