diagnostics.py 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389
  1. from email.errors import MalformedHeaderDefect
  2. from logging import logMultiprocessing
  3. import pandas as pd
  4. import numpy as np
  5. from matplotlib import pyplot as plt
  6. import matplotlib
  7. matplotlib.use("pgf")
  8. matplotlib.rcParams.update(
  9. {
  10. "pgf.texsystem": "pdflatex",
  11. "font.family": "serif",
  12. "font.serif": "Times New Roman",
  13. "text.usetex": True,
  14. "pgf.rcfonts": False,
  15. }
  16. )
  17. import pickle
  18. import argparse
  19. parser = argparse.ArgumentParser()
  20. parser.add_argument("--run")
  21. parser.add_argument("--model")
  22. args = parser.parse_args()
  23. speakers = ["CHI", "OCH", "FEM", "MAL"]
  24. cb_colors = [
  25. "#377eb8",
  26. "#ff7f00",
  27. "#f781bf",
  28. "#4daf4a",
  29. "#a65628",
  30. "#984ea3",
  31. "#999999",
  32. "#e41a1c",
  33. "#dede00",
  34. ]
  35. corpora = {
  36. 'bergelson': 0, 'cougar': 1, 'fausey-trio': 2, 'lucid': 3, 'warlaumont': 4, 'winnipeg': 5
  37. }
  38. corpora_names = {
  39. corpora[corpus]: corpus
  40. for corpus in corpora
  41. }
  42. def normality_test(data, log=False):
  43. from scipy.stats import normaltest
  44. bins = np.linspace(2, 4, 20) if log else np.linspace(0, 5000, 40)
  45. fig, ax = plt.subplots()
  46. for i in range(4):
  47. values = np.log10(data["vocs"][:, i]) if log else data["vocs"][:, i]
  48. res = normaltest(values)
  49. ax.hist(
  50. values,
  51. bins=bins,
  52. label=f"{speakers[i]}, p={res.pvalue:.4f}",
  53. histtype="step",
  54. )
  55. fig.legend()
  56. suffix = "_log" if log else ""
  57. fig.savefig(f"output/normality{suffix}.eps")
  58. def truth_vocs_predictions(data, samples):
  59. fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
  60. for row in range(2):
  61. for col in range(2):
  62. i = row + 2 * col
  63. truth = samples["truth_vocs"][:, :, i].mean(axis=0)
  64. axes[row, col].scatter(
  65. data["vocs"][:, i],
  66. truth,
  67. s=0.5,
  68. color=[cb_colors[data["corpus"][k - 1] - 1] for k in data["children"]],
  69. )
  70. axes[row, col].errorbar(
  71. data["vocs"][:, i],
  72. truth,
  73. yerr=(
  74. truth
  75. - np.quantile(samples["truth_vocs"][:, :, i], axis=0, q=0.1 / 2),
  76. np.quantile(samples["truth_vocs"][:, :, i], axis=0, q=1 - 0.1 / 2)
  77. - truth,
  78. ),
  79. lw=0.1,
  80. ls="none",
  81. color=[cb_colors[data["corpus"][k - 1] - 1] for k in data["children"]],
  82. label=[corpora_names[data["corpus"][k - 1] - 1] for k in data["children"]],
  83. alpha=0.5,
  84. )
  85. if row == 1 and col == 0:
  86. axes[row, col].set_xlabel("VTC")
  87. axes[row, col].set_ylabel("est. truth")
  88. axes[row,col].set_xscale("log")
  89. axes[row,col].set_yscale("log")
  90. axes[row,col].set_ylim(20,10000)
  91. axes[row, col].axline((100, 100), (1000, 1000), color="black")
  92. axes[row, col].set_title(speakers[i], y=1, pad=-14)
  93. plt.subplots_adjust(wspace=0, hspace=0)
  94. fig.savefig("output/vocs_predictions.eps", bbox_inches="tight")
  95. fig.savefig("output/vocs_predictions.png", bbox_inches="tight", dpi=300)
  96. def age_distrib(data):
  97. fig, ax = plt.subplots()
  98. age = data["age"]
  99. corpus = np.array([data["corpus"][k - 1] - 1 for k in data["children"]])
  100. bins = np.arange(age.min(), age.max() + 1)
  101. for c in np.unique(corpus):
  102. ax.hist(
  103. age[corpus == c],
  104. bins=bins,
  105. color=cb_colors[c],
  106. histtype="step",
  107. label=corpora_names[c]
  108. )
  109. ax.set_xlabel("age (months)")
  110. ax.legend()
  111. fig.savefig("output/age_distrib.eps", bbox_inches="tight")
def correlations(data, samples):
    """Plot pairwise speaker-speaker correlation diagnostics in a lower
    triangle of a 3x3 panel grid.

    Each panel shows the distribution of Pearson correlations computed from
    posterior samples of the estimated true counts ("truth_vocs"), its mean
    and 95% credible interval, with the raw VTC-count correlation overlaid
    as a vertical line.
    """
    # Commented-out lines relate to a human-annotated calibration subset that
    # is currently disabled.
    # keep_calibration = (data["group_corpus"]==1)|(data["group_corpus"]==4)
    # calibration_truth = data["speech_rates"][keep_calibration,:]
    vtc = data["vocs"]
    truth = samples["truth_vocs"]
    n_samples = truth.shape[0]
    fig, axes = plt.subplots(nrows=3, ncols=3, sharex=True, sharey=True)
    # Panel layout (speaker pairs):
    # FEM,MAL
    # OCH,FEM OCH,MAL
    # CHI,OCH CHI,FEM CHI,MAL
    for i in range(3):
        for j in range(3):
            # Hide the cells above the anti-diagonal (upper-left triangle).
            if i<2-j:
                axes[i, j].axis('off')
                continue
            # Map grid position to `speakers` indices: rows run CHI..FEM
            # bottom-up (a = 2-i), columns run OCH..MAL left-to-right (b = j+1).
            a = 2-i
            b = j+1
            # Correlation of the raw VTC counts for this speaker pair.
            vtc_r = np.corrcoef(vtc[:,a], vtc[:,b])[0,1]
            # calibration_r = np.corrcoef(calibration_truth[:,a], calibration_truth[:,b])[0,1]
            # One correlation per posterior sample of the estimated truth.
            truth_r = [np.corrcoef(truth[k,:,a], truth[k,:,b])[0,1] for k in range(n_samples)]
            bins = np.linspace(-1,1,100)
            # 95% credible interval and posterior mean of the correlation.
            low = np.quantile(truth_r,q=0.05/2)
            high = np.quantile(truth_r,q=1-0.05/2)
            r = np.mean(truth_r)
            axes[i,j].axvline(vtc_r, color=cb_colors[0], lw=1)
            # axes[i,j].axvline(calibration_r, color="olive", lw=0.5, ls="dashed")
            axes[i,j].hist(truth_r, bins=bins, histtype="step", density=True, color="black")
            # Annotations (LaTeX via usetex): mean R, its CI, and the VTC R.
            axes[i,j].text(
                0.05, 0.9,
                f"\\tiny $R(\\mathrm{{{speakers[a]}}},\\mathrm{{{speakers[b]}}})={r:.2f}$",
                ha="left",
                transform=axes[i,j].transAxes,
                color="black"
            )
            axes[i,j].text(
                0.05, 0.8,
                f"\\tiny $\\mathrm{{CI}}_{{95\\%}}$[{low:.2f}, {high:.2f}]",
                ha="left",
                transform = axes[i,j].transAxes,
                color="black"
            )
            axes[i,j].text(
                0.05, 0.7,
                f"\\tiny $R(\\mathrm{{VTC}})={vtc_r:.2f}$",
                ha="left",
                transform = axes[i,j].transAxes,
                color=cb_colors[0]
            )
            # axes[i,j].text(
            #     0.05, 0.6,
            #     f"\\tiny $R(\\mathrm{{human}})={calibration_r:.2f}\\ast$",
            #     ha="left",
            #     transform = axes[i,j].transAxes,
            #     color="olive"
            # )
            axes[i,j].set_yticks([])
            axes[i,j].set_yticklabels([])
            axes[i,j].set_xlim(-0.8, 0.8)
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    fig.savefig("output/correlations.eps", bbox_inches="tight")
def child_level_correlations(data, samples):
    """Plot correlations between child-level mean rates (mu_child_level) for
    OCH/FEM/MAL speaker pairs, one histogram of posterior-sample correlations
    per panel in a lower triangle of a 2x2 grid.

    When OCH is involved, its rate is first corrected for sibling presence
    via the beta_sib_och effect, restricted to children with sibling data.
    """
    mu = samples["mu_child_level"]
    # Collapse recording-level sibling counts to one value per child
    # (the last recording seen for a child wins).
    sibs = np.zeros(data["n_children"])
    for k in range(data["n_recs"]):
        sibs[data["children"].iloc[k]-1] = data["siblings"].iloc[k]
    # NOTE(review): assumes missing sibling data is coded < 0; children with
    # no recording keep the initial 0 and therefore pass this filter — confirm.
    has_sibs_data = sibs>=0
    beta = samples["beta_sib_och"]
    n_samples = mu.shape[0]
    fig, axes = plt.subplots(nrows=2, ncols=2, sharex=True, sharey=True)
    # Panel layout (speaker pairs):
    # FEM,MAL
    # OCH,FEM OCH,MAL
    for i in range(2):
        for j in range(2):
            # Hide the top-left cell outside the triangle.
            if i<1-j:
                axes[i, j].axis('off')
                continue
            # a, b index into `speakers` (1=OCH, 2=FEM, 3=MAL); the matching
            # mu_child_level column is a-1 / b-1 (CHI has no column there).
            a = 1-i+1
            b = j+2
            if (a==1 or b==1):
                # OCH involved: scale its rate by exp(beta) for children with
                # at least one sibling, and drop children without sibling data.
                # NOTE(review): b = j+2 is always 2 or 3, so the b==1 (else)
                # branch below is unreachable as written.
                if a==1:
                    mu_r = [np.corrcoef(mu[k,has_sibs_data,a-1]*np.exp((sibs[has_sibs_data]>0)*beta[k]), mu[k,has_sibs_data,b-1])[0,1] for k in range(n_samples)]
                else:
                    mu_r = [np.corrcoef(mu[k,has_sibs_data,a-1], mu[k,has_sibs_data,b-1]*np.exp((sibs[has_sibs_data]>0)*beta[k]))[0,1] for k in range(n_samples)]
            else:
                # Neither speaker is OCH: correlate over all children.
                mu_r = [np.corrcoef(mu[k,:,a-1], mu[k,:,b-1])[0,1] for k in range(n_samples)]
            bins = np.linspace(-1,1,100)
            # 95% credible interval and posterior mean of the correlation.
            low = np.quantile(mu_r,q=0.05/2)
            high = np.quantile(mu_r,q=1-0.05/2)
            r = np.mean(mu_r)
            axes[i,j].hist(mu_r, bins=bins, histtype="step", density=True, color="black")
            axes[i,j].text(
                0.05, 0.9,
                f"$R(\\mathrm{{{speakers[a]}}},\\mathrm{{{speakers[b]}}})={r:.2f}$",
                ha="left",
                transform=axes[i,j].transAxes,
                color="black"
            )
            axes[i,j].text(
                0.05, 0.8,
                f"$\\mathrm{{CI}}_{{95\\%}}$[{low:.2f}, {high:.2f}]",
                ha="left",
                transform = axes[i,j].transAxes,
                color="black"
            )
            axes[i,j].set_yticks([])
            axes[i,j].set_yticklabels([])
            axes[i,j].set_xlim(-0.7, 0.7)
    plt.subplots_adjust(wspace=0.1, hspace=0.1)
    fig.savefig("output/child_level_correlations.eps", bbox_inches="tight")
  221. def dev_random_effect(data, samples):
  222. beta = samples["child_dev_age"]
  223. n_children = beta.shape[1]
  224. mu_beta = beta.mean(axis=0)
  225. low_beta = np.quantile(beta, axis=0, q=0.05/2)
  226. high_beta = np.quantile(beta, axis=0, q=1-0.05/2)
  227. order = np.argsort(mu_beta)
  228. corpora = data["corpus"][np.arange(n_children)]-1
  229. corpora = corpora[order]
  230. fig, ax = plt.subplots()
  231. ax.scatter(mu_beta[order], np.arange(n_children), s=1, color=[cb_colors[corpus] for corpus in corpora])
  232. ax.errorbar(
  233. mu_beta[order],
  234. np.arange(n_children),
  235. xerr=(mu_beta[order]-low_beta[order],high_beta[order]-mu_beta[order]),
  236. ls="none",
  237. color=[cb_colors[corpus] for corpus in corpora],
  238. label=[corpora_names[corpus] for corpus in corpora],
  239. lw=1
  240. )
  241. ax.set_xlabel("$\\beta_c$")
  242. ax.set_ylabel("child number")
  243. fig.savefig("output/dev_random_effect.eps", bbox_inches="tight")
  244. def input_effect(data, samples):
  245. fig, ax = plt.subplots()
  246. m = np.mean(samples["beta_dev"])
  247. low = np.quantile(samples["beta_dev"], q=0.05/2)
  248. high = np.quantile(samples["beta_dev"], q=1-0.05/2)
  249. bins = np.linspace(-2.5, 2.5, 25)
  250. ax.hist(
  251. samples["beta_dev"],
  252. bins=bins,
  253. histtype="step",
  254. density=True
  255. )
  256. ax.text(0.05, 0.9, f"$\\mu(\\delta)={m:.1f}$",
  257. ha="left",
  258. transform = ax.transAxes
  259. )
  260. ax.text(0.05, 0.8, f"$\\mathrm{{CI}}_{{95\\%}}$[{low:.1f}, {high:.1f}]",
  261. ha="left",
  262. transform = ax.transAxes
  263. )
  264. ax.set_xlabel("$\\delta$")
  265. fig.savefig("output/input_effect.eps", bbox_inches="tight")
  266. def dev_total_rate(data, samples):
  267. beta = samples["child_dev_age"]
  268. delta = samples["beta_dev"]
  269. adu = samples["mu_child_level"][:,:,1]+samples["mu_child_level"][:,:,2]
  270. adu_pop = samples["mu_pop_level"][:,2]+samples["mu_pop_level"][:,3]
  271. total = beta+delta[:,np.newaxis]*(adu-adu_pop[:,np.newaxis])
  272. n_children = total.shape[1]
  273. mu = total.mean(axis=0)
  274. low = np.quantile(total, axis=0, q=0.05/2)
  275. high = np.quantile(total, axis=0, q=1-0.05/2)
  276. order = np.argsort(mu)
  277. corpora = data["corpus"][np.arange(n_children)]-1
  278. corpora = corpora[order]
  279. fig, ax = plt.subplots()
  280. ax.scatter(mu[order], np.arange(n_children), s=1, color=[cb_colors[corpus] for corpus in corpora])
  281. ax.errorbar(
  282. mu[order],
  283. np.arange(n_children),
  284. xerr=(mu[order]-low[order],high[order]-mu[order]),
  285. ls="none",
  286. color=[cb_colors[corpus] for corpus in corpora],
  287. label=[corpora_names[corpus] for corpus in corpora],
  288. lw=1
  289. )
  290. ax.set_xlabel("$\\beta_c+\\delta (\\mu^{\\mathrm{child}}_{c,\mathrm{ADU}}-\\mu^{\\mathrm{pop}}_{\mathrm{ADU}})$")
  291. ax.set_ylabel("child number")
  292. fig.savefig("output/dev_total_rate.eps", bbox_inches="tight")
# Load the aggregated dataset (pickle) and posterior samples (npz) for the
# requested --run/--model, then generate every diagnostic figure.
# NOTE(review): pickle.load is only safe on trusted, locally produced files.
with open(f"output/aggregates_{args.run}_{args.model}.pickle", "rb") as f:
    data = pickle.load(f)
samples = np.load(f"output/aggregates_{args.run}_{args.model}.npz")
dev_random_effect(data, samples)
dev_total_rate(data, samples)
input_effect(data, samples)
correlations(data, samples)
child_level_correlations(data, samples)
normality_test(data, log=True)
normality_test(data, log=False)
truth_vocs_predictions(data, samples)
age_distrib(data)