causal.stan 7.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. functions {
  2. real confusion_model_lpmf(array[] int group,
  3. int start, int end,
  4. int n_classes,
  5. array[,,] int vtc,
  6. array[,] int vtc_fp,
  7. array[,] int truth,
  8. array[,,] int coocc,
  9. array[,] int truth_total,
  10. array[] real clip_duration,
  11. array[] matrix lambda,
  12. array[] matrix delta,
  13. array[] vector lambda_fp,
  14. array[] matrix coocc_rate
  15. ) {
  16. real ll = 0;
  17. for (k in start:end) {
  18. for (i in 1:n_classes) {
  19. for (j in 1:n_classes) {
  20. if(truth[k,j]==0) {
  21. continue;
  22. }
  23. ll += poisson_lpmf(
  24. vtc[k,i,j] | lambda[group[k-start+1],j,i]*truth[k,j]*exp(delta[k,i,j])
  25. );
  26. if (i!=j) {
  27. ll += binomial_lpmf(coocc[k,j,i] | truth_total[k,i], coocc_rate[group[k-start+1],i,j]);
  28. }
  29. }
  30. ll += poisson_lpmf(
  31. vtc_fp[k,i] | lambda_fp[group[k-start+1],i] * clip_duration[k]
  32. );
  33. }
  34. }
  35. return ll;
  36. }
  37. real model_lpmf(array[] int children,
  38. int start, int end,
  39. int n_recs,
  40. int n_classes,
  41. real duration,
  42. array [,] int vocs,
  43. matrix truth_vocs,
  44. array [] matrix actual_confusion,
  45. array [] vector actual_fp_rate,
  46. array[] matrix actual_coocc
  47. ) {
  48. real ll = 0;
  49. vector [4] expect;
  50. vector [4] sd;
  51. vector [4] tmp;
  52. for (k in start:end) {
  53. expect = rep_vector(0, 4);
  54. sd = rep_vector(0, 4);
  55. for (i in 1:n_classes) {
  56. expect[i] = dot_product(truth_vocs[k,:], actual_confusion[k,:,i]);
  57. expect[i] += actual_fp_rate[k,i] * duration;
  58. sd[i] = expect[i];
  59. tmp = actual_coocc[k,i,:]'; //'
  60. tmp[i] = 0;
  61. expect[i] -= 0.5*dot_product(tmp, actual_confusion[k,:,i]')*truth_vocs[k,i]*actual_confusion[k,i,i];
  62. sd[i] += 0.5*dot_product(tmp, actual_confusion[k,:,i]')*truth_vocs[k,i]*actual_confusion[k,i,i];
  63. }
  64. ll += normal_lpdf(vocs[k,:] | expect, sqrt(sd));
  65. }
  66. return ll;
  67. }
  68. }
  69. // TODO
  70. // use speech rates to set priors on truth_vocs
  71. data {
  72. int<lower=1> n_classes; // number of classes
  73. // analysis data block
  74. int<lower=1> n_recs;
  75. int<lower=1> n_children;
  76. array[n_recs] int<lower=1> children;
  77. array[n_recs] real<lower=1> age;
  78. array[n_recs, n_classes] int<lower=0> vocs;
  79. array[n_children] int<lower=1> corpus;
  80. real<lower=0> recs_duration;
  81. // speaker confusion data block
  82. int<lower=1> n_clips; // number of clips
  83. int<lower=1> n_groups; // number of groups
  84. int<lower=1> n_corpora;
  85. array [n_clips] int group;
  86. array [n_clips] int conf_corpus;
  87. array [n_clips,n_classes,n_classes] int<lower=0> vtc; // vtc vocs attributed to specific speakers
  88. array [n_clips,n_classes] int<lower=0> vtc_fp; // vtc vocs attributed to non-speech
  89. array [n_clips,n_classes] int<lower=0> truth;
  90. array [n_clips,n_classes] int<lower=0> truth_total;
  91. array [n_clips] real<lower=0> clip_duration;
  92. array [n_clips,n_classes,n_classes] int<lower=0> coocc;
  93. int<lower=1> n_validation;
  94. // actual speech rates
  95. int<lower=1> n_rates;
  96. array [n_rates,n_classes] int<lower=0> speech_rates;
  97. array [n_rates] int group_corpus;
  98. array [n_rates] real<lower=0> durations;
  99. }
  100. parameters {
  101. matrix<lower=0> [n_recs, n_classes] truth_vocs;
  102. //array [n_children] matrix<lower=0>[n_classes,n_classes] actual_confusion_baseline;
  103. array [n_recs] matrix<lower=0>[n_classes,n_classes] actual_confusion_baseline;
  104. array [n_recs] matrix<lower=0,upper=1>[n_classes,n_classes] actual_coocc;
  105. array [n_recs] vector<lower=0>[n_classes] actual_fp_rate;
  106. // confusion parameters
  107. matrix<lower=1>[n_classes,n_classes] alphas;
  108. matrix<lower=0>[n_classes,n_classes] mus;
  109. array [n_groups] matrix<lower=0>[n_classes,n_classes] lambda;
  110. array [n_clips] matrix[n_classes,n_classes] delta;
  111. real<lower=0> sigma;
  112. matrix<lower=1>[n_classes,n_classes] etas_coocc;
  113. matrix<lower=0,upper=1>[n_classes,n_classes] mus_coocc;
  114. array [n_groups] matrix<lower=0,upper=1>[n_classes,n_classes] coocc_rate;
  115. vector<lower=1>[n_classes] alphas_fp;
  116. vector<lower=0>[n_classes] mus_fp;
  117. array [n_groups] vector<lower=0>[n_classes] lambda_fp;
  118. //array [n_corpora] matrix[n_classes,n_classes] corpus_bias;
  119. //matrix<lower=0>[n_classes,n_classes] corpus_sigma;
  120. // speech rates
  121. matrix<lower=1>[n_classes,n_corpora] speech_rate_alpha;
  122. matrix<lower=0>[n_classes,n_corpora] speech_rate_mu;
  123. matrix<lower=0> [n_classes,n_rates] speech_rate;
  124. }
  125. transformed parameters {
  126. // array [n_children] matrix<lower=0,upper=1>[n_classes,n_classes] actual_confusion;
  127. // for (c in 1:n_children) {
  128. // actual_confusion[c] = inv_logit(logit(actual_confusion_baseline[c])+corpus_bias[corpus[c]]);
  129. // }
  130. }
  131. model {
  132. //actual model
  133. target += reduce_sum(
  134. model_lpmf, children, 1,
  135. n_recs, n_classes, recs_duration,
  136. vocs,
  137. truth_vocs, actual_confusion_baseline, actual_fp_rate, actual_coocc
  138. );
  139. for (k in 1:n_recs) {
  140. for (i in 1:n_classes) {
  141. actual_confusion_baseline[k,i] ~ gamma(alphas[i,:], alphas[i,:]./mus[i,:]);
  142. actual_coocc[k,i,:] ~ beta_proportion(mus_coocc[i,:], etas_coocc[i,:]);
  143. }
  144. actual_fp_rate[k] ~ gamma(alphas_fp, alphas_fp./mus_fp);
  145. }
  146. for (k in 1:n_recs) {
  147. truth_vocs[k,:] ~ gamma(
  148. speech_rate_alpha[:,corpus[children[k]]],
  149. (speech_rate_alpha[:,corpus[children[k]]]./speech_rate_mu[:,corpus[children[k]]])/1000/recs_duration
  150. );
  151. }
  152. target += reduce_sum(
  153. confusion_model_lpmf, group, n_clips%/%40,
  154. n_classes,
  155. vtc, vtc_fp, truth, coocc, truth_total, clip_duration,
  156. lambda, delta, lambda_fp, coocc_rate
  157. );
  158. mus_fp ~ exponential(1);
  159. alphas_fp ~ normal(1, 1);
  160. for (i in 1:n_classes) {
  161. lambda_fp[:,i] ~ gamma(alphas_fp[i], alphas_fp[i]/mus_fp[i]);
  162. for (j in 1:n_classes) {
  163. mus[i,j] ~ exponential(2);
  164. alphas[i,j] ~ normal(1,1);
  165. for (c in 1:n_groups) {
  166. lambda[c,i,j] ~ gamma(alphas[i,j], alphas[i,j]/mus[i,j]);
  167. coocc_rate[c,i,j] ~ beta_proportion(mus_coocc[i,j], etas_coocc[i,j]);
  168. }
  169. delta[:,i,j] ~ normal(0,sigma);
  170. mus_coocc[i,j] ~ uniform(0, 1);
  171. etas_coocc[i,j] ~ pareto(1, 1.5);
  172. }
  173. }
  174. sigma ~ normal(0, 0.1);
  175. // for (i in 1:n_classes) {
  176. // for (j in 1:n_classes) {
  177. // for (c in 1:n_corpora) {
  178. // corpus_bias[c,j,i] ~ normal(0, corpus_sigma[j,i]);
  179. // }
  180. // corpus_sigma[j,i] ~ normal(0, 1);
  181. // }
  182. // }
  183. // speech rates
  184. for (i in 1:n_classes) {
  185. speech_rate_alpha[i,:] ~ normal(1, 1);
  186. speech_rate_mu[i,:] ~ exponential(2);
  187. }
  188. for (g in 1:n_rates) {
  189. for (i in 1:n_classes) {
  190. speech_rate[i,g] ~ gamma(
  191. speech_rate_alpha[i,group_corpus[g]],
  192. (speech_rate_alpha[i,group_corpus[g]]/speech_rate_mu[i,group_corpus[g]])/1000
  193. );
  194. speech_rates[g,i] ~ poisson(speech_rate[i,g]*durations[g]);
  195. }
  196. }
  197. }