// LAAC-LSCP / speaker-confusion-model
							functions {
    /**
     * Partial log-likelihood of the per-clip detection counts, written as a
     * reduce_sum partial-sum function.
     *
     * For clip k and detected class i the generative model is
     *   vtc[k,i] = sum_j n_j + n_fp,
     *   n_j  ~ Poisson(truth[k,j] * lambda[g,j,i])   for speakers j = 1..4,
     *   n_fp ~ Poisson(lambda_fp[g,i] * clip_duration[k]),
     * with all counts independent. A sum of independent Poisson variables is
     * Poisson with the summed rate, so the marginal likelihood of vtc[k,i] is
     * a single Poisson term. Evaluating it directly gives the same value as
     * enumerating every (n_1, n_2, n_3, n_4, n_fp) split and combining the
     * terms with log_sum_exp, but without the O(vtc^4) cost and without a
     * fixed-size scratch buffer that large counts can overrun.
     *
     * @param group          slice of the group index array handed over by
     *                       reduce_sum; element 1 corresponds to clip `start`
     * @param start          first clip index covered by this slice
     * @param end            last clip index covered by this slice
     * @param n_classes      number of detected classes to loop over
     * @param vtc            detected counts, indexed [clip, class]
     * @param truth          reference counts, indexed [clip, speaker]
     * @param clip_duration  duration of each clip
     * @param lambda         per-group confusion rates, indexed [group][speaker, class]
     * @param lambda_fp      per-group false-positive rates, indexed [group][class]
     * @return log-likelihood summed over clips start..end and all classes
     */
    real confusion_model_lpmf(array[] int group,
        int start, int end,
        int n_classes,
        array[,] int vtc,
        array[,] int truth,
        array[] real clip_duration,
        array[] matrix lambda,
        array[] vector lambda_fp
    ) {
        real ll = 0;

        for (k in start:end) {
            // `group` is the sliced argument, so it is indexed relative to `start`;
            // every other argument is the full array and is indexed by k.
            int g = group[k-start+1];

            for (i in 1:n_classes) {
                // false-alarm contribution
                real rate = lambda_fp[g,i]*clip_duration[k];

                // The model considers exactly four source speakers.
                for (j in 1:4) {
                    // A speaker with no reference vocalizations contributes
                    // nothing; skip it so its lambda entry is never touched.
                    if (truth[k,j] > 0) {
                        rate += truth[k,j]*lambda[g,j,i];
                    }
                }

                ll += poisson_lpmf(vtc[k,i] | rate);
            }
        }
        return ll;
    }

    /**
     * Partial log-likelihood of the per-recording detected counts, written as
     * a reduce_sum partial-sum function.
     *
     * For recording k and class i the expected count is
     *   truth_vocs[k,:] . actual_confusion[k][:,i] + actual_fp_rate[k][i] * duration
     * and the observed counts are scored with a normal density whose standard
     * deviation is the square root of that expectation.
     *
     * @param children          sliced argument supplied by reduce_sum (not read here)
     * @param start             first recording index covered by this slice
     * @param end               last recording index covered by this slice
     * @param n_recs            total number of recordings (not read here)
     * @param n_classes         number of classes
     * @param duration          duration multiplying the false-positive rates
     * @param vocs              observed counts, indexed [recording, class]
     * @param truth_vocs        latent true counts, one row per recording
     * @param actual_confusion  per-recording confusion rates, [recording][speaker, class]
     * @param actual_fp_rate    per-recording false-positive rates, [recording][class]
     * @return log-density summed over recordings start..end
     */
    real model_lpmf(array[] int children,
        int start, int end,
        int n_recs,
        int n_classes,
        real duration,
        array [,] int vocs,
        matrix truth_vocs,
        array [] matrix actual_confusion,
        array [] vector actual_fp_rate
        ) {
        real ll = 0;

        // Sized by n_classes (rather than a literal 4) so that it always
        // matches the loop below and the length of vocs[k,:].
        vector [n_classes] expect;

        for (k in start:end) {
            for (i in 1:n_classes) {
                expect[i] = dot_product(truth_vocs[k,:], actual_confusion[k,:,i]);
                expect[i] += actual_fp_rate[k,i] * duration;
            }

            ll += normal_lpdf(vocs[k,:] | expect, sqrt(expect));
        }

        return ll;
    }
}

// TODO: use speech rates to set priors on truth_vocs
data {
    int<lower=1> n_classes; // number of classes

    // analysis data block
    int<lower=1> n_recs;     // number of recordings
    int<lower=1> n_children; // number of children
    // Declared here (ahead of `corpus`) so it can bound the corpus indices.
    int<lower=1> n_corpora;  // number of corpora

    // Index arrays carry upper bounds so that a bad index is rejected when the
    // data are read instead of surfacing as an indexing error during sampling.
    array[n_recs] int<lower=1, upper=n_children> children;  // child of each recording; indexes `corpus`
    array[n_recs] real<lower=1> age;                        // not referenced by the model code in this file
    array[n_recs, n_classes] int<lower=0> vocs;             // detected counts per recording and class
    array[n_children] int<lower=1, upper=n_corpora> corpus; // corpus of each child; indexes the speech-rate hyperparameters

    real<lower=0> recs_duration; // divides the gamma rate of truth_vocs in the model block

    // speaker confusion data block
    int<lower=1> n_clips;   // number of clips
    int<lower=1> n_groups; // number of groups
    array [n_clips] int<lower=1, upper=n_groups> group; // group of each clip; indexes lambda and lambda_fp
    array [n_clips] int conf_corpus;                    // not referenced by the model code in this file
    array [n_clips,n_classes] int<lower=0> vtc_total; // vtc vocs attributed to specific speakers
    array [n_clips,n_classes] int<lower=0> truth_total; // reference counts per clip and speaker
    array [n_clips] real<lower=0> clip_duration;        // multiplies the false-positive rate of each clip

    int<lower=1> n_validation; // not referenced by the model code in this file

    // actual speech rates
    int<lower=1> n_rates;
    array [n_rates,n_classes] int<lower=0> speech_rates;        // observed counts, modelled as Poisson(speech_rate * durations)
    array [n_rates] int<lower=1, upper=n_corpora> group_corpus; // corpus of each rate observation
    array [n_rates] real<lower=0> durations;                    // exposure of each rate observation

    // parallel processing
    int<lower=1> threads; // used to derive the reduce_sum grainsize
}

parameters {
    // Latent true counts per recording and class; given a gamma prior in the
    // model block using the speech-rate hyperparameters of the child's corpus.
    matrix<lower=0> [n_recs, n_classes] truth_vocs;
    // Per-recording log confusion rates, flattened column-major
    // (entry i + n_classes*(j-1) maps to element [i,j] in transformed parameters).
    array [n_recs] vector[n_classes*n_classes] log_actual_confusion;
    // Per-recording false-positive rates, one per class.
    array [n_recs] vector<lower=0>[n_classes] actual_fp_rate;

    // confusion parameters
    // Mean of the multivariate normal over log confusion vectors.
    vector[n_classes*n_classes] mus;
    // Cholesky factor of the correlation matrix of the log confusion vectors.
    cholesky_factor_corr[n_classes*n_classes] L_Omega;
    // Scales combined with L_Omega via diag_pre_multiply in the model block.
    vector<lower=0>[n_classes*n_classes] L_sigma;
    // Per-group log confusion rates, same flattened layout as log_actual_confusion.
    array [n_groups] vector[n_classes*n_classes] log_lambda;

    // Gamma hyperparameters for the false-positive rates: shape alphas_fp and
    // rate alphas_fp/mus_fp, so mus_fp is the mean of the distribution.
    vector<lower=1>[n_classes] alphas_fp;
    vector<lower=0>[n_classes] mus_fp;
    // Per-group false-positive rates; multiplied by clip_duration in the likelihood.
    array [n_groups] vector<lower=0>[n_classes] lambda_fp;

    //array [n_corpora] matrix[n_classes,n_classes] corpus_bias;
    //matrix<lower=0>[n_classes,n_classes] corpus_sigma;

    // speech rates
    // Per-class, per-corpus gamma shape and mean-like parameters.
    matrix<lower=1>[n_classes,n_corpora] speech_rate_alpha;
    matrix<lower=0>[n_classes,n_corpora] speech_rate_mu;
    // Latent rate for each speech-rate observation; multiplied by durations
    // to form the Poisson mean of speech_rates.
    matrix<lower=0> [n_classes,n_rates] speech_rate;
}

transformed parameters {
    // Confusion rates on the natural scale, one matrix per group / per recording.
    array [n_groups] matrix<lower=0>[n_classes,n_classes] lambda;
    array [n_recs] matrix<lower=0>[n_classes,n_classes] actual_confusion;

    // to_matrix fills column by column, so flat entry i + n_classes*(j-1)
    // lands at element [i,j]; exp is then applied element-wise.
    for (c in 1:n_groups) {
        lambda[c] = exp(to_matrix(log_lambda[c], n_classes, n_classes));
    }

    for (k in 1:n_recs) {
        actual_confusion[k] = exp(to_matrix(log_actual_confusion[k], n_classes, n_classes));
    }
}

model {
    // Cholesky factor of the covariance shared by every log-confusion vector
    // (per group and per recording).
    matrix[n_classes*n_classes, n_classes*n_classes] L_Sigma = diag_pre_multiply(L_sigma, L_Omega);

    //actual model

    //target += reduce_sum(
    //   model_lpmf, children, 1,
    //   n_recs, n_classes, recs_duration,
    //   vocs,
    //   truth_vocs, actual_confusion, actual_fp_rate
    //);

    // Per-recording confusion and false-positive rates share the
    // hyperparameters of the per-group ones.
    for (k in 1:n_recs) {
        log_actual_confusion[k] ~ multi_normal_cholesky(mus, L_Sigma);
        actual_fp_rate[k] ~ gamma(alphas_fp, alphas_fp./mus_fp);
    }

    // Prior on the latent true counts, informed by the speech-rate
    // hyperparameters of the corpus the recording's child belongs to.
    for (k in 1:n_recs) {
        truth_vocs[k,:] ~ gamma(
            speech_rate_alpha[:,corpus[children[k]]],
            (speech_rate_alpha[:,corpus[children[k]]]./speech_rate_mu[:,corpus[children[k]]])/1000/recs_duration
        );
    }

    // Clip-level confusion likelihood, parallelised over clips.
    // reduce_sum requires a positive grainsize; the integer division yields 0
    // whenever n_clips < 4*threads, so clamp it to at least 1.
    target += reduce_sum(
        confusion_model_lpmf, group, max(1, n_clips%/%(threads*4)),
        n_classes,
        vtc_total, truth_total, clip_duration,
        lambda, lambda_fp
    );

    mus_fp ~ exponential(1);
    alphas_fp ~ normal(1, 1);

    for (i in 1:n_classes) {
        lambda_fp[:,i] ~ gamma(alphas_fp[i], alphas_fp[i]/mus_fp[i]);
    }

    // Exponential(1) prior on the natural-scale means exp(mus). Because the
    // statement is placed on a non-linear transform of the parameter, the log
    // absolute Jacobian, sum(log(exp(mus))) = sum(mus), has to be added for
    // the prior to be a proper density on mus.
    exp(mus) ~ exponential(1);
    target += sum(mus);

    L_Omega ~ lkj_corr_cholesky(2);
    L_sigma ~ exponential(5);
    for (c in 1:n_groups) {
        log_lambda[c] ~ multi_normal_cholesky(mus, L_Sigma);
    }

    // speech rates
    for (i in 1:n_classes) {
        speech_rate_alpha[i,:] ~ normal(1, 1);
        speech_rate_mu[i,:] ~ exponential(2);
    }

    // Each observed speech-rate count is Poisson with a gamma-distributed
    // latent rate scaled by the observation's duration.
    for (g in 1:n_rates) {
        for (i in 1:n_classes) {
            speech_rate[i,g] ~ gamma(
                speech_rate_alpha[i,group_corpus[g]],
                (speech_rate_alpha[i,group_corpus[g]]/speech_rate_mu[i,group_corpus[g]])/1000
            );
            speech_rates[g,i] ~ poisson(speech_rate[i,g]*durations[g]);
        }
    }
}