// LAAC-LSCP / speaker-confusion-model
							functions {
    /**
     * Partial log-likelihood of the algorithm's per-clip counts given the
     * reference ("truth") counts; shaped for use as a reduce_sum partial sum.
     *
     * For each clip k in start:end and each column i in 1:n_classes, the
     * observed count algo[k,i] is modelled as the sum of four latent counts,
     * one per truth column j = 1..4 (loop variables chi, och, fem; the fourth
     * is the remainder). Latent count j is negative binomial with mean
     * truth[k,j] * lambda[g,j,i] and variance omega[j,i] times that mean, and
     * is forced to 0 when truth[k,j] == 0. The latent counts are summed out
     * by enumerating every split of algo[k,i] into the four parts.
     *
     * When no split exists (all four truth counts are 0 while algo[k,i] > 0)
     * the pair (k,i) adds nothing to the result.
     *
     * @param group          slice of group indices for clips start:end
     *                       (element k-start+1 belongs to clip k)
     * @param start          first clip index of the slice (global, 1-based)
     * @param end            last clip index of the slice
     * @param n_classes      number of columns of algo to iterate over
     * @param algo           per-clip, per-class counts produced by the algorithm
     * @param truth          per-clip reference counts; columns 1..4 are read
     * @param age            unused
     * @param clip_duration  unused (only needed by the disabled false-positive term)
     * @param lambda         per-group matrices; lambda[g,j,i] scales truth[k,j]
     * @param omega          variance-to-mean ratios; entries must be > 1
     * @return sum over the slice of the log marginal probabilities
     */
    real confusion_model_lpmf(array[] int group,
        int start, int end,
        int n_classes,
        array[,] int algo,
        array[,] int truth,
        array[] real age,
        array[] real clip_duration,
        array[] matrix lambda,
        matrix omega//,
        //array[] vector lambda_fp,
    ) {
        real ll = 0;
        vector [4] bp;

        for (k in start:end) {
            int g = group[k-start+1];

            for (i in 1:n_classes) {
                int total = algo[k,i];

                // A split is fixed by (chi, och, fem) with chi+och+fem <= total,
                // so there are at most choose(total+3, 3) of them. Sizing the
                // buffer this way removes the former hard limit of 8192 entries.
                // (choose() raises an error if the result overflows an int.)
                int capacity = choose(total + 3, 3);
                vector[capacity] log_contrib_comb;
                int n = 1; // next free slot in log_contrib_comb

                // Each loop only runs up to what is still unassigned: larger
                // values could never add up to algo[k,i].
                for (chi in 0:(truth[k,1]>0?total:0)) {
                    int left1 = total - chi;
                    bp[1] = truth[k,1]==0?0:neg_binomial_lpmf(chi | truth[k,1]*lambda[g,1,i]/(omega[1,i]-1), 1/(omega[1,i]-1));

                    for (och in 0:(truth[k,2]>0?left1:0)) {
                        int left2 = left1 - och;
                        bp[2] = truth[k,2]==0?0:neg_binomial_lpmf(och | truth[k,2]*lambda[g,2,i]/(omega[2,i]-1), 1/(omega[2,i]-1));

                        for (fem in 0:(truth[k,3]>0?left2:0)) {
                            int left3 = left2 - fem;
                            bp[3] = truth[k,3]==0?0:neg_binomial_lpmf(fem | truth[k,3]*lambda[g,3,i]/(omega[3,i]-1), 1/(omega[3,i]-1));

                            // The fourth count must equal the remainder. With
                            // no truth in column 4 it can only be 0, so the
                            // split is valid only if nothing is left over.
                            // NOTE: re-enabling a false-positive term
                            // (lambda_fp) requires looping over the fourth
                            // count again, since the remainder may then be > 0.
                            if (truth[k,4] > 0 || left3 == 0) {
                                bp[4] = truth[k,4]==0?0:neg_binomial_lpmf(left3 | truth[k,4]*lambda[g,4,i]/(omega[4,i]-1), 1/(omega[4,i]-1));
                                log_contrib_comb[n] = sum(bp);
                                n += 1;
                            }
                        }
                    }
                }

                if (n > 1) {
                    ll += log_sum_exp(log_contrib_comb[1:(n-1)]);
                }
            }
        }
        return ll;
    }

    /**
     * Partial log-density linking the algorithm's per-recording counts to the
     * latent true counts; shaped for use as a reduce_sum partial sum.
     *
     * For each recording k in start:end:
     *  - every row i of actual_confusion[k] receives a gamma density with
     *    shape alphas[i,:] and rate alphas[i,:] ./ mus[i,:] (i.e. mean mus[i,:]);
     *  - vocs[k,:] receives a normal density whose mean for class i is
     *    truth_vocs[k,:] . actual_confusion[k,:,i] and whose variance is the
     *    same dot product with each term multiplied by omega[:,i].
     *
     * @param children          slice argument required by reduce_sum; not read here
     * @param start             first recording index of the slice (global, 1-based)
     * @param end               last recording index of the slice
     * @param n_recs            unused
     * @param n_classes         number of classes (rows/columns of the matrices)
     * @param duration          unused (only needed by the disabled false-positive term)
     * @param vocs              per-recording, per-class counts from the algorithm
     * @param age               unused
     * @param truth_vocs        latent true counts, one row per recording
     * @param actual_confusion  one confusion matrix per recording
     * @param mus               gamma means for the confusion entries
     * @param alphas            gamma shapes for the confusion entries
     * @param omega             variance-to-mean ratios of the confusion counts
     * @return accumulated log density for the slice
     */
    real inverse_model_lpmf(array[] int children,
        int start, int end,
        int n_recs,
        int n_classes,
        real duration,
        array [,] int vocs,
        array [] real age,
        matrix truth_vocs,
        array [] matrix actual_confusion,
        //array [] vector actual_fp_rate,
        matrix mus,
        matrix alphas,
        matrix omega//,
        //vector mus_fp,
        //vector alphas_fp
        ) {
            real ll = 0;

            for (k in start:end) {
                // Sized by n_classes rather than a literal 4 so the function
                // follows whatever number of classes the caller uses.
                vector[n_classes] expect;
                vector[n_classes] obs_var;

                for (i in 1:n_classes) {
                    ll += gamma_lpdf(actual_confusion[k,i] | alphas[i,:], alphas[i,:]./mus[i,:]);
                    //ll += gamma_lpdf(actual_fp_rate[k] | alphas_fp, alphas_fp./mus_fp);

                    expect[i] = dot_product(truth_vocs[k,:], actual_confusion[k,:,i]);
                    // this is a variance; the square root is taken below
                    obs_var[i] = dot_product(truth_vocs[k,:], actual_confusion[k,:,i].*omega[:,i]);
                    //expect[i] += actual_fp_rate[k,i] * duration;
                }

                ll += normal_lpdf(vocs[k,:] | expect, sqrt(obs_var));
            }

            return ll;
        }

    /**
     * Partial log-density of the prior on the latent true counts of each
     * recording; shaped for use as a reduce_sum partial sum.
     *
     * Counts are first converted with truth_vocs / 1000 / recs_duration.
     * Column 1 gets a gamma density with shape alpha_child_level[1] and a
     * mean that depends on age, on the child's coefficient child_dev_age and,
     * through beta_dev, on how far the child's columns 2 and 3 of
     * mu_child_level exceed mu_pop_level[3] + mu_pop_level[4].
     * Columns 2.. get gamma densities with shapes alpha_child_level[2:] and
     * means given by the child's row of mu_child_level.
     *
     * @param children           slice of child indices for recordings start:end
     * @param start              first recording index of the slice (global, 1-based)
     * @param end                last recording index of the slice
     * @param n_recs             unused
     * @param n_classes          unused
     * @param recs_duration      divisor applied to the counts
     * @param age                age attached to each recording
     * @param truth_vocs         latent true counts, one row per recording
     * @param mu_pop_level       population-level means
     * @param mu_child_level     child-level means for columns 2.. of truth_vocs
     * @param alpha_child_level  gamma shapes, one per column of truth_vocs
     * @param child_dev_age      per-child coefficient on age
     * @param beta_dev           coefficient on the child's excess over mu_pop_level[3:4]
     * @return accumulated log density for the slice
     */
    real recs_priors_lpmf(array[] int children,
        int start, int end,
        int n_recs,
        int n_classes,
        real recs_duration,
        array [] real age,
        matrix truth_vocs,
        vector mu_pop_level,
        matrix mu_child_level,
        vector alpha_child_level,
        vector child_dev_age,
        real beta_dev
        ) {
            real total = 0;

            for (rec in start:end) {
                // position within the slice -> child index
                int kid = children[rec - start + 1];

                // counts converted to the scale of the rate parameters
                row_vector[cols(truth_vocs)] rates = truth_vocs[rec, :] / 1000 / recs_duration;

                // the two terms inside the exponential of the column-1 mean
                real own_trend = child_dev_age[kid] * age[rec] / 12.0 / 10.0;
                real excess = mu_child_level[kid, 2] + mu_child_level[kid, 3]
                              - mu_pop_level[3] - mu_pop_level[4];
                real input_trend = beta_dev * excess * age[rec] / 12.0 / 10.0;
                real chi_mu = mu_pop_level[1] * exp(own_trend + input_trend);

                total += gamma_lpdf(
                    rates[1] | alpha_child_level[1], alpha_child_level[1] / chi_mu
                );
                total += gamma_lpdf(
                    rates[2:] | alpha_child_level[2:],
                    alpha_child_level[2:] ./ mu_child_level[kid, :]'
                );
            }

            return total;
        }
}

// TODO
// use speech rates to set priors on truth_vocs
// Input data. Every array that is later used as an index carries explicit
// lower/upper bounds so that inconsistent input is rejected when the data are
// read, with the offending variable named, instead of failing later with an
// index error.
data {
    int<lower=1> n_classes; // number of classes

    // analysis data block
    int<lower=1> n_recs;     // number of recordings
    int<lower=1> n_children; // number of distinct children among the recordings
    int<lower=1> n_corpora;  // number of corpora (declared here so `corpus` can be bounded)

    array[n_recs] int<lower=1,upper=n_children> children; // child index of each recording
    array[n_recs] real<lower=1> age; // age at each recording (presumably in months: the model divides by 12 - confirm)
    array[n_recs] int<lower=-1> siblings; // -1: unknown (marginalised in the model), 0: none, >0: has siblings
    array[n_recs, n_classes] int<lower=0> vocs; // algorithm counts per recording and class
    array[n_children] int<lower=1,upper=n_corpora> corpus; // corpus index of each child

    real<lower=0> recs_duration; // divisor applied to truth_vocs (after /1000) to obtain rates

    // speaker confusion data block
    int<lower=1> n_clips;   // number of clips
    int<lower=1> n_groups; // number of groups
    array [n_clips] int<lower=1,upper=n_groups> group; // group index of each clip (selects lambda[group])
    array [n_clips] int conf_corpus; // not referenced by the model
    array [n_clips,n_classes] int<lower=0> algo_total; // algo vocs attributed to specific speakers
    array [n_clips,n_classes] int<lower=0> truth_total; // reference counts for the same clips
    array [n_clips] real<lower=0> clip_duration;
    array [n_clips] real<lower=0> clip_age;

    int<lower=0> n_validation; // not referenced by the model

    // actual speech rates
    int<lower=1> n_rates;
    int<lower=1> n_speech_rate_children;

    array [n_rates,n_classes] int<lower=0> speech_rates; // observed counts, Poisson-distributed in the model
    array [n_rates] int<lower=1,upper=n_corpora> group_corpus; // corpus index of each speech-rate sample
    array [n_rates] real<lower=0> durations; // exposure of each sample (multiplied by 1000 in the Poisson mean)
    array [n_rates] real<lower=0> speech_rate_age;
    array [n_rates] int<lower=-1> speech_rate_siblings; // same coding as `siblings`
    array [n_rates] int<lower=1,upper=n_speech_rate_children> speech_rate_child;

    // parallel processing
    int<lower=1> threads; // used to derive the reduce_sum grainsize for the confusion model
}

// Per-group, per-child and per-speech-rate-child lookups derived from the
// row-level inputs, plus counts of children with and without siblings.
transformed data {
    vector<lower=0>[n_groups] recording_age;
    array[n_speech_rate_children] int<lower=1> speech_rate_child_corpus;

    array[n_children] int<lower=-1> child_siblings;
    array[n_speech_rate_children] int<lower=-1> speech_rate_child_siblings;
    int no_siblings = 0;
    int has_siblings = 0;

    // age of each group, taken from its clips (the last clip of a group wins)
    for (clip in 1:n_clips) {
        recording_age[group[clip]] = clip_age[clip];
    }

    // corpus and sibling status of every child in the speech-rate data
    // (the last sample of a child wins)
    for (r in 1:n_rates) {
        int kid = speech_rate_child[r];
        speech_rate_child_corpus[kid] = group_corpus[r];
        speech_rate_child_siblings[kid] = speech_rate_siblings[r];
    }

    // sibling status of every child in the recordings (the last recording wins)
    for (rec in 1:n_recs) {
        child_siblings[children[rec]] = siblings[rec];
    }

    // children with a negative (unknown) status are counted in neither total
    for (kid in 1:n_children) {
        no_siblings += (child_siblings[kid] == 0);
        has_siblings += (child_siblings[kid] > 0);
    }
}

// Sampled quantities. The order of the declarations is left untouched because
// it determines the order of the columns in the sampler output.
parameters {
    // child-level mean rates for classes 2..n_classes (class 1 is modelled
    // through mu_pop_level[1] and the age terms instead)
    matrix<lower=0>[n_children,n_classes-1] mu_child_level;
    // per-child coefficient on age in the expected class-1 rate of the recordings
    vector [n_children] child_dev_age;
    // latent true counts per recording and class; divided by 1000 and by
    // recs_duration before being compared with the rate parameters
    matrix<lower=0> [n_recs, n_classes] truth_vocs;

    // nuisance parameters
    // one confusion matrix per recording, with gamma(alphas, alphas ./ mus) rows
    array [n_recs] matrix<lower=0>[n_classes,n_classes] actual_confusion_baseline;
    //array [n_recs] vector<lower=0>[n_classes] actual_fp_rate;

    // confusion parameters
    // confusion matrix
    // alphas: gamma shapes; mus: gamma means of the confusion rates
    matrix<lower=0>[n_classes,n_classes] alphas;
    matrix<lower=0>[n_classes,n_classes] mus;
    // enters the model only through omega = exp(conf_sd/10), the
    // variance-to-mean ratio of the confusion counts
    matrix<lower=0>[n_classes,n_classes] conf_sd;
    // confusion rates of each group of annotated clips
    array [n_groups] matrix<lower=0>[n_classes,n_classes] lambda;

    // false positives
    //vector<lower=0>[n_classes] alphas_fp;
    //vector<lower=0>[n_classes] mus_fp;
    //array [n_groups] vector<lower=0>[n_classes] lambda_fp;

    // speech rates
    vector<lower=0>[n_classes] alpha_child_level; // gamma shape of the rates across recordings of a given child
    array[2] matrix<lower=0>[n_classes-1,n_corpora] alpha_corpus_level; // gamma shape of the child-level means; first index 1 = no siblings, 2 = siblings
    matrix<lower=0>[n_classes-1,n_corpora] mu_corpus_level; // corpus-level mean of the child-level means
    vector<lower=0>[n_classes-1] alpha_pop_level; // gamma shape of mu_corpus_level across corpora
    vector<lower=0>[n_classes] mu_pop_level; // population level averages
    vector<lower=0>[n_classes-1] alpha_pop; // alpha_corpus_level ~ gamma(4, 4/alpha_pop), i.e. its prior mean
    matrix<lower=0>[n_classes,n_rates] speech_rate; // truth speech rates observed in annotated clips
    matrix<lower=0>[n_speech_rate_children,n_classes-1] speech_rate_child_level; // expected speech rate at the child-level

    // siblings
    real beta_sib_och; // effect of having siblings on OCH speech
    real beta_sib_adu; // effect of having siblings on ADU speech
    real<lower=0,upper=1> p_sib; // prob of having siblings

    // per-child coefficient on age for the children of the speech-rate data
    vector [n_speech_rate_children] child_dev_speech_age;

    // average effect of age
    // mean and standard deviation of child_dev_age and child_dev_speech_age
    real alpha_dev;
    real<lower=0> sigma_dev;

    // effect of excess ADU input
    real beta_dev;
}

// Joint model: (1) algorithm counts per recording given latent true counts,
// (2) hierarchical priors on those true counts, (3) the confusion model fitted
// on annotated clips, and (4) speech rates observed in annotated samples.
model {
    // Variance-to-mean ratio of the confusion counts.
    // conf_sd is constrained to be >= 0, so every entry of omega is >= 1.
    matrix[n_classes,n_classes] omega = exp(conf_sd/10);

    // inverse confusion model
    target += reduce_sum(
       inverse_model_lpmf, children, 1,
       n_recs, n_classes, recs_duration,
       vocs, age,
       truth_vocs, actual_confusion_baseline, mus, alphas, omega//, mus_fp, alphas_fp
    );

    // priors on actual speech
    target += reduce_sum(
        recs_priors_lpmf, children, 1,
        n_recs, n_classes, recs_duration, age,
        truth_vocs,
        mu_pop_level, mu_child_level, alpha_child_level,
        child_dev_age, beta_dev
    );

    // Child-level means. alpha_corpus_level[1] applies to children without
    // siblings, alpha_corpus_level[2] to children with siblings; the sibling
    // effects multiply the corpus-level mean by exp(beta_sib_*).
    vector [2] ll;
    int distrib;
    for (c in 1:n_children) {
        // if there is sibling data
        if (child_siblings[c]>=0) {
            distrib = child_siblings[c]>0?2:1;

            mu_child_level[c,1] ~ gamma(
                alpha_corpus_level[distrib,1,corpus[c]],
                (alpha_corpus_level[distrib,1,corpus[c]]/(mu_corpus_level[1,corpus[c]]*exp(
                   child_siblings[c]>0?beta_sib_och:0
                )))
            );
            // The divisor is parenthesised: `a ./ m * exp(b)` parses as
            // `(a ./ m) * exp(b)`, which would give mean m*exp(-b) and flip
            // the sign of beta_sib_adu relative to every other use of it.
            mu_child_level[c,2:] ~ gamma(
                alpha_corpus_level[distrib,2:,corpus[c]],
                (alpha_corpus_level[distrib,2:,corpus[c]]./(mu_corpus_level[2:,corpus[c]]*exp(
                   child_siblings[c]>0?beta_sib_adu:0
                )))
            );
        }
        // otherwise: sibling status unknown, marginalise over both cases
        else {
            // assuming the child has siblings (probability p_sib)
            ll[1] = log(p_sib)+gamma_lpdf(
                mu_child_level[c,1] | alpha_corpus_level[2,1,corpus[c]], alpha_corpus_level[2,1,corpus[c]]/(mu_corpus_level[1,corpus[c]]*exp(beta_sib_och))
            );
            ll[1] += gamma_lpdf(
                mu_child_level[c,2] | alpha_corpus_level[2,2,corpus[c]], alpha_corpus_level[2,2,corpus[c]]/(mu_corpus_level[2,corpus[c]]*exp(beta_sib_adu))
            );
            ll[1] += gamma_lpdf(
                mu_child_level[c,3] | alpha_corpus_level[2,3,corpus[c]], alpha_corpus_level[2,3,corpus[c]]/(mu_corpus_level[3,corpus[c]]*exp(beta_sib_adu))
            );

            // assuming the child has no siblings (probability 1 - p_sib)
            ll[2] = log1m(p_sib)+gamma_lpdf(
                mu_child_level[c,1] | alpha_corpus_level[1,1,corpus[c]], alpha_corpus_level[1,1,corpus[c]]/(mu_corpus_level[1,corpus[c]])
            );
            ll[2] += gamma_lpdf(
                mu_child_level[c,2] | alpha_corpus_level[1,2,corpus[c]], alpha_corpus_level[1,2,corpus[c]]/(mu_corpus_level[2,corpus[c]])
            );
            ll[2] += gamma_lpdf(
                mu_child_level[c,3] | alpha_corpus_level[1,3,corpus[c]], alpha_corpus_level[1,3,corpus[c]]/(mu_corpus_level[3,corpus[c]])
            );
            target += log_sum_exp(ll);
        }
    }

    alpha_child_level ~ gamma(2,1);

    // Confusion model on the annotated clips.
    // The integer division can give 0 when n_clips < threads*4, hence max(1, ...)
    // to keep the grainsize positive.
    // clip_age / clip_duration are passed in the order of the function's
    // (age, clip_duration) parameters.
    target += reduce_sum(
        confusion_model_lpmf, group, max(1, n_clips%/%(threads*4)),
        n_classes,
        algo_total, truth_total, clip_age, clip_duration,
        lambda, omega//, lambda_fp
    );

    //mus_fp ~ exponential(1);
    //alphas_fp ~ gamma(2, 1);

    for (i in 1:n_classes) {
        //lambda_fp[:,i] ~ gamma(alphas_fp[i], alphas_fp[i]/mus_fp[i]);
        conf_sd[i,:] ~ normal(0, 1);
        for (j in 1:n_classes) {
            // prior mean 1/2 on the diagonal, 1/8 off the diagonal
            mus[i,j] ~ exponential(i==j?2:8);
            alphas[i,j] ~ gamma(2,1);
            // mus[i,j] ~ exponential(1);
            // alphas[i,j] ~ exponential(1);
            for (c in 1:n_groups) {
                lambda[c,i,j] ~ gamma(alphas[i,j], alphas[i,j]/mus[i,j]);
            }
        }
    }

    // speech rates
    mu_pop_level ~ exponential(4); // 250 vocs/hour
    alpha_pop_level ~ gamma(8, 4); // sd = 0.35 x \mu
    alpha_pop ~ gamma(10, 10);
    for (i in 1:n_classes-1) {
        alpha_corpus_level[1,i,:] ~ gamma(4, 4/alpha_pop[i]);
        alpha_corpus_level[2,i,:] ~ gamma(4, 4/alpha_pop[i]);
        mu_corpus_level[i,:] ~ gamma(alpha_pop_level[i],alpha_pop_level[i]/mu_pop_level[i+1]);
    }

    // Annotated speech-rate samples: same structure as the prior on the
    // recordings' true counts, with a Poisson observation model on top.
    for (g in 1:n_rates) {
        real chi_mu = mu_pop_level[1]*exp(
            child_dev_speech_age[speech_rate_child[g]]*speech_rate_age[g]/12.0/10.0 + beta_dev*(speech_rate_child_level[speech_rate_child[g],2]+speech_rate_child_level[speech_rate_child[g],3]-mu_pop_level[3]-mu_pop_level[4])*speech_rate_age[g]/12.0/10.0
        );
        speech_rate[1,g] ~ gamma(
            alpha_child_level[1],
            alpha_child_level[1]/chi_mu
        );

        speech_rate[2:,g] ~ gamma(
            alpha_child_level[2:],
            (alpha_child_level[2:]./(speech_rate_child_level[speech_rate_child[g],:]')) //'
        );
        speech_rates[g,:] ~ poisson(speech_rate[:,g]*durations[g]*1000);
    }

    for (c in 1:n_speech_rate_children) {
        // c indexes the speech-rate children, so the sibling status must come
        // from speech_rate_child_siblings (child_siblings is indexed by the
        // children of the recordings). An unknown status (-1) is treated as
        // "no siblings" here.
        distrib = speech_rate_child_siblings[c]>0?2:1;

        speech_rate_child_level[c,1] ~ gamma(
            alpha_corpus_level[distrib,1,speech_rate_child_corpus[c]],
            (alpha_corpus_level[distrib,1,speech_rate_child_corpus[c]]/(mu_corpus_level[1,speech_rate_child_corpus[c]]*exp(
                speech_rate_child_siblings[c]>0?beta_sib_och:0
            )))
        );

        speech_rate_child_level[c,2:] ~ gamma(
            alpha_corpus_level[distrib,2:,speech_rate_child_corpus[c]],
            (alpha_corpus_level[distrib,2:,speech_rate_child_corpus[c]]./(mu_corpus_level[2:,speech_rate_child_corpus[c]]*exp(
                speech_rate_child_siblings[c]>0?beta_sib_adu:0
            )))
        );
    }

    child_dev_age ~ normal(alpha_dev, sigma_dev);
    child_dev_speech_age ~ normal(alpha_dev, sigma_dev);

    // p_sib is informed by the children whose sibling status is known
    has_siblings ~ binomial(has_siblings+no_siblings, p_sib);
    p_sib ~ uniform(0, 1);
    beta_sib_och ~ normal(0, 1);
    beta_sib_adu ~ normal(0, 1);

    alpha_dev ~ normal(0, 1);
    sigma_dev ~ exponential(1);

    beta_dev ~ normal(0, 1);
}