123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189 |
- #!/usr/bin/env python3
- from ChildProject.projects import ChildProject
- from ChildProject.annotations import AnnotationManager
- from ChildProject.metrics import segments_to_annotation
- import datalad.api
- from os.path import join as opj
- from os.path import basename, exists
- import pandas as pd
- import stan
- import numpy as np
- from scipy.stats import beta
- from matplotlib import pyplot as plt
- import seaborn as sns
def compute_counts(parameters):
    """Count vocalizations per clip and speaker for the VTC and one annotator.

    ``parameters`` is a mapping with the keys ``'annotator'`` (name of the
    annotation set used as reference), ``'corpus'`` (corpus label) and
    ``'path'`` (location of the ChildProject dataset).

    The portions of the recordings covered by both the ``'vtc'`` set and the
    annotator's set are cut into clips of at most 15000 time units
    (presumably milliseconds -- confirm against the ChildProject
    conventions), the matching annotation files are fetched with datalad,
    and segments are counted per set, child, recording, clip and speaker.

    Returns a DataFrame indexed by ``(child, recording_filename,
    range_onset)`` whose columns are the ``count`` values pivoted by
    ``(set, speaker_type)`` -- the annotator's set being renamed ``'truth'``
    -- plus a constant ``corpus`` column.
    """
    annotator = parameters['annotator']
    corpus = parameters['corpus']
    clip_duration = 15000
    kept_speakers = ['CHI', 'FEM', 'MAL', 'OCH']

    project = ChildProject(parameters['path'])
    manager = AnnotationManager(project)
    manager.read()

    # Keep only what both the VTC and the annotator have covered.
    clips = AnnotationManager.intersection(manager.annotations, ['vtc', annotator])

    def clip_onsets(row):
        # Start of every clip inside one annotated range.
        return np.arange(row['range_onset'], row['range_offset'], clip_duration)

    def converted_path(row):
        # Location of the converted annotation file inside the dataset.
        return opj(project.path, 'annotations', row['set'], 'converted', row['annotation_filename'])

    # One row per clip; the last clip of a range is truncated at the range end.
    clips['onset'] = clips.apply(clip_onsets, axis=1)
    clips = clips.explode('onset')
    clips['range_onset'] = clips['onset']
    clip_end = clips['range_onset'] + clip_duration
    clips['range_offset'] = clip_end.clip(upper=clips['range_offset'])

    # Make sure the annotation files are available locally before reading them.
    clips['path'] = clips.apply(converted_path, axis=1)
    datalad.api.get(list(clips['path'].unique()))

    segments = manager.get_collapsed_segments(clips)
    child_of_recording = project.recordings[['recording_filename', 'child_id']]
    segments = segments.merge(child_of_recording, how='left')
    segments['child'] = corpus + '_' + segments['child_id'].astype(str)
    segments = segments[segments['speaker_type'].isin(kept_speakers)]
    segments['set'] = segments['set'].replace({annotator: 'truth'})

    # Discard segments that are not strictly longer than 100 time units.
    long_enough = segments['segment_offset'] > segments['segment_onset'] + 100
    segments = segments[long_enough]

    clip_keys = ['child', 'recording_filename', 'range_onset']
    tallies = (
        segments
        .groupby(['set'] + clip_keys + ['speaker_type'])
        .agg(count=('segment_onset', 'count'))
        .reset_index()
    )
    wide = tallies.pivot(
        index=clip_keys,
        columns=['set', 'speaker_type'],
        values=['count'],
    )
    return wide.assign(corpus=corpus)
# Each row of input/annotators.csv is passed to compute_counts() as a dict;
# the corpus is expected to live under input/<corpus>.
annotators = pd.read_csv('input/annotators.csv')
annotators['path'] = [opj('input', corpus_name) for corpus_name in annotators['corpus']]

per_corpus_counts = [
    compute_counts(annotator_row)
    for annotator_row in annotators.to_dict(orient='records')
]
# Speaker/set combinations absent from a clip count as zero vocalizations.
counts = pd.concat(per_corpus_counts).fillna(0)

# Column order of the count matrices; the Stan program indexes the classes
# in this order (1 = CHI, 2 = OCH, 3 = FEM, 4 = MAL).
speaker_order = ['CHI', 'OCH', 'FEM', 'MAL']
truth = np.array(
    [counts['count']['truth'][speaker].values for speaker in speaker_order]
).T.astype(int)
vtc = np.array(
    [counts['count']['vtc'][speaker].values for speaker in speaker_order]
).T.astype(int)

# Turn the index levels back into columns and replace child labels by
# integer codes.
counts = counts.reset_index()
child_codes = counts['child'].astype('category').cat.codes
counts['child'] = child_codes
# Synthetic data for testing the model (disabled). Uncomment to replace the
# real counts: true counts are drawn from a Poisson distribution and the
# simulated VTC counts from binomial draws through a fixed confusion matrix.
# n_clips = 200
# n_classes = 4
# expectation = np.array([15,1,5,1])
# confusion = np.zeros((n_classes, n_classes))
# for i in range(n_classes):
#     for j in range(n_classes):
#         confusion[i,j] = 0.9 if i == j else 0.05
# truth = np.random.poisson(expectation, size = (n_clips, n_classes))
# vtc = np.zeros((n_clips, n_classes))
# for k in range(n_clips):
#     for i in range(n_classes):
#         vtc[k,i] = np.sum(np.random.binomial(truth[k,:], confusion[:,i]))
# Inputs for the `data` block of the Stan program. Child codes are shifted
# by one because Stan indexes from 1.
n_clips, n_classes = truth.shape[0], truth.shape[1]
data = {
    'n_clips': n_clips,
    'n_classes': n_classes,
    'n_children': counts['child'].nunique(),
    'child': 1+counts['child'].astype('category').cat.codes.values,
    'truth': truth.astype(int),
    'vtc': vtc.astype(int),
}

# Quick summary of what is being fitted.
print(f"clips: {data['n_clips']}")
print(f"children: {data['n_children']}")
print("true vocs: {}".format(np.sum(data['truth'])))
print("vtc vocs: {}".format(np.sum(data['vtc'])))

# One scatter series per speaker class (reference counts against VTC counts).
# Gaussian jitter keeps overlapping integer points distinguishable.
jitter_sd = 0.1
for speaker_idx in range(4):
    jittered_truth = data['truth'][:, speaker_idx] + np.random.normal(0, jitter_sd, truth.shape[0])
    jittered_vtc = data['vtc'][:, speaker_idx] + np.random.normal(0, jitter_sd, truth.shape[0])
    plt.scatter(jittered_truth, jittered_vtc)
plt.show()
# Stan program relating the reference counts (`truth`) to the VTC counts
# (`vtc`) through per-child confusion probabilities.
#
# child_confusion[c][i,j] is the probability that a vocalization of class i
# is counted by the VTC as class j for child c. Each cell is drawn from a
# Beta distribution with mean mus[i,j] and concentration exp(logetas[i,j]).
#
# Fixes relative to the previous version:
#   * alphas/betas use the elementwise product ".*"; "*" between two
#     matrices is a matrix product in Stan, which mixed up the cells.
#   * the buffer of per-combination log-probabilities is sized from the
#     counts of the clip instead of a fixed length that busy clips could
#     overflow.
#   * only the filled entries of the buffer enter log_sum_exp; the previous
#     slice included one untouched zero entry, i.e. it added 1 to every
#     likelihood and gave impossible counts a log-likelihood of 0.
#
# NOTE(review): the loops hard-code four classes, so n_classes must be 4.
# NOTE(review): the data block uses the legacy array declaration syntax
# ("int x[n]"); recent Stan releases require "array[n] int x" -- adjust to
# the Stan version shipped with the installed pystan.
# NOTE(review): the model block combines the classes of a clip with
# log_sum_exp rather than a sum of log-likelihoods -- confirm this is
# intended.
stan_code = """
data {
  int<lower=1> n_clips; // number of clips
  int<lower=1> n_children; // number of children
  int<lower=1> n_classes; // number of classes
  int child[n_clips]; // child index of each clip (used to index child_confusion)
  int truth[n_clips,n_classes]; // reference counts per clip and class
  int vtc[n_clips,n_classes]; // VTC counts per clip and class
}
parameters {
  matrix<lower=0,upper=1>[n_classes,n_classes] mus;
  matrix<lower=0>[n_classes,n_classes] logetas;
  matrix<lower=0,upper=1>[n_classes,n_classes] child_confusion[n_children];
}
transformed parameters {
  matrix<lower=0>[n_classes,n_classes] alphas;
  matrix<lower=0>[n_classes,n_classes] betas;
  matrix[n_clips, n_classes] log_lik;
  log_lik = rep_matrix(0, n_clips, n_classes);
  // Beta shape parameters from mean and concentration, cell by cell.
  alphas = mus .* exp(logetas);
  betas = (1-mus) .* exp(logetas);
  for (k in 1:n_clips) {
    for (i in 1:n_classes) {
      int n = 0; // number of combinations stored so far
      // One slot per (chi, och, fem, mal) tuple: an upper bound on the
      // number of tuples that sum to vtc[k,i].
      vector[(truth[k,1]+1)*(truth[k,2]+1)*(truth[k,3]+1)*(truth[k,4]+1)] log_contrib_comb;
      log_contrib_comb = rep_vector(0, rows(log_contrib_comb));
      // Enumerate how many vocalizations of each true class were counted
      // as class i, keeping the allocations that add up to vtc[k,i].
      for (chi in 0:truth[k,1]) {
        for (och in 0:truth[k,2]) {
          for (fem in 0:truth[k,3]) {
            for (mal in 0:truth[k, 4]) {
              if (mal+fem+och+chi == vtc[k,i]) {
                n = n+1;
                log_contrib_comb[n] += binomial_lpmf(mal | truth[k,4], child_confusion[child[k],4,i]);
                log_contrib_comb[n] += binomial_lpmf(fem | truth[k,3], child_confusion[child[k],3,i]);
                log_contrib_comb[n] += binomial_lpmf(och | truth[k,2], child_confusion[child[k],2,i]);
                log_contrib_comb[n] += binomial_lpmf(chi | truth[k,1], child_confusion[child[k],1,i]);
              }
            }
          }
        }
      }
      if (n == 0) {
        // vtc[k,i] exceeds the total number of reference vocalizations:
        // no allocation can produce it under this model.
        log_lik[k,i] = negative_infinity();
      } else {
        log_lik[k,i] = log_sum_exp(log_contrib_comb[1:n]);
      }
    }
  }
}
model {
  for (k in 1:n_clips) {
    target += log_sum_exp(log_lik[k,:]);
  }
  for (i in 1:n_classes) {
    for (j in 1:n_classes) {
      mus[i,j] ~ beta(2,2);
      logetas[i,j] ~ logistic(log(10), 1);
    }
  }
  for (c in 1:n_children) {
    for (i in 1:n_classes) {
      for (j in 1:n_classes) {
        child_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
      }
    }
  }
}
"""
# Compile the Stan program against the data, draw posterior samples and
# save every draw to fit.csv.
num_chains = 4
sampling_options = {'num_chains': num_chains, 'num_samples': 4000}

posterior = stan.build(stan_code, data=data)
fit = posterior.sample(**sampling_options)

df = fit.to_frame()
df.to_csv('fit.csv')
|