further cleanup...

Lucas Gautheron 1 year ago
parent
commit
11f9e8755d
2 changed files with 0 additions and 279 deletions
  1. code/models/vanuatu_null_hyp.py (0 additions, 278 deletions)
  2. input/vanuatu-paper/correlations.csv (0 additions, 1 deletion)

code/models/vanuatu_null_hyp.py (0 additions, 278 deletions)

@@ -1,278 +0,0 @@
-#!/usr/bin/env python3
-
-from ChildProject.projects import ChildProject
-from ChildProject.annotations import AnnotationManager
-from ChildProject.metrics import segments_to_annotation
-
-import argparse
-
-import datalad.api
-from os.path import join as opj
-from os.path import basename, exists
-
-import multiprocessing as mp
-
-import numpy as np
-import pandas as pd
-import pickle
-from pyannote.core import Annotation, Segment, Timeline
-
-import stan
-
-parser = argparse.ArgumentParser(description = 'main model described throughout the notes.')
-parser.add_argument('--group', default = 'child', choices = ['corpus', 'child'])
-parser.add_argument('--chains', default = 4, type = int)
-parser.add_argument('--samples', default = 2000, type = int)
-parser.add_argument('--validation', default = 0, type = float)
-parser.add_argument('--output', default = 'model3')
-args = parser.parse_args()
-
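-# Keep only the parts of a pyannote Timeline that do not overlap `removed`
-# (the complement of Timeline.crop; `self` is a pyannote Timeline).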
-def extrude(self, removed, mode: str = 'intersection'):
-    if isinstance(removed, Segment):
-        removed = Timeline([removed])
-
-    truncating_support = removed.gaps(support=self.extent())
-    # loose for truncate means strict for crop and vice-versa
-    if mode == "loose":
-        mode = "strict"
-    elif mode == "strict":
-        mode = "loose"
-    
-    return self.crop(truncating_support, mode=mode)
-
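-# For one (corpus, annotator) pair, compute per-child confusion counts between the VTC
-# automatic annotations and the human reference: vtc_i_j is the number of VTC detections
-# labelled as class i that match reference vocalizations of class j (diagonal = agreements),
-# and truth_i is the number of reference vocalizations of class i.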
-def compute_counts(parameters):
-    corpus = parameters['corpus']
-    annotator = parameters['annotator']
-    speakers = ['CHI', 'OCH', 'FEM', 'MAL']
-
-    project = ChildProject(parameters['path'])
-    am = AnnotationManager(project)
-    am.read()
-
-    intersection = AnnotationManager.intersection(
-        am.annotations, ['vtc', annotator]
-    )
-
-    intersection['path'] = intersection.apply(
-        lambda r: opj(project.path, 'annotations', r['set'], 'converted', r['annotation_filename']),
-        axis = 1
-    )
-    datalad.api.get(list(intersection['path'].unique()))
-
-    intersection = intersection.merge(project.recordings[['recording_filename', 'child_id']], how = 'left')
-    intersection['child'] = corpus + '_' + intersection['child_id'].astype(str)
-    intersection['duration'] = intersection['range_offset']-intersection['range_onset']
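-    # each overlapping range appears twice in `intersection` (once per set), hence the division by 2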
-    print(corpus, annotator, (intersection['duration']/1000/2).sum()/3600)
-
-    data = []
-    for child, ann in intersection.groupby('child'):
-        #print(corpus, child)
-
-        segments = am.get_collapsed_segments(ann)
-        if 'speaker_type' not in segments.columns:
-            continue
-
-        segments = segments[segments['speaker_type'].isin(speakers)]
-        
-        vtc = {
-            speaker: segments_to_annotation(segments[(segments['set'] == 'vtc') & (segments['speaker_type'] == speaker)], 'speaker_type').get_timeline()
-            for speaker in speakers
-        }
-
-        truth = {
-            speaker: segments_to_annotation(segments[(segments['set'] == annotator) & (segments['speaker_type'] == speaker)], 'speaker_type').get_timeline()
-            for speaker in speakers
-        }
-
-        for speaker_A in speakers:
-            vtc[f'{speaker_A}_vocs_explained'] = vtc[speaker_A].crop(truth[speaker_A], mode = 'loose')
-            vtc[f'{speaker_A}_vocs_fp'] = extrude(vtc[speaker_A], vtc[f'{speaker_A}_vocs_explained'])
-            vtc[f'{speaker_A}_vocs_fn'] = extrude(truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode = 'loose'))
-
-            for speaker_B in speakers:
-                vtc[f'{speaker_A}_vocs_fp_{speaker_B}'] = vtc[f'{speaker_A}_vocs_fp'].crop(truth[speaker_B], mode = 'loose')
-
-                for speaker_C in speakers:
-                    if speaker_C != speaker_B and speaker_C != speaker_A:
-                        vtc[f'{speaker_A}_vocs_fp_{speaker_B}'] = extrude(
-                            vtc[f'{speaker_A}_vocs_fp_{speaker_B}'],
-                            vtc[f'{speaker_A}_vocs_fp_{speaker_B}'].crop(truth[speaker_C], mode = 'loose')
-                        )
-
-
-        d = {}
-        for i, speaker_A in enumerate(speakers):
-            for j, speaker_B in enumerate(speakers):
-                if i != j:
-                    z = len(vtc[f'{speaker_A}_vocs_fp_{speaker_B}'])
-                else:
-                    z = min(len(vtc[f'{speaker_A}_vocs_explained']), len(truth[speaker_A]))
-
-                d[f'vtc_{i}_{j}'] = z
-
-            d[f'truth_{i}'] = len(truth[speaker_A])
-            d['child'] = child
-
-        d['duration'] = ann['duration'].sum()/2/1000
-        data.append(d)
-
-    return pd.DataFrame(data).assign(
-        corpus = corpus,
-    )
-
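-# Stan model: for each clip k, vtc[k,i,j] (VTC detections of class i matching reference class j)
-# is binomial(truth[k,j], group_confusion[group,j,i]); per-group confusion rates get beta priors
-# with mean mus and concentration etas. chi_adu_coef is fixed to 0, so simulated CHI counts are
-# independent of adult (FEM/MAL) counts; this is the null hypothesis the script encodes.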
-stan_code = """
-data {
-  int<lower=1> n_clips;   // number of clips
-  int<lower=1> n_groups; // number of groups
-  int<lower=1> n_classes; // number of classes
-  int group[n_clips];
-  int vtc[n_clips,n_classes,n_classes];
-  int truth[n_clips,n_classes];
-
-  int<lower=1> n_validation;
-  int<lower=1> n_sim;
-
-  real<lower=0> rates_alphas[n_classes];
-  real<lower=0> rates_betas[n_classes];
-}
-
-parameters {
-  matrix<lower=0,upper=1>[n_classes,n_classes] mus;
-  matrix<lower=1>[n_classes,n_classes] etas;
-  matrix<lower=0,upper=1>[n_classes,n_classes] group_confusion[n_groups];
-}
-
-transformed parameters {
-  matrix<lower=0>[n_classes,n_classes] alphas;
-  matrix<lower=0>[n_classes,n_classes] betas;
-
-  // element-wise products: beta parameters with mean mus and concentration etas
-  alphas = mus .* etas;
-  betas = (1-mus) .* etas;
-}
-
-model {
-    for (k in n_validation:n_clips) {
-        for (i in 1:n_classes) {
-            for (j in 1:n_classes) {
-                vtc[k,i,j] ~ binomial(truth[k,j], group_confusion[group[k],j,i]);
-            }
-        }
-    }
-
-    for (i in 1:n_classes) {
-        for (j in 1:n_classes) {
-            mus[i,j] ~ beta(1,1);
-            etas[i,j] ~ pareto(1,1.5);
-        }
-    }
-
-    for (c in 1:n_groups) {
-        for (i in 1:n_classes) {
-            for (j in 1:n_classes) {
-                group_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
-            }
-        }
-    }
-}
-
-generated quantities {
-    int pred[n_clips,n_classes,n_classes];
-    matrix[n_classes,n_classes] probs[n_groups];
-    matrix[n_classes,n_classes] log_lik[n_clips];
-
-    int sim_truth[n_sim,n_classes];
-    int sim_vtc[n_sim,n_classes];
-    vector[n_classes] lambdas;
-    real chi_adu_coef = 0; // null hypothesis
-
-    for (c in 1:n_groups) {
-        for (i in 1:n_classes) {
-            for (j in 1:n_classes) {
-                probs[c,i,j] = beta_rng(alphas[i,j], betas[i,j]);
-            }
-        }
-    }
-
-    for (k in 1:n_clips) {
-        for (i in 1:n_classes) {
-            for (j in 1:n_classes) {
-                if (k >= n_validation) {
-                    pred[k,i,j] = binomial_rng(truth[k,j], group_confusion[group[k],j,i]);
-                    log_lik[k,i,j] = binomial_lpmf(vtc[k,i,j] | truth[k,j], group_confusion[group[k],j,i]);
-                }
-                else {
-                    pred[k,i,j] = binomial_rng(truth[k,j], probs[group[k],j,i]);
-                    log_lik[k,i,j] = beta_lpdf(probs[group[k],j,i] | alphas[j,i], betas[j,i]);
-                    log_lik[k,i,j] += binomial_lpmf(vtc[k,i,j] | truth[k,j], probs[group[k],j,i]);
-                }
-            }
-        }
-    }
-
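-    // simulate clips under the null hypothesis: draw per-class counts from the gamma-Poisson
-    // rates (with chi_adu_coef = 0), then corrupt them with confusion rates drawn from the priors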
-    real lambda;
-    for (k in 1:n_sim) {
-        for (i in 2:n_classes) {
-            lambda = gamma_rng(rates_alphas[i], rates_betas[i]);
-            sim_truth[k,i] = poisson_rng(lambda);
-        }
-        lambda = gamma_rng(rates_alphas[1], rates_betas[1]);
-        sim_truth[k,1] = poisson_rng(lambda + chi_adu_coef*(sim_truth[k,3]+sim_truth[k,4]));
-    }
-
-    for (k in 1:n_sim) {
-        for (i in 1:n_classes) {
-            sim_vtc[k,i] = 0;
-            for (j in 1:n_classes) {
-                real p = beta_rng(alphas[j,i], betas[j,i]);
-                sim_vtc[k,i] += binomial_rng(sim_truth[k,j], p);
-            }
-        }
-    }
-}
-"""
-
-if __name__ == "__main__":
-    annotators = pd.read_csv('input/annotators.csv')
-    annotators['path'] = annotators['corpus'].apply(lambda c: opj('input', c))
-
-    with mp.Pool(processes = 8) as pool:
-        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient = 'records')))
-
-    data = data.sample(frac = 1)
-    duration = data['duration'].sum()
-
-    vtc = np.moveaxis([[data[f'vtc_{j}_{i}'].values for i in range(4)] for j in range(4)], -1, 0)
-    truth = np.transpose([data[f'truth_{i}'].values for i in range(4)])
-
-    print(vtc.shape)
-
-    rates = pd.read_csv('output/speech_dist.csv')
-
-    data = {
-        'n_clips': truth.shape[0],
-        'n_classes': truth.shape[1],
-        'n_groups': data[args.group].nunique(),
-        'n_validation': max(1, int(truth.shape[0]*args.validation)),
-        'n_sim': 40,
-        'group': 1+data[args.group].astype('category').cat.codes.values,
-        'truth': truth.astype(int),
-        'vtc': vtc.astype(int),
-        'rates_alphas': rates['alpha'].values,
-        'rates_betas': rates['beta'].values
-    }
-
-    print(f"clips: {data['n_clips']}")
-    print(f"groups: {data['n_groups']}")
-    print("true vocs: {}".format(np.sum(data['truth'])))
-    print("vtc vocs: {}".format(np.sum(data['vtc'])))
-    print("duration: {}".format(duration))
-
-    with open(f'data_{args.output}.pickle', 'wb') as fp:
-        pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)
-
-    posterior = stan.build(stan_code, data = data)
-    fit = posterior.sample(num_chains = args.chains, num_samples = args.samples)
-    df = fit.to_frame()
-    df.to_parquet(f'fit_{args.output}.parquet')
-
-

input/vanuatu-paper/correlations.csv (0 additions, 1 deletion)

@@ -1 +0,0 @@
-../../.git/annex/objects/Qk/v8/MD5E-s63--22fcd7368ea2f169260fc0b5f6f3a219.csv/MD5E-s63--22fcd7368ea2f169260fc0b5f6f3a219.csv