#!/usr/bin/env python3

from ChildProject.projects import ChildProject
from ChildProject.annotations import AnnotationManager

import datalad.api
from os.path import join as opj

import numpy as np
import pandas as pd
import stan
from matplotlib import pyplot as plt
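# Pipeline: locate the portions of each corpus covered by both the VTC and a
# human annotator, split them into 15-second clips, count the vocalizations of
# each speaker type within each clip, then fit a Bayesian confusion model in
# Stan relating the VTC counts to the human ("truth") counts.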
def compute_counts(parameters):
    annotator = parameters['annotator']
    corpus = parameters['corpus']
    speakers = ['CHI', 'FEM', 'MAL', 'OCH']

    project = ChildProject(parameters['path'])
    am = AnnotationManager(project)
    am.read()

    # portions of the recordings annotated by both the VTC and the human annotator
    intersection = AnnotationManager.intersection(
        am.annotations, ['vtc', annotator]
    )

    # split the common portions into 15-second clips (timestamps are in milliseconds)
    intersection['onset'] = intersection.apply(
        lambda r: np.arange(r['range_onset'], r['range_offset'], 15000), axis=1
    )
    intersection = intersection.explode('onset')
    intersection['range_onset'] = intersection['onset']
    intersection['range_offset'] = (intersection['range_onset'] + 15000).clip(upper=intersection['range_offset'])

    # fetch the converted annotation files through DataLad
    intersection['path'] = intersection.apply(
        lambda r: opj(project.path, 'annotations', r['set'], 'converted', r['annotation_filename']),
        axis=1
    )
    datalad.api.get(list(intersection['path'].unique()))

    segments = am.get_collapsed_segments(intersection)
    segments = segments.merge(project.recordings[['recording_filename', 'child_id']], how='left')
    segments['child'] = corpus + '_' + segments['child_id'].astype(str)
    segments = segments[segments['speaker_type'].isin(speakers)]
    segments['set'] = segments['set'].replace({annotator: 'truth'})
    # discard segments shorter than 100 ms
    segments = segments[segments['segment_offset'] > segments['segment_onset'] + 100]

    # one row per 15-second clip, one column per (set, speaker type) pair of counts
    return (
        segments
        .groupby(['set', 'child', 'recording_filename', 'range_onset', 'speaker_type'])
        .agg(count=('segment_onset', 'count'))
        .reset_index()
        .pivot(index=['child', 'recording_filename', 'range_onset'], columns=['set', 'speaker_type'], values=['count'])
        .assign(corpus=corpus)
    )
annotators = pd.read_csv('input/annotators.csv')
annotators['path'] = annotators['corpus'].apply(lambda c: opj('input', c))

counts = pd.concat([compute_counts(annotator) for annotator in annotators.to_dict(orient='records')])
counts = counts.fillna(0)

# per-clip counts of vocalizations attributed to each speaker type
speakers = ['CHI', 'OCH', 'FEM', 'MAL']
truth = np.transpose([counts['count']['truth'][speaker].values for speaker in speakers]).astype(int)
vtc = np.transpose([counts['count']['vtc'][speaker].values for speaker in speakers]).astype(int)

counts.reset_index(inplace=True)
counts['child'] = counts['child'].astype('category').cat.codes
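# truth and vtc are (n_clips, 4) count matrices; their columns follow the
# order CHI, OCH, FEM, MAL, matching the 1..4 class indices used in the Stan
# model below.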
# random data (uncomment to test the model against a known confusion matrix)
# n_clips = 200
# n_classes = 4
# expectation = np.array([15, 1, 5, 1])
# confusion = np.zeros((n_classes, n_classes))
# for i in range(n_classes):
#     for j in range(n_classes):
#         confusion[i, j] = 0.9 if i == j else 0.05
# truth = np.random.poisson(expectation, size=(n_clips, n_classes))
# vtc = np.zeros((n_clips, n_classes))
# for k in range(n_clips):
#     for i in range(n_classes):
#         vtc[k, i] = np.sum(np.random.binomial(truth[k, :], confusion[:, i]))
data = {
    'n_clips': truth.shape[0],
    'n_classes': truth.shape[1],
    'n_children': counts['child'].nunique(),
    'child': 1 + counts['child'].astype('category').cat.codes.values,  # 1-based indexing for Stan
    'truth': truth.astype(int),
    'vtc': vtc.astype(int)
}

print(f"clips: {data['n_clips']}")
print(f"children: {data['n_children']}")
print(f"true vocs: {np.sum(data['truth'])}")
print(f"vtc vocs: {np.sum(data['vtc'])}")
# sanity check: jittered scatter of VTC counts against human counts, one color per class
for i, speaker in enumerate(['CHI', 'OCH', 'FEM', 'MAL']):
    plt.scatter(
        data['truth'][:, i] + np.random.normal(0, 0.1, truth.shape[0]),
        data['vtc'][:, i] + np.random.normal(0, 0.1, truth.shape[0]),
        label=speaker
    )
plt.legend()
plt.show()
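# The model below assumes each of the vtc[k,i] detections of class i in clip k
# arises from one of the truth[k,j] human-annotated vocalizations of class j,
# classified as i with probability child_confusion[c,j,i]:
#
#   vtc[k,i] = sum_j B[j,i],  B[j,i] ~ Binomial(truth[k,j], child_confusion[c,j,i])
#
# Per-child confusion rates are pooled through a Beta hierarchy parametrized by
# a mean mu[j,i] and a concentration eta[j,i] = exp(logetas[j,i]), i.e.
# alpha = mu * eta, beta = (1 - mu) * eta. Since the B[j,i] are unobserved, the
# likelihood marginalizes over every combination of per-class contributions
# (chi, och, fem, mal) summing to the observed count.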
stan_code = """
data {
    int<lower=1> n_clips;    // number of clips
    int<lower=1> n_children; // number of children
    int<lower=1> n_classes;  // number of classes
    int child[n_clips];
    int truth[n_clips,n_classes];
    int vtc[n_clips,n_classes];
}
parameters {
    matrix<lower=0,upper=1>[n_classes,n_classes] mus;
    matrix<lower=0>[n_classes,n_classes] logetas;
    matrix<lower=0,upper=1>[n_classes,n_classes] child_confusion[n_children];
}
transformed parameters {
    matrix<lower=0>[n_classes,n_classes] alphas;
    matrix<lower=0>[n_classes,n_classes] betas;
    matrix[n_clips,n_classes] log_lik;

    log_lik = rep_matrix(0, n_clips, n_classes);
    // element-wise: alpha = mu * eta, beta = (1 - mu) * eta
    alphas = mus .* exp(logetas);
    betas = (1 - mus) .* exp(logetas);

    for (k in 1:n_clips) {
        for (i in 1:n_classes) {
            int n = 1;
            // buffer for the log-probability of each admissible combination;
            // 200 slots are assumed to suffice for 15-second clips
            vector[200] log_contrib_comb;
            log_contrib_comb = rep_vector(0, 200);

            // marginalize over all (chi, och, fem, mal) summing to vtc[k,i]
            for (chi in 0:truth[k,1]) {
                for (och in 0:truth[k,2]) {
                    for (fem in 0:truth[k,3]) {
                        for (mal in 0:truth[k,4]) {
                            if (mal+fem+och+chi == vtc[k,i]) {
                                log_contrib_comb[n] += binomial_lpmf(mal | truth[k,4], child_confusion[child[k],4,i]);
                                log_contrib_comb[n] += binomial_lpmf(fem | truth[k,3], child_confusion[child[k],3,i]);
                                log_contrib_comb[n] += binomial_lpmf(och | truth[k,2], child_confusion[child[k],2,i]);
                                log_contrib_comb[n] += binomial_lpmf(chi | truth[k,1], child_confusion[child[k],1,i]);
                                n = n + 1;
                            }
                        }
                    }
                }
            }
            // only the first n-1 slots were filled; slicing up to n would add
            // a spurious exp(0) = 1 term to the sum
            if (n > 1) {
                log_lik[k,i] = log_sum_exp(log_contrib_comb[1:(n-1)]);
            }
            else {
                log_lik[k,i] = negative_infinity(); // no admissible combination
            }
        }
    }
}
model {
    // classes are independent given the true counts, so their log-likelihoods add
    for (k in 1:n_clips) {
        target += sum(log_lik[k]);
    }

    for (i in 1:n_classes) {
        for (j in 1:n_classes) {
            mus[i,j] ~ beta(2,2);
            logetas[i,j] ~ logistic(log(10), 1);
        }
    }

    for (c in 1:n_children) {
        for (i in 1:n_classes) {
            for (j in 1:n_classes) {
                child_confusion[c,i,j] ~ beta(alphas[i,j], betas[i,j]);
            }
        }
    }
}
"""
num_chains = 4
posterior = stan.build(stan_code, data=data)
fit = posterior.sample(num_chains=num_chains, num_samples=4000)

df = fit.to_frame()
df.to_csv('fit.csv')
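# A minimal sketch of how the fitted confusion-rate means might be inspected;
# it assumes pystan 3's flattened column naming ("mus.j.i", 1-based indices).
mus_mean = np.array([
    [df[f'mus.{j}.{i}'].mean() for i in range(1, 5)]
    for j in range(1, 5)
])
print(mus_mean)  # rows j: true class, columns i: detected class (CHI, OCH, FEM, MAL)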