8 months ago · 06df10583f
--- a/code/models/dev_siblings.stan
+++ b/code/models/dev_siblings.stan
@@ -0,0 +1,329 @@
 
				+functions {
			
 
    // Partial log-likelihood of the annotated clips `start..end` (reduce_sum slice).
    //
    // For every clip k and detected class i, the vtc[k,i] detections are split
    // into contributions from each of the four true classes plus a remainder
    // `delta` attributed to false positives:
    //   - contribution of true class j ~ Poisson(truth[k,j] * lambda[g,j,i])
    //     (term forced to 0 and count forced to 0 when truth[k,j] == 0),
    //   - delta ~ Poisson(lambda_fp[g,i] * clip_duration[k]).
    // The log-probabilities of all admissible splits (delta >= 0) are combined
    // with log_sum_exp, i.e. the latent split is marginalised out.
    //
    // `group` is the sliced argument (indexed relative to `start`); every other
    // container is passed whole and indexed by the absolute clip index k.
    // `age` is accepted but not used by this function.
    // At most 16384 splits per (clip, class) fit in the work buffer.
    real confusion_model_lpmf(array[] int group,
        int start, int end,
        int n_classes,
        array[,] int vtc,
        array[,] int truth,
        array[] real age,
        array[] real clip_duration,
        array[] matrix lambda,
        array[] vector lambda_fp
    ) {
        real total = 0;
        vector[4] src_lp;          // log-probability of the count drawn from each true class
        vector[16384] terms;       // one entry per admissible split
        int n = size(terms);       // next free slot in `terms` (starts at full size so the first reset clears everything)

        for (k in start:end) {
            // group of clip k; `group` only holds the slice start..end
            int g = group[k - start + 1];

            for (i in 1:n_classes) {
                // clear the slots used by the previous (clip, class) pair
                terms[:n] = rep_vector(0, n);
                n = 1;

                int chi_max = truth[k,1] > 0 ? max(truth[k,1], vtc[k,i]) : 0;
                for (chi in 0:chi_max) {
                    src_lp[1] = truth[k,1] != 0
                        ? poisson_lpmf(chi | truth[k,1] * lambda[g,1,i])
                        : 0;

                    int och_max = truth[k,2] > 0 ? max(truth[k,2], vtc[k,i] - chi) : 0;
                    for (och in 0:och_max) {
                        src_lp[2] = truth[k,2] != 0
                            ? poisson_lpmf(och | truth[k,2] * lambda[g,2,i])
                            : 0;

                        int fem_max = truth[k,3] > 0 ? max(truth[k,3], vtc[k,i] - chi - och) : 0;
                        for (fem in 0:fem_max) {
                            src_lp[3] = truth[k,3] != 0
                                ? poisson_lpmf(fem | truth[k,3] * lambda[g,3,i])
                                : 0;

                            int mal_max = truth[k,4] > 0 ? max(truth[k,4], vtc[k,i] - chi - och - fem) : 0;
                            for (mal in 0:mal_max) {
                                src_lp[4] = truth[k,4] != 0
                                    ? poisson_lpmf(mal | truth[k,4] * lambda[g,4,i])
                                    : 0;

                                // detections not explained by any true speaker
                                int delta = vtc[k,i] - (mal + fem + och + chi);
                                if (delta >= 0) {
                                    terms[n] += sum(src_lp) + poisson_lpmf(
                                        delta | lambda_fp[g,i] * clip_duration[k]
                                    );
                                    n = n + 1;
                                }
                            }
                        }
                    }
                }

                // n-1 splits were recorded; skip the pair if there were none
                if (n > 1) {
                    total += log_sum_exp(terms[1:(n - 1)]);
                }
            }
        }
        return total;
    }
			
 
				+
			
 
    // Partial log-likelihood of the recordings `start..end` (reduce_sum slice).
    //
    // For each recording k the expected detected count of class i is
    //   dot(truth_vocs[k,:], actual_confusion[k,:,i]) + actual_fp_rate[k,i] * duration
    // and the observed counts vocs[k,:] are scored with a normal density whose
    // mean is that expectation and whose sd is its square root.
    //
    // `children` is only the sliced argument; its values, `n_recs` and `age`
    // are not read here. All other containers are passed whole and indexed by
    // the absolute recording index k.
    real model_lpmf(array[] int children,
        int start, int end,
        int n_recs,
        int n_classes,
        real duration,
        array [,] int vocs,
        array [] real age,
        matrix truth_vocs,
        array [] matrix actual_confusion,
        array [] vector actual_fp_rate
        ) {
            real ll = 0;

            // Sized by n_classes (was hard-coded to 4) so it always matches the
            // loop below and the length of vocs[k,:].
            vector[n_classes] expect;

            for (k in start:end) {
                for (i in 1:n_classes) {
                    expect[i] = dot_product(truth_vocs[k,:], actual_confusion[k,:,i])
                        + actual_fp_rate[k,i] * duration;
                }

                ll += normal_lpdf(vocs[k,:] | expect, sqrt(expect));
            }

            return ll;
        }
			
 
				+}
			
 
				+
			
 
				+// TODO
			
 
				+// use speech rates to set priors on truth_vocs
			
 
				+data {
			
 
				+    int<lower=1> n_classes; // number of classes
			
 
				+
			
 
				+    // analysis data block
			
 
				+    int<lower=1> n_recs;
			
 
				+    int<lower=1> n_children;
			
 
				+
			
 
				+    array[n_recs] int<lower=1> children;
			
 
				+    array[n_recs] real<lower=1> age;
			
 
				+    array[n_recs] int<lower=-1> siblings;
			
 
				+    array[n_recs, n_classes] int<lower=0> vocs;
			
 
				+    array[n_children] int<lower=1> corpus;
			
 
				+
			
 
				+    real<lower=0> recs_duration;
			
 
				+
			
 
				+    // speaker confusion data block
			
 
				+    int<lower=1> n_clips;   // number of clips
			
 
				+    int<lower=1> n_groups; // number of groups
			
 
				+    int<lower=1> n_corpora;
			
 
				+    array [n_clips] int group;
			
 
				+    array [n_clips] int conf_corpus;
			
 
				+    array [n_clips,n_classes] int<lower=0> vtc_total; // vtc vocs attributed to specific speakers
			
 
				+    array [n_clips,n_classes] int<lower=0> truth_total;
			
 
				+    array [n_clips] real<lower=0> clip_duration;
			
 
				+    array [n_clips] real<lower=0> clip_age;
			
 
				+
			
 
				+    int<lower=0> n_validation;
			
 
				+
			
 
				+    // actual speech rates
			
 
				+    int<lower=1> n_rates;
			
 
				+    int<lower=1> n_speech_rate_children;
			
 
				+
			
 
				+    array [n_rates,n_classes] int<lower=0> speech_rates;
			
 
				+    array [n_rates] int group_corpus;
			
 
				+    array [n_rates] real<lower=0> durations;
			
 
				+    array [n_rates] real<lower=0> speech_rate_age;
			
 
				+    array [n_rates] int<lower=-1> speech_rate_siblings;
			
 
				+    array [n_rates] int<lower=1,upper=n_speech_rate_children> speech_rate_child;
			
 
				+
			
 
				+    // parallel processing
			
 
				+    int<lower=1> threads;
			
 
				+}
			
 
				+
			
 
				+transformed data {
			
 
				+    vector<lower=0>[n_groups] recording_age;
			
 
				+    array[n_speech_rate_children] int<lower=1> speech_rate_child_corpus;
			
 
				+
			
 
				+    array[n_children] int<lower=-1> child_siblings;
			
 
				+    array[n_speech_rate_children] int<lower=-1> speech_rate_child_siblings;
			
 
				+
			
 
				+    for (c in 1:n_clips) {
			
 
				+        recording_age[group[c]] = clip_age[c];
			
 
				+    }
			
 
				+
			
 
				+    for (k in 1:n_rates) {
			
 
				+        speech_rate_child_corpus[speech_rate_child[k]] = group_corpus[k];
			
 
				+    }
			
 
				+
			
 
				+    for (k in 1:n_recs) {
			
 
				+        child_siblings[children[k]] = siblings[k];
			
 
				+    }
			
 
				+
			
 
				+    for (k in 1:n_rates) {
			
 
				+        speech_rate_child_siblings[speech_rate_child[k]] = speech_rate_siblings[k];
			
 
				+    }
			
 
				+}
			
 
				+
			
 
				+parameters {
			
 
				+    matrix<lower=0>[n_children,n_classes-1] mu_child_level;
			
 
				+    vector [n_children] child_dev_age;
			
 
				+    matrix<lower=0> [n_recs, n_classes] truth_vocs;
			
 
				+
			
 
				+    // nuisance parameters
			
 
				+    array [n_recs] matrix<lower=0>[n_classes,n_classes] actual_confusion_baseline;
			
 
				+    array [n_recs] vector<lower=0>[n_classes] actual_fp_rate;
			
 
				+
			
 
				+    // confusion parameters
			
 
				+    // confusion matrix
			
 
				+    matrix<lower=0>[n_classes,n_classes] alphas;
			
 
				+    matrix<lower=0>[n_classes,n_classes] mus;
			
 
				+    array [n_groups] matrix<lower=0>[n_classes,n_classes] lambda;
			
 
				+    // false positives
			
 
				+    vector<lower=0>[n_classes] alphas_fp;
			
 
				+    vector<lower=0>[n_classes] mus_fp;
			
 
				+    array [n_groups] vector<lower=0>[n_classes] lambda_fp;
			
 
				+
			
 
				+    // speech rates
			
 
				+    vector<lower=0>[n_classes] alpha_child_level; // variance across recordings for a given child
			
 
				+    matrix<lower=0>[n_classes-1,n_corpora] alpha_corpus_level; // variance among children
			
 
				+    matrix<lower=0>[n_classes-1,n_corpora] mu_corpus_level; // child-level average
			
 
				+    vector<lower=0>[n_classes-1] alpha_pop_level; // variance among corpora
			
 
				+    vector<lower=0>[n_classes] mu_pop_level; // population level averages
			
 
				+    vector<lower=0>[n_classes-1] alpha_pop;
			
 
				+    matrix<lower=0>[n_classes,n_rates] speech_rate; // truth speech rates observed in annotated clips
			
 
				+    matrix<lower=0>[n_speech_rate_children,n_classes-1] speech_rate_child_level; // expected speech rate at the child-level
			
 
				+    real<lower=0> beta_sib_och; // effect of n of siblings on OCH speech
			
 
				+
			
 
				+    vector [n_speech_rate_children] child_dev_speech_age;
			
 
				+
			
 
				+    // average effect of age
			
 
				+    real alpha_dev;
			
 
				+    real<lower=0> sigma_dev;
			
 
				+
			
 
				+    // effect of excess ADU input
			
 
				+    real beta_dev;
			
 
				+}
			
 
				+
			
 
				+model {
			
 
				+    //actual model
			
 
				+
			
 
    // Likelihood of the detected counts of every recording, sliced over
    // `children` with grainsize 1.
    target += reduce_sum(
        model_lpmf, children, 1,
        n_recs, n_classes, recs_duration,
        vocs, age,
        truth_vocs, actual_confusion_baseline, actual_fp_rate
    );

    // Per-recording confusion and false-positive rates share the gamma
    // hyper-parameters (shape alphas, mean mus) of the annotated clips.
    // All true classes use the same form; an age-dependent mean for class 1
    // (through delta_chi_age) is currently disabled.
    for (rec in 1:n_recs) {
        for (src in 1:n_classes) {
            actual_confusion_baseline[rec, src] ~ gamma(alphas[src, :], alphas[src, :] ./ mus[src, :]);
        }
        actual_fp_rate[rec] ~ gamma(alphas_fp, alphas_fp ./ mus_fp);
    }
			
 
				+    
			
 
				+
			
 
    // Priors on the latent true counts of each recording, expressed as rates
    // (truth_vocs / 1000 / recs_duration) with gamma(shape, shape / mean).
    for (k in 1:n_recs) {
        // class 1: population mean modulated by the child's age slope and by
        // the child's excess in mu_child_level columns 2-3 relative to
        // mu_pop_level[3:4], both scaled by age
        real chi_mu = exp(
            log(mu_pop_level[1]) + child_dev_age[children[k]]*age[k]/12.0/10.0+beta_dev*(mu_child_level[children[k],2]+mu_child_level[children[k],3]-mu_pop_level[3]-mu_pop_level[4])*age[k]/12.0/10.0
        );
        (truth_vocs[k,1]/1000/recs_duration) ~ gamma(
            alpha_child_level[1],
            alpha_child_level[1]/chi_mu
        );

        // class 2: child-level mean, shifted by beta_sib_och when the child has siblings
        real och_mu = exp(
            log(mu_child_level[children[k],1]) + (child_siblings[children[k]]>0?beta_sib_och:0)
        );
        (truth_vocs[k,2]/1000/recs_duration) ~ gamma(
            alpha_child_level[2],
            alpha_child_level[2]/och_mu
        );

        // classes 3..n_classes: child-level means (columns 2.. of mu_child_level).
        // The rate uses alpha_child_level[3:] so that shape, rate numerator and
        // mean all have n_classes-2 entries (it previously used [2:], one
        // element too many), matching the speech-rate likelihood further down.
        (truth_vocs[k,3:]/1000/recs_duration) ~ gamma(
            alpha_child_level[3:],
            alpha_child_level[3:] ./ (mu_child_level[children[k],2:]')
        );
    }
			
 
				+
			
 
    // Child-level means are drawn from the gamma of the child's corpus.
    for (c in 1:n_children) {
        mu_child_level[c] ~ gamma(
            alpha_corpus_level[:,corpus[c]],
            (alpha_corpus_level[:,corpus[c]]./mu_corpus_level[:,corpus[c]])
        );
    }

    alpha_child_level ~ gamma(2,1);

    // Likelihood of the annotated clips, sliced over `group`.
    // - Arguments follow the signature of confusion_model_lpmf:
    //   (..., vtc, truth, age, clip_duration, lambda, lambda_fp). clip_age and
    //   clip_duration were previously swapped, so the false-positive rate was
    //   multiplied by the clip age instead of its duration.
    // - The grainsize is clamped to at least 1: the integer division yields 0
    //   when n_clips < threads*4, which is not a valid grainsize.
    target += reduce_sum(
        confusion_model_lpmf, group, max(1, n_clips %/% (threads*4)),
        n_classes,
        vtc_total, truth_total, clip_age, clip_duration,
        lambda, lambda_fp
    );
			
 
				+
			
 
    // Hyper-priors of the false-positive rates.
    mus_fp ~ exponential(1);
    alphas_fp ~ gamma(2, 1);

    for (det in 1:n_classes) {
        // per-group false-positive rates of detected class `det`
        lambda_fp[:, det] ~ gamma(alphas_fp[det], alphas_fp[det] / mus_fp[det]);

        for (other in 1:n_classes) {
            // diagonal entries get a prior mean of 1/2, off-diagonal ones 1/8
            mus[det, other] ~ exponential(det == other ? 2 : 8);
            alphas[det, other] ~ gamma(2, 1);

            // Per-group confusion rates; every row uses the same form (the
            // age-dependent variant for row 1, via delta_chi_age, is disabled).
            for (grp in 1:n_groups) {
                lambda[grp, det, other] ~ gamma(alphas[det, other], alphas[det, other] / mus[det, other]);
            }
        }
    }
    // prior of the disabled age effect, kept for reference:
    //delta_chi_age ~ normal(0, 0.1);
			
 
				+
			
 
				+    // speech rates
			
 
    // Population- and corpus-level hyper-priors.
    mu_pop_level ~ exponential(4);
    alpha_pop_level ~ gamma(8, 4);
    alpha_pop ~ gamma(10, 10);
    for (j in 1:n_classes-1) {
        // row j of the corpus-level matrices corresponds to class j+1
        alpha_corpus_level[j, :] ~ gamma(4, 4 / alpha_pop[j]);
        mu_corpus_level[j, :] ~ gamma(alpha_pop_level[j], alpha_pop_level[j] / mu_pop_level[j+1]);
    }

    // Annotated speech rates: same structure as the priors on truth_vocs,
    // followed by a Poisson likelihood on the annotated counts.
    for (row in 1:n_rates) {
        int child = speech_rate_child[row];

        // class 1
        real adu_excess = speech_rate_child_level[child,2] + speech_rate_child_level[child,3] - mu_pop_level[3] - mu_pop_level[4];
        real chi_mu = exp(
            log(mu_pop_level[1])
            + child_dev_speech_age[child] * speech_rate_age[row] / 12.0 / 10.0
            + beta_dev * adu_excess * speech_rate_age[row] / 12.0 / 10.0
        );
        speech_rate[1, row] ~ gamma(
            alpha_child_level[1],
            alpha_child_level[1] / chi_mu
        );

        // class 2, shifted by beta_sib_och when the child has siblings
        real sib_shift = 0;
        if (speech_rate_child_siblings[child] > 0) {
            sib_shift = beta_sib_och;
        }
        real och_mu = exp(log(speech_rate_child_level[child,1]) + sib_shift);
        speech_rate[2, row] ~ gamma(
            alpha_child_level[2],
            alpha_child_level[2] / och_mu
        );

        // classes 3..n_classes
        speech_rate[3:, row] ~ gamma(
            alpha_child_level[3:],
            (alpha_child_level[3:] ./ (speech_rate_child_level[child, 2:]'))
        );

        // annotated counts given the latent rates
        speech_rates[row, :] ~ poisson(speech_rate[:, row] * durations[row] * 1000);
    }

    // Child-level expected rates are drawn from the gamma of the child's corpus.
    for (c in 1:n_speech_rate_children) {
        int corp = speech_rate_child_corpus[c];
        speech_rate_child_level[c, :] ~ gamma(
            alpha_corpus_level[:, corp],
            (alpha_corpus_level[:, corp] ./ (mu_corpus_level[:, corp]))
        );
    }

    // Both sets of per-child age slopes share one population distribution.
    child_dev_age ~ normal(alpha_dev, sigma_dev);
    child_dev_speech_age ~ normal(alpha_dev, sigma_dev);
    beta_sib_och ~ exponential(1);

    alpha_dev ~ normal(0, 1);
    sigma_dev ~ exponential(1);

    beta_dev ~ normal(0, 1);
			
 
				+}
			
--- a/code/models/enumeration.py
+++ b/code/models/enumeration.py
@@ -27,6 +27,8 @@ matplotlib.rcParams.update(
 
				     }
			
 
				 )
			
 
				 
			
 
				+from collections import defaultdict
			
 
				+
			
 
				 import pickle
			
 
				 
			
 
				 import datalad.api
			
@@ -69,6 +71,18 @@ def extrude(self, removed, mode: str = "intersection"):
 
				 
			
 
				     return self.crop(truncating_support, mode=mode)
			
 
				 
			
 
				+def children_siblings(corpus):
			
 
				+    siblings = pd.read_csv("input/siblings.csv")
			
 
				+    siblings = siblings[siblings["corpus"]==corpus].set_index("child_id")
			
 
				+    siblings = siblings["n_siblings"].to_dict()
			
 
				+
			
 
				+    n = defaultdict(-1)
			
 
				+    for c in siblings:
			
 
				+        n[c] = siblings[c]
			
 
				+
			
 
				+    return n
			
 
				+
			
 
				+
			
 
				 
			
 
				 def compute_counts(parameters):
			
 
				     corpus = parameters["corpus"]
			
@@ -201,7 +215,12 @@ def rates(parameters):
 
				     metrics = pipeline.extract()
			
 
				     metrics = pd.DataFrame(metrics).assign(corpus=corpus, annotator=annotator)
			
 
				     project.recordings["age"] = project.compute_ages()
			
 
				-    metrics = metrics.merge(project.recordings[["recording_filename", "age"]])
			
 
				+    project.recordings["siblings"] = project.recordings.child_id.map(
			
 
				+        children_siblings(corpus)
			
 
				+    )
			
 
				+    metrics = metrics.merge(
			
 
				+        project.recordings[["recording_filename", "age", "siblings"]]
			
 
				+    )
			
 
				     metrics["duration"] = metrics[f"duration_{annotator}"] / 1000 / 3600
			
 
				     metrics = metrics[metrics["duration"] > 0.01]
			
 
				     metrics["child"] = corpus + "_" + metrics["child_id"].astype(str)
			
@@ -261,6 +280,9 @@ def compile_recordings(corpus):
 
				     am.read()
			
 
				 
			
 
				     project.recordings["age"] = project.compute_ages()
			
 
				+    project.recordings["siblings"] = project.recordings.child_id.map(
			
 
				+        children_siblings(corpus)
			
 
				+    )
			
 
				 
			
 
				     annotations = am.annotations[am.annotations["set"] == "vtc"]
			
 
				     annotations = annotations.merge(
			
@@ -292,6 +314,7 @@ def compile_recordings(corpus):
 
				         child_id = _annotations["child_id"].max()
			
 
				         age = _annotations["age"].max()
			
 
				         duration = (_annotations["range_offset"] - _annotations["range_onset"]).sum()
			
 
				+        siblings = _annotations["siblings"].max()
			
 
				 
			
 
				         if duration < args.duration * 3600 * 1000:
			
 
				             continue
			
@@ -331,6 +354,7 @@ def compile_recordings(corpus):
 
				         rec["children"] = f"{corpus}_{child_id}"
			
 
				         rec["corpus"] = basename(corpus)
			
 
				         rec["age"] = age
			
 
				+        rec["siblings"] = siblings
			
 
				 
			
 
				         recs.append(rec)
			
 
				 
			
@@ -372,6 +396,7 @@ if __name__ == "__main__":
 
				         [speech_rates[f"speech_rate_{i}"].values for i in range(4)]
			
 
				     )
			
 
				     speech_rate_age = speech_rates["age"].values
			
 
				+    speech_rate_siblings = speech_rates["siblings"].values.astype(int)
			
 
				 
			
 
				     speech_rates.to_csv("rates.csv")
			
 
				 
			
@@ -393,6 +418,7 @@ if __name__ == "__main__":
 
				         "n_unique_clips": data["clip_id"].nunique(),
			
 
				         "speech_rates": speech_rate_matrix.astype(int),
			
 
				         "speech_rate_age": speech_rate_age,
			
 
				+        "speech_rate_siblings": speech_rate_siblings,
			
 
				         "group_corpus": (
			
 
				             1 + speech_rates["corpus"].map(corpora_map).astype(int).values
			
 
				         ),
			
@@ -415,6 +441,7 @@ if __name__ == "__main__":
 
				         "children": recs["children"],
			
 
				         "vocs": np.transpose([recs[f"vtc_{i}"].values for i in range(4)]),
			
 
				         "age": recs["age"],
			
 
				+        "siblings": recs["siblings"].astype(int),
			
 
				         "corpus": children_corpus,
			
 
				         "recs_duration": args.duration,
			
 
				     }
			
--- a/code/preprocessing/siblings.py
+++ b/code/preprocessing/siblings.py
@@ -0,0 +1,63 @@
 
				+import pandas as pd 
			
 
				+
			
 
				+from ChildProject.projects import ChildProject
			
 
				+
			
 
				+from os.path import join as opj, basename
			
 
				+
			
 
				+corpora = [
			
 
				+    "input/bergelson",
			
 
				+    "input/warlaumont",
			
 
				+    "input/winnipeg",
			
 
				+    "input/lucid"
			
 
				+]
			
 
				+
			
 
				+dic = {
			
 
				+    "input/bergelson": "confidential/original/bergelson_dict.csv",
			
 
				+    "input/lucid": "confidential/original/lucid_dict.csv",
			
 
				+    "input/warlaumont": "original/warlaumont_dict_matched.csv",
			
 
				+    "input/winnipeg": "confidential/original/winnipeg_dict_matched.csv"
			
 
				+}
			
 
				+
			
 
				+correspondance = {
			
 
				+    "BER": "input/bergelson",
			
 
				+    "ROW": "input/lucid",
			
 
				+    "SOD": "input/winnipeg",
			
 
				+    "WAR": "input/warlaumont"
			
 
				+}
			
 
				+
			
 
				+projects = [ 
			
 
				+    ChildProject(corpus) for corpus in corpora
			
 
				+]
			
 
				+
			
 
				+for project in projects:
			
 
				+    project.read()
			
 
				+
			
 
				+recordings = pd.concat([
			
 
				+    projects[i].recordings.assign(corpus=corpus)
			
 
				+    for i, corpus in enumerate(corpora)
			
 
				+])
			
 
				+
			
 
				+recordings["its_filename"] = recordings["its_filename"].str.replace(".its", "")
			
 
				+
			
 
				+aclew_id = pd.concat([
			
 
				+    pd.read_csv(opj(corpus, "metadata", dic[corpus])).assign(corpus=corpus)
			
 
				+    for corpus in corpora
			
 
				+])
			
 
				+
			
 
				+aclew_id["its"] = aclew_id["its"].str.replace(".its", "")
			
 
				+
			
 
				+
			
 
				+aclew_md = pd.read_csv("input/aclew_md.csv")
			
 
				+
			
 
				+recordings = recordings[["corpus", "child_id", "recording_filename", "its_filename"]].merge(
			
 
				+    aclew_id[["corpus", "its", "aclew_id"]],
			
 
				+    how="inner",
			
 
				+    left_on=["corpus", "its_filename"],
			
 
				+    right_on=["corpus", "its"]
			
 
				+)
			
 
				+
			
 
				+recordings = recordings.merge(aclew_md, how="inner", left_on="aclew_id", right_on="aclew_id")
			
 
				+children = recordings.groupby(["corpus", "child_id"]).agg(n_siblings=("number_older_sibs", "max"))
			
 
				+children = children.reset_index()
			
 
				+children["corpus"] = children.corpus.map(basename)
			
 
				+children.to_csv("input/siblings.csv")
			
--- a/input/aclew_md.csv
+++ b/input/aclew_md.csv
@@ -0,0 +1,841 @@
 
				+"labname","aclew_id","child_level_id","number_older_sibs"
			
 
				+"BER",6265,"06",0
			
 
				+"BER",1227,"01",1
			
 
				+"BER",5510,"01",1
			
 
				+"BER",1982,"01",1
			
 
				+"BER",3439,"01",1
			
 
				+"BER",8712,"01",1
			
 
				+"BER",5813,"01",1
			
 
				+"BER",7979,"01",1
			
 
				+"BER",4890,"01",1
			
 
				+"BER",1445,"01",1
			
 
				+"BER",2224,"01",1
			
 
				+"BER",9866,"01",1
			
 
				+"BER",9061,"02",1
			
 
				+"BER",3483,"02",1
			
 
				+"BER",673,"02",1
			
 
				+"BER",5388,"02",1
			
 
				+"BER",4330,"02",1
			
 
				+"BER",3832,"02",1
			
 
				+"BER",7209,"02",1
			
 
				+"BER",8307,"02",1
			
 
				+"BER",1836,"02",1
			
 
				+"BER",8757,"02",1
			
 
				+"BER",2756,"02",1
			
 
				+"BER",9306,"02",1
			
 
				+"BER",4524,"07",0
			
 
				+"BER",7780,"03",1
			
 
				+"BER",9470,"03",1
			
 
				+"BER",850,"03",1
			
 
				+"BER",4164,"03",1
			
 
				+"BER",6245,"03",1
			
 
				+"BER",1843,"03",1
			
 
				+"BER",8204,"03",1
			
 
				+"BER",7810,"03",1
			
 
				+"BER",256,"03",1
			
 
				+"BER",3194,"08",0
			
 
				+"BER",8262,"04",4
			
 
				+"BER",3895,"04",4
			
 
				+"BER",5102,"04",4
			
 
				+"BER",9363,"04",4
			
 
				+"BER",9947,"04",4
			
 
				+"BER",3657,"04",4
			
 
				+"BER",3798,"04",4
			
 
				+"BER",2101,"04",4
			
 
				+"BER",1234,"04",4
			
 
				+"BER",595,"04",4
			
 
				+"BER",6112,"04",4
			
 
				+"BER",6661,"09",0
			
 
				+"BER",4889,"06",0
			
 
				+"BER",5034,"06",0
			
 
				+"BER",8638,"06",0
			
 
				+"BER",1104,"06",0
			
 
				+"BER",2471,"06",0
			
 
				+"BER",9513,"06",0
			
 
				+"BER",3256,"06",0
			
 
				+"BER",9123,"06",0
			
 
				+"BER",6016,"06",0
			
 
				+"BER",8735,"06",0
			
 
				+"BER",9685,"06",0
			
 
				+"BER",2755,"11",0
			
 
				+"BER",713,"07",0
			
 
				+"BER",8191,"07",0
			
 
				+"BER",964,"07",0
			
 
				+"BER",4912,"07",0
			
 
				+"BER",5192,"07",0
			
 
				+"BER",291,"07",0
			
 
				+"BER",4806,"07",0
			
 
				+"BER",486,"07",0
			
 
				+"BER",3455,"07",0
			
 
				+"BER",7996,"07",0
			
 
				+"BER",6937,"07",0
			
 
				+"BER",7563,"12",0
			
 
				+"BER",3916,"08",0
			
 
				+"BER",3777,"08",0
			
 
				+"BER",4780,"08",0
			
 
				+"BER",8109,"08",0
			
 
				+"BER",6563,"08",0
			
 
				+"BER",6035,"08",0
			
 
				+"BER",1137,"08",0
			
 
				+"BER",712,"08",0
			
 
				+"BER",8797,"08",0
			
 
				+"BER",6049,"08",0
			
 
				+"BER",7870,"08",0
			
 
				+"BER",1596,"16",0
			
 
				+"BER",4180,"09",0
			
 
				+"BER",5976,"09",0
			
 
				+"BER",9039,"09",0
			
 
				+"BER",5157,"09",0
			
 
				+"BER",6018,"09",0
			
 
				+"BER",2470,"09",0
			
 
				+"BER",1401,"09",0
			
 
				+"BER",3063,"09",0
			
 
				+"BER",9453,"09",0
			
 
				+"BER",1075,"09",0
			
 
				+"BER",7758,"09",0
			
 
				+"BER",9244,"19",0
			
 
				+"BER",1575,"10",1
			
 
				+"BER",3702,"10",1
			
 
				+"BER",5228,"10",1
			
 
				+"BER",414,"10",1
			
 
				+"BER",9036,"10",1
			
 
				+"BER",5242,"10",1
			
 
				+"BER",6200,"10",1
			
 
				+"BER",1310,"10",1
			
 
				+"BER",6441,"10",1
			
 
				+"BER",5622,"10",1
			
 
				+"BER",4473,"10",1
			
 
				+"BER",1416,"20",0
			
 
				+"BER",6957,"11",0
			
 
				+"BER",466,"11",0
			
 
				+"BER",2858,"11",0
			
 
				+"BER",2016,"11",0
			
 
				+"BER",8595,"11",0
			
 
				+"BER",6954,"11",0
			
 
				+"BER",1414,"11",0
			
 
				+"BER",3749,"11",0
			
 
				+"BER",5777,"11",0
			
 
				+"BER",1405,"11",0
			
 
				+"BER",7392,"11",0
			
 
				+"BER",980,"21",0
			
 
				+"BER",5980,"12",0
			
 
				+"BER",1665,"12",0
			
 
				+"BER",8922,"12",0
			
 
				+"BER",9487,"12",0
			
 
				+"BER",5606,"12",0
			
 
				+"BER",485,"12",0
			
 
				+"BER",4622,"12",0
			
 
				+"BER",9392,"12",0
			
 
				+"BER",7324,"12",0
			
 
				+"BER",5706,"12",0
			
 
				+"BER",284,"12",0
			
 
				+"BER",5092,"22",0
			
 
				+"BER",3949,"13",2
			
 
				+"BER",9310,"13",2
			
 
				+"BER",2075,"13",2
			
 
				+"BER",1208,"13",2
			
 
				+"BER",9863,"13",2
			
 
				+"BER",174,"13",2
			
 
				+"BER",2876,"13",2
			
 
				+"BER",1395,"13",2
			
 
				+"BER",1904,"13",2
			
 
				+"BER",4706,"13",2
			
 
				+"BER",4703,"13",2
			
 
				+"BER",6249,"25",0
			
 
				+"BER",6023,"14",1
			
 
				+"BER",5430,"14",1
			
 
				+"BER",5483,"14",1
			
 
				+"BER",6174,"14",1
			
 
				+"BER",4731,"14",1
			
 
				+"BER",226,"14",1
			
 
				+"BER",2927,"14",1
			
 
				+"BER",2580,"14",1
			
 
				+"BER",3024,"14",1
			
 
				+"BER",3395,"14",1
			
 
				+"BER",1698,"14",1
			
 
				+"BER",1620,"27",0
			
 
				+"BER",7269,"15",2
			
 
				+"BER",6433,"15",2
			
 
				+"BER",2615,"15",2
			
 
				+"BER",7377,"15",2
			
 
				+"BER",2345,"15",2
			
 
				+"BER",7175,"15",2
			
 
				+"BER",4280,"15",2
			
 
				+"BER",2418,"15",2
			
 
				+"BER",7239,"15",2
			
 
				+"BER",5778,"15",2
			
 
				+"BER",8147,"15",2
			
 
				+"BER",6402,"28",0
			
 
				+"BER",8246,"16",0
			
 
				+"BER",774,"16",0
			
 
				+"BER",7829,"16",0
			
 
				+"BER",1777,"16",0
			
 
				+"BER",1931,"16",0
			
 
				+"BER",395,"16",0
			
 
				+"BER",2191,"16",0
			
 
				+"BER",4032,"16",0
			
 
				+"BER",7115,"16",0
			
 
				+"BER",2779,"16",0
			
 
				+"BER",9692,"16",0
			
 
				+"BER",8742,"29",0
			
 
				+"BER",6267,"17",1
			
 
				+"BER",3641,"17",1
			
 
				+"BER",8319,"17",1
			
 
				+"BER",4172,"17",1
			
 
				+"BER",1889,"17",1
			
 
				+"BER",1500,"17",1
			
 
				+"BER",1458,"17",1
			
 
				+"BER",3654,"17",1
			
 
				+"BER",2777,"17",1
			
 
				+"BER",8688,"17",1
			
 
				+"BER",5109,"17",1
			
 
				+"BER",5454,"30",0
			
 
				+"BER",8323,"18",1
			
 
				+"BER",990,"18",1
			
 
				+"BER",5938,"18",1
			
 
				+"BER",9230,"18",1
			
 
				+"BER",6471,"18",1
			
 
				+"BER",9997,"18",1
			
 
				+"BER",2852,"18",1
			
 
				+"BER",3538,"18",1
			
 
				+"BER",2348,"18",1
			
 
				+"BER",52,"18",1
			
 
				+"BER",7011,"18",1
			
 
				+"BER",532,"33",0
			
 
				+"BER",131,"19",0
			
 
				+"BER",5988,"19",0
			
 
				+"BER",6499,"19",0
			
 
				+"BER",7679,"19",0
			
 
				+"BER",8786,"19",0
			
 
				+"BER",2564,"19",0
			
 
				+"BER",8020,"19",0
			
 
				+"BER",3739,"19",0
			
 
				+"BER",993,"19",0
			
 
				+"BER",4536,"34",0
			
 
				+"BER",6209,"20",0
			
 
				+"BER",9756,"20",0
			
 
				+"BER",459,"20",0
			
 
				+"BER",9009,"20",0
			
 
				+"BER",8491,"20",0
			
 
				+"BER",9012,"20",0
			
 
				+"BER",7604,"20",0
			
 
				+"BER",6745,"20",0
			
 
				+"BER",3068,"20",0
			
 
				+"BER",3340,"20",0
			
 
				+"BER",9779,"20",0
			
 
				+"BER",3149,"35",0
			
 
				+"BER",8629,"21",0
			
 
				+"BER",7448,"21",0
			
 
				+"BER",2405,"21",0
			
 
				+"BER",4096,"21",0
			
 
				+"BER",7099,"21",0
			
 
				+"BER",9348,"21",0
			
 
				+"BER",2447,"21",0
			
 
				+"BER",4439,"21",0
			
 
				+"BER",6201,"21",0
			
 
				+"BER",5240,"21",0
			
 
				+"BER",527,"21",0
			
 
				+"BER",2795,"43",0
			
 
				+"BER",6968,"22",0
			
 
				+"BER",9454,"22",0
			
 
				+"BER",7385,"22",0
			
 
				+"BER",6935,"22",0
			
 
				+"BER",7162,"22",0
			
 
				+"BER",109,"22",0
			
 
				+"BER",2318,"22",0
			
 
				+"BER",8393,"22",0
			
 
				+"BER",2197,"22",0
			
 
				+"BER",7549,"22",0
			
 
				+"BER",7041,"46",0
			
 
				+"BER",2882,"23",4
			
 
				+"BER",9077,"23",4
			
 
				+"BER",5897,"23",4
			
 
				+"BER",41,"23",4
			
 
				+"BER",5074,"23",4
			
 
				+"BER",7362,"23",4
			
 
				+"BER",7523,"23",4
			
 
				+"BER",9996,"23",4
			
 
				+"BER",8770,"23",4
			
 
				+"BER",3526,"23",4
			
 
				+"BER",6604,"23",4
			
 
				+"BER",9803,"01",1
			
 
				+"BER",2474,"25",0
			
 
				+"BER",1537,"25",0
			
 
				+"BER",639,"25",0
			
 
				+"BER",7505,"25",0
			
 
				+"BER",3511,"25",0
			
 
				+"BER",2970,"25",0
			
 
				+"BER",8189,"25",0
			
 
				+"BER",7465,"25",0
			
 
				+"BER",7965,"25",0
			
 
				+"BER",4626,"25",0
			
 
				+"BER",6971,"25",0
			
 
				+"BER",6047,"03",1
			
 
				+"BER",1618,"26",1
			
 
				+"BER",113,"26",1
			
 
				+"BER",8948,"26",1
			
 
				+"BER",9294,"26",1
			
 
				+"BER",6586,"26",1
			
 
				+"BER",4873,"26",1
			
 
				+"BER",8049,"26",1
			
 
				+"BER",4880,"26",1
			
 
				+"BER",9409,"26",1
			
 
				+"BER",293,"26",1
			
 
				+"BER",1800,"26",1
			
 
				+"BER",4572,"10",1
			
 
				+"BER",8993,"27",0
			
 
				+"BER",4301,"27",0
			
 
				+"BER",240,"27",0
			
 
				+"BER",6831,"27",0
			
 
				+"BER",6938,"27",0
			
 
				+"BER",9717,"27",0
			
 
				+"BER",4291,"27",0
			
 
				+"BER",4989,"27",0
			
 
				+"BER",8655,"27",0
			
 
				+"BER",1345,"27",0
			
 
				+"BER",956,"27",0
			
 
				+"BER",7462,"14",1
			
 
				+"BER",820,"28",0
			
 
				+"BER",5449,"28",0
			
 
				+"BER",4644,"28",0
			
 
				+"BER",1844,"28",0
			
 
				+"BER",9762,"28",0
			
 
				+"BER",2437,"28",0
			
 
				+"BER",2253,"28",0
			
 
				+"BER",7170,"28",0
			
 
				+"BER",5694,"28",0
			
 
				+"BER",4704,"28",0
			
 
				+"BER",4675,"28",0
			
 
				+"BER",7137,"17",1
			
 
				+"BER",9770,"29",0
			
 
				+"BER",6788,"29",0
			
 
				+"BER",8885,"29",0
			
 
				+"BER",6733,"29",0
			
 
				+"BER",4482,"29",0
			
 
				+"BER",4266,"29",0
			
 
				+"BER",2880,"29",0
			
 
				+"BER",8110,"29",0
			
 
				+"BER",3048,"29",0
			
 
				+"BER",1145,"29",0
			
 
				+"BER",397,"29",0
			
 
				+"BER",1805,"18",1
			
 
				+"BER",145,"30",0
			
 
				+"BER",2902,"30",0
			
 
				+"BER",3548,"30",0
			
 
				+"BER",833,"30",0
			
 
				+"BER",4550,"30",0
			
 
				+"BER",3758,"30",0
			
 
				+"BER",9317,"30",0
			
 
				+"BER",6205,"30",0
			
 
				+"BER",4144,"30",0
			
 
				+"BER",2330,"30",0
			
 
				+"BER",176,"30",0
			
 
				+"BER",5849,"26",1
			
 
				+"BER",4988,"31",1
			
 
				+"BER",9337,"31",1
			
 
				+"BER",16,"31",1
			
 
				+"BER",4966,"31",1
			
 
				+"BER",9291,"31",1
			
 
				+"BER",4726,"31",1
			
 
				+"BER",4041,"31",1
			
 
				+"BER",3991,"31",1
			
 
				+"BER",1403,"31",1
			
 
				+"BER",9241,"31",1
			
 
				+"BER",5315,"31",1
			
 
				+"BER",3135,"31",1
			
 
				+"BER",7186,"32",2
			
 
				+"BER",7270,"32",2
			
 
				+"BER",1238,"32",2
			
 
				+"BER",1668,"32",2
			
 
				+"BER",9643,"32",2
			
 
				+"BER",3662,"32",2
			
 
				+"BER",404,"32",2
			
 
				+"BER",7339,"32",2
			
 
				+"BER",2941,"32",2
			
 
				+"BER",6835,"32",2
			
 
				+"BER",3994,"32",2
			
 
				+"BER",5407,"37",1
			
 
				+"BER",7643,"33",0
			
 
				+"BER",5065,"33",0
			
 
				+"BER",6143,"33",0
			
 
				+"BER",1328,"33",0
			
 
				+"BER",2379,"33",0
			
 
				+"BER",56,"33",0
			
 
				+"BER",4590,"33",0
			
 
				+"BER",1760,"33",0
			
 
				+"BER",955,"33",0
			
 
				+"BER",4636,"33",0
			
 
				+"BER",6315,"33",0
			
 
				+"BER",4071,"38",1
			
 
				+"BER",9411,"34",0
			
 
				+"BER",7221,"34",0
			
 
				+"BER",2744,"34",0
			
 
				+"BER",4073,"34",0
			
 
				+"BER",4520,"34",0
			
 
				+"BER",6002,"34",0
			
 
				+"BER",5906,"34",0
			
 
				+"BER",4964,"34",0
			
 
				+"BER",3778,"34",0
			
 
				+"BER",65,"34",0
			
 
				+"BER",9645,"34",0
			
 
				+"BER",127,"41",1
			
 
				+"BER",7066,"35",0
			
 
				+"BER",2973,"35",0
			
 
				+"BER",9973,"35",0
			
 
				+"BER",4141,"35",0
			
 
				+"BER",1862,"35",0
			
 
				+"BER",6735,"35",0
			
 
				+"BER",4679,"35",0
			
 
				+"BER",818,"35",0
			
 
				+"BER",7117,"35",0
			
 
				+"BER",563,"35",0
			
 
				+"BER",5187,"35",0
			
 
				+"BER",2231,"42",1
			
 
				+"BER",5750,"36",3
			
 
				+"BER",9333,"36",3
			
 
				+"BER",2948,"36",3
			
 
				+"BER",7701,"36",3
			
 
				+"BER",2048,"36",3
			
 
				+"BER",1103,"36",3
			
 
				+"BER",8594,"36",3
			
 
				+"BER",2984,"36",3
			
 
				+"BER",8263,"36",3
			
 
				+"BER",2074,"36",3
			
 
				+"BER",4534,"36",3
			
 
				+"BER",8102,"13",2
			
 
				+"BER",4127,"37",1
			
 
				+"BER",9531,"37",1
			
 
				+"BER",701,"37",1
			
 
				+"BER",6258,"37",1
			
 
				+"BER",4128,"37",1
			
 
				+"BER",559,"37",1
			
 
				+"BER",8790,"37",1
			
 
				+"BER",6924,"37",1
			
 
				+"BER",9993,"37",1
			
 
				+"BER",4897,"37",1
			
 
				+"BER",2410,"37",1
			
 
				+"BER",5480,"15",2
			
 
				+"BER",9998,"38",1
			
 
				+"BER",5193,"38",1
			
 
				+"BER",9457,"38",1
			
 
				+"BER",6345,"38",1
			
 
				+"BER",2519,"38",1
			
 
				+"BER",7072,"38",1
			
 
				+"BER",7953,"38",1
			
 
				+"BER",6483,"38",1
			
 
				+"BER",8510,"38",1
			
 
				+"BER",4168,"38",1
			
 
				+"BER",8991,"38",1
			
 
				+"BER",4698,"32",2
			
 
				+"BER",6495,"39",2
			
 
				+"BER",571,"39",2
			
 
				+"BER",8235,"39",2
			
 
				+"BER",1849,"39",2
			
 
				+"BER",4091,"39",2
			
 
				+"BER",4182,"39",2
			
 
				+"BER",2828,"39",2
			
 
				+"BER",5991,"39",2
			
 
				+"BER",2088,"39",2
			
 
				+"BER",9701,"39",2
			
 
				+"BER",1899,"39",2
			
 
				+"BER",1550,"39",2
			
 
				+"BER",677,"40",2
			
 
				+"BER",9228,"40",2
			
 
				+"BER",6997,"40",2
			
 
				+"BER",8193,"40",2
			
 
				+"BER",2517,"40",2
			
 
				+"BER",1801,"40",2
			
 
				+"BER",3447,"40",2
			
 
				+"BER",777,"40",2
			
 
				+"BER",4078,"40",2
			
 
				+"BER",576,"40",2
			
 
				+"BER",4839,"40",2
			
 
				+"BER",5604,"40",2
			
 
				+"BER",4886,"41",1
			
 
				+"BER",7823,"41",1
			
 
				+"BER",625,"41",1
			
 
				+"BER",7619,"41",1
			
 
				+"BER",4152,"41",1
			
 
				+"BER",3405,"41",1
			
 
				+"BER",1189,"41",1
			
 
				+"BER",8069,"41",1
			
 
				+"BER",5633,"41",1
			
 
				+"BER",2024,"41",1
			
 
				+"BER",5605,"41",1
			
 
				+"BER",547,"36",3
			
 
				+"BER",4503,"42",1
			
 
				+"BER",8114,"42",1
			
 
				+"BER",4823,"42",1
			
 
				+"BER",2090,"42",1
			
 
				+"BER",4528,"42",1
			
 
				+"BER",5016,"42",1
			
 
				+"BER",7025,"42",1
			
 
				+"BER",6570,"42",1
			
 
				+"BER",9892,"42",1
			
 
				+"BER",6801,"42",1
			
 
				+"BER",2489,"42",1
			
 
				+"BER",2798,"44",3
			
 
				+"BER",3663,"43",0
			
 
				+"BER",6939,"43",0
			
 
				+"BER",2476,"43",0
			
 
				+"BER",4718,"43",0
			
 
				+"BER",4051,"43",0
			
 
				+"BER",7942,"43",0
			
 
				+"BER",357,"43",0
			
 
				+"BER",6479,"43",0
			
 
				+"BER",396,"43",0
			
 
				+"BER",8,"43",0
			
 
				+"BER",3489,"43",0
			
 
				+"BER",9369,"04",4
			
 
				+"BER",3108,"44",3
			
 
				+"BER",4755,"44",3
			
 
				+"BER",1196,"44",3
			
 
				+"BER",2261,"44",3
			
 
				+"BER",5678,"44",3
			
 
				+"BER",2200,"44",3
			
 
				+"BER",7609,"44",3
			
 
				+"BER",2417,"44",3
			
 
				+"BER",2307,"44",3
			
 
				+"BER",5267,"44",3
			
 
				+"BER",8104,"44",3
			
 
				+"BER",1713,"45",1
			
 
				+"BER",6336,"45",1
			
 
				+"BER",4604,"45",1
			
 
				+"BER",21,"45",1
			
 
				+"BER",6222,"45",1
			
 
				+"BER",4596,"45",1
			
 
				+"BER",8101,"45",1
			
 
				+"BER",9541,"45",1
			
 
				+"BER",5402,"45",1
			
 
				+"BER",1306,"45",1
			
 
				+"BER",6411,"45",1
			
 
				+"BER",2242,"23",4
			
 
				+"BER",5162,"46",0
			
 
				+"BER",2446,"46",0
			
 
				+"BER",1427,"46",0
			
 
				+"BER",6457,"46",0
			
 
				+"BER",6507,"46",0
			
 
				+"BER",4714,"46",0
			
 
				+"BER",2855,"46",0
			
 
				+"BER",8222,"46",0
			
 
				+"BER",2957,"46",0
			
 
				+"BER",1330,"46",0
			
 
				+"BER",4841,"46",0
			
 
				+"ROW",8458,"C004",0
			
 
				+"ROW",3664,"C004",0
			
 
				+"ROW",7613,"C004",0
			
 
				+"ROW",1875,"C004",0
			
 
				+"ROW",7320,"C004",0
			
 
				+"ROW",2099,"C004",0
			
 
				+"ROW",7102,"C004",0
			
 
				+"ROW",1916,"C006",1
			
 
				+"ROW",5670,"C006",1
			
 
				+"ROW",4009,"C006",1
			
 
				+"ROW",3430,"C006",1
			
 
				+"ROW",2006,"C006",1
			
 
				+"ROW",6822,"C006",1
			
 
				+"ROW",572,"C006",1
			
 
				+"ROW",8080,"C008",2
			
 
				+"ROW",6164,"C008",2
			
 
				+"ROW",4955,"C008",2
			
 
				+"ROW",8636,"C008",2
			
 
				+"ROW",4877,"C008",2
			
 
				+"ROW",2542,"C008",2
			
 
				+"ROW",3825,"C008",2
			
 
				+"ROW",3417,"C010",1
			
 
				+"ROW",5644,"C010",1
			
 
				+"ROW",1672,"C010",1
			
 
				+"ROW",6276,"C010",1
			
 
				+"ROW",6969,"C010",1
			
 
				+"ROW",4224,"C010",1
			
 
				+"ROW",5715,"C010",1
			
 
				+"ROW",4913,"C012",0
			
 
				+"ROW",5355,"C012",0
			
 
				+"ROW",7090,"C012",0
			
 
				+"ROW",1768,"C012",0
			
 
				+"ROW",3350,"C012",0
			
 
				+"ROW",1065,"C012",0
			
 
				+"ROW",454,"C012",0
			
 
				+"ROW",8807,"C016",1
			
 
				+"ROW",3219,"C016",1
			
 
				+"ROW",6970,"C016",1
			
 
				+"ROW",1815,"C016",1
			
 
				+"ROW",8669,"C016",1
			
 
				+"ROW",7144,"C016",1
			
 
				+"ROW",3486,"C016",1
			
 
				+"ROW",1129,"C018",1
			
 
				+"ROW",8738,"C018",1
			
 
				+"ROW",9801,"C018",1
			
 
				+"ROW",4431,"C018",1
			
 
				+"ROW",3761,"C018",1
			
 
				+"ROW",5289,"C018",1
			
 
				+"ROW",2196,"C018",1
			
 
				+"ROW",1081,"C019",0
			
 
				+"ROW",2945,"C019",0
			
 
				+"ROW",7980,"C019",0
			
 
				+"ROW",4865,"C019",0
			
 
				+"ROW",9408,"C019",0
			
 
				+"ROW",8905,"C019",0
			
 
				+"ROW",4425,"C023",2
			
 
				+"ROW",7345,"C023",2
			
 
				+"ROW",4132,"C023",2
			
 
				+"ROW",7132,"C023",2
			
 
				+"ROW",2925,"C023",2
			
 
				+"ROW",948,"C023",2
			
 
				+"ROW",7288,"C024",0
			
 
				+"ROW",2516,"C024",0
			
 
				+"ROW",2745,"C024",0
			
 
				+"ROW",8034,"C024",0
			
 
				+"ROW",2014,"C024",0
			
 
				+"ROW",9497,"C024",0
			
 
				+"ROW",7999,"C024",0
			
 
				+"ROW",9245,"C025",0
			
 
				+"ROW",3438,"C025",0
			
 
				+"ROW",1132,"C025",0
			
 
				+"ROW",9620,"C025",0
			
 
				+"ROW",490,"C025",0
			
 
				+"ROW",6339,"C025",0
			
 
				+"ROW",2296,"C025",0
			
 
				+"ROW",9269,"C026",2
			
 
				+"ROW",7866,"C026",2
			
 
				+"ROW",3628,"C026",2
			
 
				+"ROW",3584,"C026",2
			
 
				+"ROW",1156,"C026",2
			
 
				+"ROW",7897,"C026",2
			
 
				+"ROW",558,"C026",2
			
 
				+"ROW",2759,"C029",0
			
 
				+"ROW",9826,"C029",0
			
 
				+"ROW",4148,"C029",0
			
 
				+"ROW",7088,"C029",0
			
 
				+"ROW",96,"C029",0
			
 
				+"ROW",7148,"C029",0
			
 
				+"ROW",9057,"C030",1
			
 
				+"ROW",4816,"C030",1
			
 
				+"ROW",2391,"C030",1
			
 
				+"ROW",7437,"C030",1
			
 
				+"ROW",1522,"C030",1
			
 
				+"ROW",5536,"C030",1
			
 
				+"ROW",8008,"C030",1
			
 
				+"ROW",7413,"C035",0
			
 
				+"ROW",6648,"C035",0
			
 
				+"ROW",5600,"C035",0
			
 
				+"ROW",1905,"C035",0
			
 
				+"ROW",6834,"C035",0
			
 
				+"ROW",3945,"C035",0
			
 
				+"ROW",3988,"C035",0
			
 
				+"ROW",5272,"C036",2
			
 
				+"ROW",6431,"C036",2
			
 
				+"ROW",4214,"C036",2
			
 
				+"ROW",2298,"C036",2
			
 
				+"ROW",9170,"C036",2
			
 
				+"ROW",2041,"C036",2
			
 
				+"ROW",3391,"C036",2
			
 
				+"ROW",1680,"C037",0
			
 
				+"ROW",5318,"C037",0
			
 
				+"ROW",7765,"C037",0
			
 
				+"ROW",3673,"C037",0
			
 
				+"ROW",5182,"C037",0
			
 
				+"ROW",66,"C037",0
			
 
				+"ROW",6368,"C037",0
			
 
				+"ROW",8431,"C038",0
			
 
				+"ROW",3905,"C038",0
			
 
				+"ROW",9498,"C038",0
			
 
				+"ROW",2176,"C038",0
			
 
				+"ROW",7585,"C038",0
			
 
				+"ROW",1937,"C038",0
			
 
				+"ROW",4630,"C038",0
			
 
				+"ROW",270,"C040",0
			
 
				+"ROW",4392,"C040",0
			
 
				+"ROW",8105,"C040",0
			
 
				+"ROW",5655,"C040",0
			
 
				+"ROW",5903,"C040",0
			
 
				+"ROW",2956,"C040",0
			
 
				+"ROW",2271,"C040",0
			
 
				+"ROW",3366,"C041",2
			
 
				+"ROW",9775,"C041",2
			
 
				+"ROW",8108,"C041",2
			
 
				+"ROW",40,"C041",2
			
 
				+"ROW",4011,"C041",2
			
 
				+"ROW",8031,"C041",2
			
 
				+"ROW",5818,"C041",2
			
 
				+"ROW",2693,"C042",0
			
 
				+"ROW",1102,"C042",0
			
 
				+"ROW",668,"C042",0
			
 
				+"ROW",2774,"C042",0
			
 
				+"ROW",8299,"C042",0
			
 
				+"ROW",3978,"C042",0
			
 
				+"ROW",8421,"C042",0
			
 
				+"ROW",8357,"C046",1
			
 
				+"ROW",6027,"C046",1
			
 
				+"ROW",8296,"C046",1
			
 
				+"ROW",6133,"C046",1
			
 
				+"ROW",908,"C046",1
			
 
				+"ROW",8083,"C046",1
			
 
				+"ROW",8041,"C046",1
			
 
				+"ROW",112,"C047",0
			
 
				+"ROW",2834,"C047",0
			
 
				+"ROW",2731,"C047",0
			
 
				+"ROW",2144,"C047",0
			
 
				+"ROW",8044,"C047",0
			
 
				+"ROW",5931,"C047",0
			
 
				+"ROW",7468,"C047",0
			
 
				+"ROW",1324,"C049",1
			
 
				+"ROW",2671,"C049",1
			
 
				+"ROW",9074,"C049",1
			
 
				+"ROW",3583,"C049",1
			
 
				+"ROW",2400,"C049",1
			
 
				+"ROW",4602,"C049",1
			
 
				+"ROW",5368,"C050",0
			
 
				+"ROW",2028,"C050",0
			
 
				+"ROW",7995,"C050",0
			
 
				+"ROW",5420,"C050",0
			
 
				+"ROW",4761,"C050",0
			
 
				+"ROW",5830,"C050",0
			
 
				+"ROW",9271,"C050",0
			
 
				+"ROW",3282,"C054",1
			
 
				+"ROW",405,"C054",1
			
 
				+"ROW",5250,"C054",1
			
 
				+"ROW",1735,"C054",1
			
 
				+"ROW",1199,"C054",1
			
 
				+"ROW",138,"C054",1
			
 
				+"ROW",2574,"C054",1
			
 
				+"ROW",5802,"C055",1
			
 
				+"ROW",9492,"C055",1
			
 
				+"ROW",6670,"C055",1
			
 
				+"ROW",482,"C055",1
			
 
				+"ROW",7945,"C055",1
			
 
				+"ROW",4684,"C055",1
			
 
				+"ROW",3307,"C055",1
			
 
				+"ROW",4131,"C056",2
			
 
				+"ROW",2713,"C056",2
			
 
				+"ROW",934,"C056",2
			
 
				+"ROW",9444,"C056",2
			
 
				+"ROW",603,"C056",2
			
 
				+"ROW",7266,"C056",2
			
 
				+"ROW",9204,"C056",2
			
 
				+"ROW",2811,"C061",1
			
 
				+"ROW",7427,"C061",1
			
 
				+"ROW",3909,"C061",1
			
 
				+"ROW",8679,"C061",1
			
 
				+"ROW",8897,"C061",1
			
 
				+"ROW",4086,"C061",1
			
 
				+"ROW",272,"C066",0
			
 
				+"ROW",4967,"C066",0
			
 
				+"ROW",5684,"C066",0
			
 
				+"ROW",7411,"C066",0
			
 
				+"ROW",3676,"C066",0
			
 
				+"ROW",4302,"C066",0
			
 
				+"ROW",5746,"C066",0
			
 
				+"ROW",2803,"C067",1
			
 
				+"ROW",8152,"C067",1
			
 
				+"ROW",7973,"C067",1
			
 
				+"ROW",9018,"C067",1
			
 
				+"ROW",49,"C067",1
			
 
				+"ROW",8617,"C067",1
			
 
				+"ROW",5752,"C067",1
			
 
				+"ROW",5674,"C068",0
			
 
				+"ROW",2365,"C068",0
			
 
				+"ROW",851,"C068",0
			
 
				+"ROW",310,"C068",0
			
 
				+"ROW",3881,"C068",0
			
 
				+"ROW",1278,"C068",0
			
 
				+"ROW",4603,"C068",0
			
 
				+"ROW",2452,"C076",0
			
 
				+"ROW",2657,"C076",0
			
 
				+"ROW",2792,"C076",0
			
 
				+"ROW",1294,"C076",0
			
 
				+"ROW",9145,"C076",0
			
 
				+"ROW",9033,"C078",0
			
 
				+"ROW",9222,"C078",0
			
 
				+"ROW",8185,"C078",0
			
 
				+"ROW",6825,"C078",0
			
 
				+"ROW",2127,"C078",0
			
 
				+"ROW",3852,"C078",0
			
 
				+"ROW",7085,"C078",0
			
 
				+"ROW",7798,"C083",0
			
 
				+"ROW",5083,"C083",0
			
 
				+"ROW",4820,"C083",0
			
 
				+"ROW",3534,"C083",0
			
 
				+"SOD",9733,"CW167",0
			
 
				+"SOD",4787,"CW167",0
			
 
				+"SOD",2135,"CW167",0
			
 
				+"SOD",9858,"CW167",0
			
 
				+"SOD",1902,"CW168",0
			
 
				+"SOD",8445,"CW168",0
			
 
				+"SOD",5426,"CW168",0
			
 
				+"SOD",9302,"CW168",0
			
 
				+"SOD",9854,"CW173",0
			
 
				+"SOD",3942,"CW173",0
			
 
				+"SOD",4852,"CW173",0
			
 
				+"SOD",3263,"CW173",0
			
 
				+"SOD",7525,"CW173",0
			
 
				+"SOD",5134,"CW174",0
			
 
				+"SOD",8891,"CW170",0
			
 
				+"SOD",3542,"CW170",0
			
 
				+"SOD",9106,"CW170",0
			
 
				+"SOD",4081,"CW170",0
			
 
				+"SOD",8496,"CW176",0
			
 
				+"SOD",8059,"CW176",0
			
 
				+"SOD",5223,"CW176",0
			
 
				+"SOD",8768,"CW176",0
			
 
				+"SOD",5477,"CW186",0
			
 
				+"SOD",2042,"CW186",0
			
 
				+"SOD",8531,"CW186",0
			
 
				+"SOD",6045,"CW186",0
			
 
				+"SOD",9774,"CW184",0
			
 
				+"SOD",4705,"CW184",0
			
 
				+"SOD",516,"CW184",0
			
 
				+"SOD",4736,"CW184",0
			
 
				+"SOD",6549,"CW184",0
			
 
				+"SOD",8560,"CW181",0
			
 
				+"SOD",8924,"CW181",0
			
 
				+"SOD",4483,"CW180",0
			
 
				+"SOD",3451,"CW180",0
			
 
				+"SOD",2790,"CW121",0
			
 
				+"SOD",8181,"CW121",0
			
 
				+"SOD",1499,"CW121",0
			
 
				+"SOD",5333,"CW121",0
			
 
				+"SOD",9440,"CW121",0
			
 
				+"SOD",9527,"CW004",1
			
 
				+"SOD",8822,"CW004",1
			
 
				+"SOD",3634,"CW004",1
			
 
				+"WAR",7928,"204",0
			
 
				+"WAR",1988,"204",0
			
 
				+"WAR",8684,"274",3
			
 
				+"WAR",7734,"274",3
			
 
				+"WAR",8010,"274",3
			
 
				+"WAR",3833,"300",3
			
 
				+"WAR",3528,"300",3
			
 
				+"WAR",9755,"340",1
			
 
				+"WAR",4878,"340",1
			
 
				+"WAR",453,"340",1
			
 
				+"WAR",3090,"530",1
			
 
				+"WAR",2337,"530",1
			
 
				+"WAR",7372,"530",1
			
 
				+"WAR",8554,"530",1
			
 
				+"WAR",7282,"583",1
			
 
				+"WAR",8525,"583",1
			
 
				+"WAR",4995,"583",1
			
 
				+"WAR",3174,"623",0
			
 
				+"WAR",2974,"651",0
			
 
				+"WAR",5940,"651",0
			
 
				+"WAR",5959,"651",0
			
 
				+"WAR",2535,"747",0
			
 
				+"WAR",5243,"747",0
			
 
				+"WAR",4707,"747",0
			
 
				+"WAR",5792,"804",1
			
 
				+"WAR",602,"833",0
			
 
				+"WAR",9398,"833",0
			
 
				+"WAR",5481,"848",1
			
 
				+"WAR",6958,"848",1
			
 
				+"WAR",1130,"848",1
			
 
				+"WAR",676,"857",1
			
 
				+"WAR",5613,"857",1
			
 
				+"WAR",8743,"857",1
			
 
				+"WAR",5835,"955",4
			
 
				+"WAR",5501,"955",4
			
 
				+"WAR",8602,"955",4
			
 
				+"WAR",4552,"973",0
			
 
				+"WAR",9769,"973",0
			
 
				+"WAR",9427,"973",0
			
 
				+"WAR",7975,"973",0
			
--- a/input/siblings.csv
+++ b/input/siblings.csv
@@ -0,0 +1,107 @@
 
				+,corpus,child_id,n_siblings
			
 
				+0,bergelson,1,1
			
 
				+1,bergelson,2,1
			
 
				+2,bergelson,3,1
			
 
				+3,bergelson,4,4
			
 
				+4,bergelson,6,0
			
 
				+5,bergelson,7,0
			
 
				+6,bergelson,8,0
			
 
				+7,bergelson,9,0
			
 
				+8,bergelson,10,1
			
 
				+9,bergelson,11,0
			
 
				+10,bergelson,12,0
			
 
				+11,bergelson,13,2
			
 
				+12,bergelson,14,1
			
 
				+13,bergelson,15,2
			
 
				+14,bergelson,16,0
			
 
				+15,bergelson,17,1
			
 
				+16,bergelson,18,1
			
 
				+17,bergelson,19,0
			
 
				+18,bergelson,20,0
			
 
				+19,bergelson,21,0
			
 
				+20,bergelson,22,0
			
 
				+21,bergelson,23,4
			
 
				+22,bergelson,25,0
			
 
				+23,bergelson,26,1
			
 
				+24,bergelson,27,0
			
 
				+25,bergelson,28,0
			
 
				+26,bergelson,29,0
			
 
				+27,bergelson,30,0
			
 
				+28,bergelson,31,1
			
 
				+29,bergelson,32,2
			
 
				+30,bergelson,33,0
			
 
				+31,bergelson,34,0
			
 
				+32,bergelson,35,0
			
 
				+33,bergelson,36,3
			
 
				+34,bergelson,37,1
			
 
				+35,bergelson,38,1
			
 
				+36,bergelson,39,2
			
 
				+37,bergelson,40,2
			
 
				+38,bergelson,41,1
			
 
				+39,bergelson,42,1
			
 
				+40,bergelson,43,0
			
 
				+41,bergelson,44,3
			
 
				+42,bergelson,45,1
			
 
				+43,bergelson,46,0
			
 
				+44,lucid,C004,0
			
 
				+45,lucid,C006,1
			
 
				+46,lucid,C008,2
			
 
				+47,lucid,C010,1
			
 
				+48,lucid,C012,0
			
 
				+49,lucid,C016,1
			
 
				+50,lucid,C018,1
			
 
				+51,lucid,C019,0
			
 
				+52,lucid,C023,2
			
 
				+53,lucid,C024,0
			
 
				+54,lucid,C025,0
			
 
				+55,lucid,C026,2
			
 
				+56,lucid,C029,0
			
 
				+57,lucid,C030,1
			
 
				+58,lucid,C035,0
			
 
				+59,lucid,C036,2
			
 
				+60,lucid,C037,0
			
 
				+61,lucid,C038,0
			
 
				+62,lucid,C040,0
			
 
				+63,lucid,C041,2
			
 
				+64,lucid,C042,0
			
 
				+65,lucid,C046,1
			
 
				+66,lucid,C047,0
			
 
				+67,lucid,C049,1
			
 
				+68,lucid,C050,0
			
 
				+69,lucid,C054,1
			
 
				+70,lucid,C055,1
			
 
				+71,lucid,C056,2
			
 
				+72,lucid,C061,1
			
 
				+73,lucid,C066,0
			
 
				+74,lucid,C067,1
			
 
				+75,lucid,C068,0
			
 
				+76,lucid,C076,0
			
 
				+77,lucid,C078,0
			
 
				+78,lucid,C083,0
			
 
				+79,warlaumont,204,0
			
 
				+80,warlaumont,274,3
			
 
				+81,warlaumont,300,3
			
 
				+82,warlaumont,340,1
			
 
				+83,warlaumont,530,1
			
 
				+84,warlaumont,583,1
			
 
				+85,warlaumont,623,0
			
 
				+86,warlaumont,651,0
			
 
				+87,warlaumont,747,0
			
 
				+88,warlaumont,804,1
			
 
				+89,warlaumont,833,0
			
 
				+90,warlaumont,848,1
			
 
				+91,warlaumont,857,1
			
 
				+92,warlaumont,955,4
			
 
				+93,warlaumont,973,0
			
 
				+94,winnipeg,C004,1
			
 
				+95,winnipeg,C121,0
			
 
				+96,winnipeg,C167,0
			
 
				+97,winnipeg,C168,0
			
 
				+98,winnipeg,C170,0
			
 
				+99,winnipeg,C173,0
			
 
				+100,winnipeg,C174,0
			
 
				+101,winnipeg,C176,0
			
 
				+102,winnipeg,C180,0
			
 
				+103,winnipeg,C181,0
			
 
				+104,winnipeg,C184,0
			
 
				+105,winnipeg,C186,0