Lucas Gautheron committed 1 year ago
commit 9d2339e988
4 changed files with 178 additions and 78 deletions
  1. README.md (+72 -5)
  2. code/models/main.py (+103 -72)
  3. requirements.txt (+0 -1)
  4. requirements.txt (+3 -0)

+ 72 - 5
README.md

@@ -1,7 +1,74 @@
-# Project <insert name>
+# Speaker confusion model
 
-## Dataset structure
+The Voice Type Classifier (VTC) is a speech detection algorithm that classifies each detected speech segment into one of four classes: Female adults (FEM), Male adults (MAL), the Key child (CHI), and Other children (OCH). This makes it possible to measure the speech input children are exposed to as they acquire language, as well as their own speech production.
 
-- All inputs (i.e. building blocks from other sources) are located in
-  `inputs/`.
-- All custom code is located in `code/`.
+However, the VTC is not perfectly accurate: the speaker may be misidentified, or speech may be left undetected.
+For instance, the key child is occasionally confused with other children. Not only do such confusions affect the amount of speech measured within each of these four categories, but they also tend to generate spurious correlations
+between the measured amounts of speech for each kind of speaker. For example, more actual Key child (CHI) speech will systematically and spuriously lead to more detected Other children (OCH) speech due to speaker misidentification, which will in turn appear as a true correlation between the CHI and OCH amounts of speech.
+
+The present Bayesian model captures the errors of the VTC (false negatives and confusions between speakers) in order to assess the impact of such errors on correlation analyses. In principle, such a model also makes it possible to recover unbiased values of the parameters of interest (e.g., the true correlation between two speakers), by translating the bias into wider posterior distributions on these parameters.
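+
+As a toy illustration of this mechanism (the 10% CHI-to-OCH confusion rate below is arbitrary and purely illustrative, not an estimate from the model), confusion alone is enough to correlate detected counts even when the true counts are independent:
+
+```python
+# Toy simulation: independent true CHI and OCH counts, plus a hypothetical
+# 10% CHI -> OCH confusion rate, yield positively correlated detections.
+import numpy as np
+
+rng = np.random.default_rng(0)
+true_chi = rng.poisson(200, size=1000)   # true Key child vocalization counts
+true_och = rng.poisson(50, size=1000)    # true Other children counts, independent of CHI
+confused = rng.binomial(true_chi, 0.10)  # CHI vocalizations misclassified as OCH
+
+detected_och = true_och + confused
+print(np.corrcoef(true_chi, detected_och)[0, 1])  # positive despite true independence
+```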
+
+## Installation
+
+1. The installation of this repository requires DataLad (``apt install datalad`` on Linux; ``brew install datalad`` on macOS; read more, including instructions for Windows, [here](https://handbook.datalad.org/en/latest/intro/installation.html)).
+
+2. Once DataLad is available, the repository can be installed:
+
+```bash
+datalad install -r git@gin.g-node.org:/LAAC-LSCP/speaker-confusion-model.git
+```
+
+This requires access to the corpora that are used to train the model.
+
+3. Install the necessary packages:
+
+```bash
+pip install -r requirements.txt
+```
+
+## Run the model
+
+### Command-Line Arguments
+
+The model can be found in ``code/models/main.py``. Running the model performs the following two tasks simultaneously:
+
+1. Deriving the confusion rates and their posterior distribution
+2. Generating null-hypothesis samples (i.e. assuming no true correlation between speakers' amounts of speech) that simulate a corpus of 40 children with 8 recorded hours each.
+
+```bash
+$ python code/models/main.py  --help
+usage: main.py [-h] [--group {corpus,child}] [--chains CHAINS] [--samples SAMPLES] [--validation VALIDATION] [--output OUTPUT]
+
+main model described throughout the notes.
+
+optional arguments:
+  -h, --help            show this help message and exit
+  --group {corpus,child}
+  --chains CHAINS
+  --samples SAMPLES
+  --validation VALIDATION
+  --output OUTPUT
+```
+
+The ``--group`` parameter controls the primary level of the hierarchical model: the model assumes that confusion rates (i.e. confusion probabilities) vary either across corpora (``corpus``) or across children (``child``).
+
+The ``--chains`` parameter sets the number of MCMC chains, and ``--samples`` controls the number of MCMC samples (warmup excluded).
+
+The ``--validation`` parameter sets the fraction of annotation clips held out for validation rather than training. Set it to 0 in order to use as much data as possible for training.
+
+The ``--output`` parameter controls the output destination: the training data are saved to ``output/samples/data_{output}.pickle`` and the MCMC samples to ``output/samples/fit_{output}.parquet``.
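+
+For instance, ``python code/models/main.py --group corpus --validation 0 --output model3`` fits the model with confusion rates varying across corpora, using the default output name. The sketch below is only an illustration of how the resulting files can be loaded back for inspection (it assumes a run named ``model3`` has completed and that pandas has a parquet engine such as pyarrow available):
+
+```python
+# Sketch: load the artifacts written by code/models/main.py
+# (assumes the default output name, "model3").
+import pickle
+
+import pandas as pd
+
+# Input data passed to Stan (counts, group indices, speech-rate priors, ...)
+with open("output/samples/data_model3.pickle", "rb") as fp:
+    data = pickle.load(fp)
+
+# Posterior samples: one row per draw, one column per model parameter
+fit = pd.read_parquet("output/samples/fit_model3.parquet")
+
+print(data["n_clips"], data["n_groups"])
+print(fit.shape)
+```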
+
+### Confusion probabilities
+
+The marginal posterior distribution of the confusion matrix is shown below:
+
+![](output/fit_vanuatu.png)
+
+
+### Speech distribution
+
+Speech distributions used to generate the simulated "null-hypothesis" corpora are fitted against the training data using Gamma distributions. The code used to fit these distributions is found in ``code/models/speech_distribution``.
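+
+As an illustration of the approach (this is not the repository's fitting code, and the input file and column names below are hypothetical), a Gamma distribution can be fitted to a speaker's speech rates with scipy:
+
+```python
+# Sketch: fit a Gamma distribution to one speaker's speech rates.
+# "speech_rates.csv" and its "chi_rate" column are placeholders, not repository files.
+import pandas as pd
+from scipy import stats
+
+rates = pd.read_csv("speech_rates.csv")["chi_rate"]
+
+# Maximum-likelihood fit with the location fixed at 0, leaving the shape (alpha)
+# and the scale (1 / beta) free, matching the alpha/beta parametrization
+# expected by the model in output/speech_dist.csv.
+alpha, loc, scale = stats.gamma.fit(rates, floc=0)
+beta = 1.0 / scale
+print(alpha, beta)
+```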
+
+The match between the training data and the Gamma parametrization can be observed in various plots in ``output``. See below for the Key child:
+
+![](output/dist_CHI.png)

+ 103 - 72
code/models/main.py

@@ -19,15 +19,18 @@ from pyannote.core import Annotation, Segment, Timeline
 
 import stan
 
-parser = argparse.ArgumentParser(description = 'main model described throughout the notes.')
-parser.add_argument('--group', default = 'child', choices = ['corpus', 'child'])
-parser.add_argument('--chains', default = 4, type = int)
-parser.add_argument('--samples', default = 2000, type = int)
-parser.add_argument('--validation', default = 0, type = float)
-parser.add_argument('--output', default = 'model3')
+parser = argparse.ArgumentParser(
+    description="main model described throughout the notes."
+)
+parser.add_argument("--group", default="child", choices=["corpus", "child"])
+parser.add_argument("--chains", default=4, type=int)
+parser.add_argument("--samples", default=2000, type=int)
+parser.add_argument("--validation", default=0, type=float)
+parser.add_argument("--output", default="model3")
 args = parser.parse_args()
 
-def extrude(self, removed, mode: str = 'intersection'):
+
+def extrude(self, removed, mode: str = "intersection"):
     if isinstance(removed, Segment):
         removed = Timeline([removed])
 
@@ -37,89 +40,117 @@ def extrude(self, removed, mode: str = 'intersection'):
         mode = "strict"
     elif mode == "strict":
         mode = "loose"
-    
+
     return self.crop(truncating_support, mode=mode)
 
+
 def compute_counts(parameters):
-    corpus = parameters['corpus']
-    annotator = parameters['annotator']
-    speakers = ['CHI', 'OCH', 'FEM', 'MAL']
+    corpus = parameters["corpus"]
+    annotator = parameters["annotator"]
+    speakers = ["CHI", "OCH", "FEM", "MAL"]
 
-    project = ChildProject(parameters['path'])
+    project = ChildProject(parameters["path"])
     am = AnnotationManager(project)
     am.read()
 
-    intersection = AnnotationManager.intersection(
-        am.annotations, ['vtc', annotator]
-    )
+    intersection = AnnotationManager.intersection(am.annotations, ["vtc", annotator])
 
-    intersection['path'] = intersection.apply(
-        lambda r: opj(project.path, 'annotations', r['set'], 'converted', r['annotation_filename']),
-        axis = 1
+    intersection["path"] = intersection.apply(
+        lambda r: opj(
+            project.path, "annotations", r["set"], "converted", r["annotation_filename"]
+        ),
+        axis=1,
     )
-    datalad.api.get(list(intersection['path'].unique()))
+    datalad.api.get(list(intersection["path"].unique()))
 
-    intersection = intersection.merge(project.recordings[['recording_filename', 'child_id']], how = 'left')
-    intersection['child'] = corpus + '_' + intersection['child_id'].astype(str)
-    intersection['duration'] = intersection['range_offset']-intersection['range_onset']
-    print(corpus, annotator, (intersection['duration']/1000/2).sum()/3600)
+    intersection = intersection.merge(
+        project.recordings[["recording_filename", "child_id"]], how="left"
+    )
+    intersection["child"] = corpus + "_" + intersection["child_id"].astype(str)
+    intersection["duration"] = (
+        intersection["range_offset"] - intersection["range_onset"]
+    )
+    print(corpus, annotator, (intersection["duration"] / 1000 / 2).sum() / 3600)
 
     data = []
-    for child, ann in intersection.groupby('child'):
-        #print(corpus, child)
+    for child, ann in intersection.groupby("child"):
+        # print(corpus, child)
 
         segments = am.get_collapsed_segments(ann)
-        if 'speaker_type' not in segments.columns:
+        if "speaker_type" not in segments.columns:
             continue
 
-        segments = segments[segments['speaker_type'].isin(speakers)]
-        
+        segments = segments[segments["speaker_type"].isin(speakers)]
+
         vtc = {
-            speaker: segments_to_annotation(segments[(segments['set'] == 'vtc') & (segments['speaker_type'] == speaker)], 'speaker_type').get_timeline()
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == "vtc") & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
             for speaker in speakers
         }
 
         truth = {
-            speaker: segments_to_annotation(segments[(segments['set'] == annotator) & (segments['speaker_type'] == speaker)], 'speaker_type').get_timeline()
+            speaker: segments_to_annotation(
+                segments[
+                    (segments["set"] == annotator)
+                    & (segments["speaker_type"] == speaker)
+                ],
+                "speaker_type",
+            ).get_timeline()
             for speaker in speakers
         }
 
         for speaker_A in speakers:
-            vtc[f'{speaker_A}_vocs_explained'] = vtc[speaker_A].crop(truth[speaker_A], mode = 'loose')
-            vtc[f'{speaker_A}_vocs_fp'] = extrude(vtc[speaker_A], vtc[f'{speaker_A}_vocs_explained'])
-            vtc[f'{speaker_A}_vocs_fn'] = extrude(truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode = 'loose'))
+            vtc[f"{speaker_A}_vocs_explained"] = vtc[speaker_A].crop(
+                truth[speaker_A], mode="loose"
+            )
+            vtc[f"{speaker_A}_vocs_fp"] = extrude(
+                vtc[speaker_A], vtc[f"{speaker_A}_vocs_explained"]
+            )
+            vtc[f"{speaker_A}_vocs_fn"] = extrude(
+                truth[speaker_A], truth[speaker_A].crop(vtc[speaker_A], mode="loose")
+            )
 
             for speaker_B in speakers:
-                vtc[f'{speaker_A}_vocs_fp_{speaker_B}'] = vtc[f'{speaker_A}_vocs_fp'].crop(truth[speaker_B], mode = 'loose')
+                vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = vtc[
+                    f"{speaker_A}_vocs_fp"
+                ].crop(truth[speaker_B], mode="loose")
 
                 for speaker_C in speakers:
                     if speaker_C != speaker_B and speaker_C != speaker_A:
-                        vtc[f'{speaker_A}_vocs_fp_{speaker_B}'] = extrude(
-                            vtc[f'{speaker_A}_vocs_fp_{speaker_B}'],
-                            vtc[f'{speaker_A}_vocs_fp_{speaker_B}'].crop(truth[speaker_C], mode = 'loose')
+                        vtc[f"{speaker_A}_vocs_fp_{speaker_B}"] = extrude(
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"],
+                            vtc[f"{speaker_A}_vocs_fp_{speaker_B}"].crop(
+                                truth[speaker_C], mode="loose"
+                            ),
                         )
 
-
         d = {}
         for i, speaker_A in enumerate(speakers):
             for j, speaker_B in enumerate(speakers):
                 if i != j:
-                    z = len(vtc[f'{speaker_A}_vocs_fp_{speaker_B}'])
+                    z = len(vtc[f"{speaker_A}_vocs_fp_{speaker_B}"])
                 else:
-                    z = min(len(vtc[f'{speaker_A}_vocs_explained']), len(truth[speaker_A]))
+                    z = min(
+                        len(vtc[f"{speaker_A}_vocs_explained"]), len(truth[speaker_A])
+                    )
 
-                d[f'vtc_{i}_{j}'] = z
+                d[f"vtc_{i}_{j}"] = z
 
-            d[f'truth_{i}'] = len(truth[speaker_A])
-            d['child'] = child
+            d[f"truth_{i}"] = len(truth[speaker_A])
+            d["child"] = child
 
-        d['duration'] = ann['duration'].sum()/2/1000
+        d["duration"] = ann["duration"].sum() / 2 / 1000
         data.append(d)
 
     return pd.DataFrame(data).assign(
-        corpus = corpus,
+        corpus=corpus,
     )
 
+
 stan_code = """
 data {
   int<lower=1> n_clips;   // number of clips
@@ -239,47 +270,47 @@ generated quantities {
 """
 
 if __name__ == "__main__":
-    annotators = pd.read_csv('input/annotators.csv')
-    annotators['path'] = annotators['corpus'].apply(lambda c: opj('input', c))
+    annotators = pd.read_csv("input/annotators.csv")
+    annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))
 
-    with mp.Pool(processes = 8) as pool:
-        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient = 'records')))
+    with mp.Pool(processes=8) as pool:
+        data = pd.concat(pool.map(compute_counts, annotators.to_dict(orient="records")))
 
-    data = data.sample(frac = 1)
-    duration = data['duration'].sum()
+    data = data.sample(frac=1)
+    duration = data["duration"].sum()
 
-    vtc = np.moveaxis([[data[f'vtc_{j}_{i}'].values for i in range(4)] for j in range(4)], -1, 0)
-    truth = np.transpose([data[f'truth_{i}'].values for i in range(4)])
+    vtc = np.moveaxis(
+        [[data[f"vtc_{j}_{i}"].values for i in range(4)] for j in range(4)], -1, 0
+    )
+    truth = np.transpose([data[f"truth_{i}"].values for i in range(4)])
 
     print(vtc.shape)
 
-    rates = pd.read_csv('output/speech_dist.csv')
+    rates = pd.read_csv("output/speech_dist.csv")
 
     data = {
-        'n_clips': truth.shape[0],
-        'n_classes': truth.shape[1],
-        'n_groups': data[args.group].nunique(),
-        'n_validation': max(1, int(truth.shape[0]*args.validation)),
-        'n_sim': 40,
-        'group': 1+data[args.group].astype('category').cat.codes.values,
-        'truth': truth.astype(int),
-        'vtc': vtc.astype(int),
-        'rates_alphas': rates['alpha'].values,
-        'rates_betas': rates['beta'].values
+        "n_clips": truth.shape[0],
+        "n_classes": truth.shape[1],
+        "n_groups": data[args.group].nunique(),
+        "n_validation": max(1, int(truth.shape[0] * args.validation)),
+        "n_sim": 40,
+        "group": 1 + data[args.group].astype("category").cat.codes.values,
+        "truth": truth.astype(int),
+        "vtc": vtc.astype(int),
+        "rates_alphas": rates["alpha"].values,
+        "rates_betas": rates["beta"].values,
     }
 
     print(f"clips: {data['n_clips']}")
     print(f"groups: {data['n_groups']}")
-    print("true vocs: {}".format(np.sum(data['truth'])))
-    print("vtc vocs: {}".format(np.sum(data['vtc'])))
+    print("true vocs: {}".format(np.sum(data["truth"])))
+    print("vtc vocs: {}".format(np.sum(data["vtc"])))
     print("duration: {}".format(duration))
 
-    with open(f'output/samples/data_{args.output}.pickle', 'wb') as fp:
+    with open(f"output/samples/data_{args.output}.pickle", "wb") as fp:
         pickle.dump(data, fp, pickle.HIGHEST_PROTOCOL)
 
-    posterior = stan.build(stan_code, data = data)
-    fit = posterior.sample(num_chains = args.chains, num_samples = args.samples)
+    posterior = stan.build(stan_code, data=data)
+    fit = posterior.sample(num_chains=args.chains, num_samples=args.samples)
     df = fit.to_frame()
-    df.to_parquet(f'output/samples/fit_{args.output}.parquet')
-
-
+    df.to_parquet(f"output/samples/fit_{args.output}.parquet")

+ 0 - 1
requirements.txt

@@ -1 +0,0 @@
-.git/annex/objects/12/Gf/MD5E-s19--2ee0a9c2f76b7dcd1f79cd2ab7022f73.txt/MD5E-s19--2ee0a9c2f76b7dcd1f79cd2ab7022f73.txt

+ 3 - 0
requirements.txt

@@ -0,0 +1,3 @@
+ChildProject
+pystan
+pyannote-audio