12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- \documentclass{article}
- \usepackage[utf8]{inputenc}
- \usepackage{amsmath}
- \usepackage{amssymb}
- \usepackage{bbm}
- \usepackage{stmaryrd}
- \title{Speaker confusion models}
- \author{}
- \date{}
- \begin{document}
- \maketitle
- \tableofcontents
- \section{Models}
- \subsection{Simple model}
- This simple model assumes that confusion rates (the probabilities that the algorithm attributes a vocalization from a certain speaker to another speaker) depend on the children only, and that they all derive from the same distribution, regardless of the corpus (and the surveyed population).
- The simple model is defined as follows:
- \begin{align}
- \mu_{ij} &\sim \mathcal{U}(0,1) \\
- \eta_{ij} &\sim \mathrm{Pareto}(1.5,1) \\
- \alpha_{ij} &= \mu_{ij} \eta_{ij} \\
- \beta_{ij} &= (1-\mu_{ij}) \eta_{ij} \\
- p_{c,i,j}|\alpha_{ij},\beta_{ij} &\sim \mathrm{Beta}(\alpha_{ij},\beta_{ij})\label{eqref:pcij} \\
- N_{k,i,j}|t_{k,j},p_{c,i,j} &\sim \mathrm{Binomial}(t_{k,j},p_{c,i,j})
- \end{align}
- Where:
- \begin{itemize}
- \item $i$ is the speaker the diarizer returns (one of FEM, MAL, CHI, OCH)
- \item $j$ is the speaker the human detected (one of FEM, MAL, CHI, OCH)
- \item $k$ is a clip (i.e., a recording section that has been annotated by both a human and a diarizer)
- \item $c$ is the key child to which a clip belongs
- \item $N_{k,i,j}$ is the number of vocalizations the human attributed to $j$ and the diarizer attributed to $i$ for the clip $k$ ($i$ and $j$ could be the same or different categories)
- \item $t_{k,j}$ is the number of vocalizations returned by the human for the clip $k$ and speaker $j$ observed in the data
- \item $p_{c,i,j}$ is the probability that a vocalization from the speaker $j$ will trigger a detection for the speaker $i$, for the child $c$.
- \item $\alpha_{ij}$ are the $\alpha$ hyperparameters for the Beta priors
- \item $\beta_{ij}$ are the $\beta$ hyperparameters for the Beta priors
- \item $\mu_{ij} = \alpha_{ij}/(\alpha_{ij}+\beta_{ij})$ are the means of the Beta priors (i.e., the expected confusion probabilities)
- \item $\eta_{ij} = \alpha_{ij}+\beta_{ij}$ are the effective sample sizes of the Beta priors
- \end{itemize}
- \subsection{Model with corpus bias}
- We extended the previous model by including the effect of potential biases at the level of each corpus. In this model, the confusion rates are not drawn directly from a Beta distribution as in Eq.~\eqref{eqref:pcij}; they are shifted by some amount depending on the corpus:
- \begin{align}
- \sigma_{i,j} &\sim \mathrm{HalfNormal}(0, 1) \\
- b_{\text{corpus},i,j} &\sim \mathrm{Normal}(0, \sigma_{i,j}) \\
- \pi_{c,i,j}|\alpha_{ij},\beta_{ij} &\sim \mathrm{Beta}(\alpha_{ij},\beta_{ij}) \\
- \text{logit} (p_{c,i,j}) &= \text{logit}(\pi_{c,i,j}) + b_{\text{corpus},i,j}\label{eqref:pcij_bias}
- \end{align}
- In this model, $\pi_{c,i,j}$ (which still derives from a Beta distribution) captures the child-level effects, and $b_{\text{corpus},i,j}$ captures corpus-level biases.
- \section{Synthetic datasets}
- We generate datasets under the null hypothesis, i.e., the hypothesis that the amounts of speech from each speaker are uncorrelated:
- \begin{align}
- t_{c,j} | \lambda_{c,j} &\sim \mathrm{Poisson}(\lambda_{c,j}) \\
- \lambda_{c,j} &\sim \mathrm{Gamma}(a_j, b_j)\\
- c &\in \llbracket 1,n_{\text{children}}\rrbracket
- \end{align}
- Where:
- \begin{itemize}
- \item $t_{c,j}$ is the amount of true vocalizations from speaker $j$ of child $c$
- \item $\lambda_{c,j}$ is the latent expected amount of vocalizations for the speaker $j$ and child $c$ (assuming 9 recorded hours per child)
- \item $n_{\text{children}}$ is the number of children
- \item $a_j$ and $b_j$ are parameters fitted to the speech-rate distributions derived from manual annotations of recordings made between 9am and 6pm.
- \end{itemize}
- We simultaneously simulate the effect of the diarization algorithm by applying the selected model to generated datasets:
- \begin{align}
- v_{c,i,j} | t_{c,j},p_{c,i,j} &\sim \mathrm{Binomial}(t_{c,j},p_{c,i,j}) \\
- v_{c,i} &= \sum_{j} v_{c,i,j}\\
- c &\in \llbracket 1,n_{\text{children}}\rrbracket
- \end{align}
- Where $p_{c,i,j}$ is sampled according to the distributions derived using the selected model (Eq.~\eqref{eqref:pcij} or Eq.~\eqref{eqref:pcij_bias}).
- For the model that includes corpus-level bias, the user specifies the corpus whose bias is applied.
- \end{document}
|