|
@@ -27,6 +27,7 @@ parser.add_argument("--apply-bias-from", type=str, default="")
|
|
|
parser.add_argument("--chains", default=4, type=int)
|
|
|
parser.add_argument("--samples", default=2000, type=int)
|
|
|
parser.add_argument("--validation", default=0, type=float)
|
|
|
+parser.add_argument("--simulated-children", default=40, type=int)
|
|
|
parser.add_argument("--output", default="corpus_bias")
|
|
|
args = parser.parse_args()
|
|
|
|
|
@@ -130,6 +131,7 @@ def compute_counts(parameters):
|
|
|
)
|
|
|
|
|
|
d = {}
|
|
|
+ keep_child = True
|
|
|
for i, speaker_A in enumerate(speakers):
|
|
|
for j, speaker_B in enumerate(speakers):
|
|
|
if i != j:
|
|
@@ -141,11 +143,16 @@ def compute_counts(parameters):
|
|
|
|
|
|
d[f"vtc_{i}_{j}"] = z
|
|
|
|
|
|
+ if z > len(truth[speaker_B]):
|
|
|
+ keep_child = False
|
|
|
+
|
|
|
d[f"truth_{i}"] = len(truth[speaker_A])
|
|
|
d["child"] = child
|
|
|
|
|
|
d["duration"] = ann["duration"].sum() / 2 / 1000
|
|
|
- data.append(d)
|
|
|
+
|
|
|
+ if keep_child:
|
|
|
+ data.append(d)
|
|
|
|
|
|
return pd.DataFrame(data).assign(
|
|
|
corpus=corpus,
|
|
@@ -345,7 +352,7 @@ if __name__ == "__main__":
|
|
|
"n_groups": data["child"].nunique(),
|
|
|
"n_corpora": data["corpus"].nunique(),
|
|
|
"n_validation": max(1, int(truth.shape[0] * args.validation)),
|
|
|
- "n_sim": 40,
|
|
|
+ "n_sim": args.simulated_children,
|
|
|
"group": 1 + data["child"].astype("category").cat.codes.values,
|
|
|
"corpus": 1 + corpora,
|
|
|
"selected_corpus": (
|