# speech_distribution.py (2.0 KB)
  1. #!/usr/bin/env python3
  2. from ChildProject.projects import ChildProject
  3. from ChildProject.annotations import AnnotationManager
  4. from ChildProject.pipelines.metrics import AclewMetrics
  5. import argparse
  6. import datalad.api
  7. from os.path import join as opj
  8. from os.path import basename, exists
  9. import multiprocessing as mp
  10. import numpy as np
  11. import pandas as pd
  12. import pickle
  13. from scipy.stats import gamma
  14. from matplotlib import pyplot as plt
  15. def rates(parameters):
  16. corpus = parameters["corpus"]
  17. annotator = parameters["annotator"]
  18. speakers = ["CHI", "OCH", "FEM", "MAL"]
  19. project = ChildProject(parameters["path"])
  20. am = AnnotationManager(project)
  21. am.read()
  22. pipeline = AclewMetrics(
  23. project,
  24. vtc=annotator,
  25. alice=None,
  26. vcm=None,
  27. from_time="09:00",
  28. to_time="18:00",
  29. by="child_id",
  30. )
  31. metrics = pipeline.extract()
  32. print(metrics)
  33. return pd.DataFrame(metrics).assign(corpus=corpus)
  34. if __name__ == "__main__":
  35. annotators = pd.read_csv("input/annotators.csv")
  36. annotators = annotators[~annotators['annotator'].str.startswith('eaf_2021')]
  37. annotators["path"] = annotators["corpus"].apply(lambda c: opj("input", c))
  38. with mp.Pool(processes=8) as pool:
  39. rates = pd.concat(pool.map(rates, annotators.to_dict(orient="records")))
  40. rates.dropna(inplace = True)
  41. params = []
  42. speakers = ['CHI', 'OCH', 'FEM', 'MAL']
  43. for speaker in speakers:
  44. data = rates[f"voc_{speaker.lower()}_ph"]*9
  45. a, loc, scale = gamma.fit(data, floc=0)
  46. x = np.linspace(0, data.max(), 100)
  47. y = gamma.pdf(x, a, loc, scale)
  48. plt.cla()
  49. plt.clf()
  50. plt.hist(data, bins = 10, density = True)
  51. plt.plot(x, y)
  52. plt.savefig(f'output/dist_{speaker}.png')
  53. params.append({
  54. 'alpha': a,
  55. 'beta': 1/scale,
  56. 'speaker': speaker
  57. })
  58. pd.DataFrame(params).set_index('speaker').to_csv('output/speech_dist.csv')