Scheduled service maintenance on November 22


On Friday, November 22, 2024, between 06:00 CET and 18:00 CET, GIN services will undergo planned maintenance. Extended service interruptions should be expected. We will try to keep downtimes to a minimum, but recommend that users avoid critical tasks, large data uploads, or DOI requests during this time.

We apologize for any inconvenience.

speech_distribution.py 2.0 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
  1. #!/usr/bin/env python3
  2. from ChildProject.projects import ChildProject
  3. from ChildProject.annotations import AnnotationManager
  4. from ChildProject.pipelines.metrics import AclewMetrics
  5. import argparse
  6. import datalad.api
  7. from os.path import join as opj
  8. from os.path import basename, exists
  9. import multiprocessing as mp
  10. import numpy as np
  11. import pandas as pd
  12. import pickle
  13. from scipy.stats import gamma
  14. from matplotlib import pyplot as plt
  def rates(parameters):
      """Extract ACLEW metrics for a single corpus/annotator pair.

      Args:
          parameters: Mapping providing the keys ``"corpus"`` (label attached
              to every returned row), ``"annotator"`` (forwarded to
              ``AclewMetrics`` as ``vtc``) and ``"path"`` (location handed to
              ``ChildProject``).

      Returns:
          A ``pandas.DataFrame`` built from the result of
          ``AclewMetrics.extract()``, with an extra ``corpus`` column holding
          ``parameters["corpus"]``.
      """
      corpus = parameters["corpus"]
      annotator = parameters["annotator"]

      project = ChildProject(parameters["path"])

      # The manager itself is not handed to the pipeline below. It is still
      # created and read, as before, because doing so may load/validate the
      # project's annotation index as a side effect.
      # NOTE(review): confirm whether these two lines can be dropped.
      am = AnnotationManager(project)
      am.read()

      pipeline = AclewMetrics(
          project,
          vtc=annotator,
          alice=None,
          vcm=None,
          # Time window passed to the pipeline; the caller's scaling factor
          # (x9) presumably corresponds to these nine hours.
          from_time="09:00",
          to_time="18:00",
          by="child_id",
      )
      metrics = pipeline.extract()
      print(metrics)

      return pd.DataFrame(metrics).assign(corpus=corpus)
  def main():
      """Fit a gamma distribution to each speaker's vocalization counts.

      Reads ``input/annotators.csv``, drops annotators whose name starts with
      ``eaf_2021``, and runs ``rates`` on every remaining row in a pool of 8
      worker processes. For each speaker type, the ``voc_<speaker>_ph`` column
      is scaled by 9 and fitted with a gamma distribution whose location is
      fixed at 0. A histogram with the fitted density is saved to
      ``output/dist_<speaker>.png`` and the shape (``alpha``) and rate
      (``beta = 1 / scale``) parameters are written to
      ``output/speech_dist.csv``.
      """
      annotators = pd.read_csv("input/annotators.csv")
      keep = ~annotators["annotator"].str.startswith("eaf_2021")
      # ``assign`` returns a new frame, so the ``path`` column is never written
      # into a slice of the original (avoids pandas' SettingWithCopyWarning).
      annotators = annotators[keep].assign(
          path=lambda df: df["corpus"].apply(lambda c: opj("input", c))
      )

      # The result gets its own name: rebinding ``rates`` here would shadow the
      # worker function that is being mapped over the pool.
      with mp.Pool(processes=8) as pool:
          metrics = pd.concat(
              pool.map(rates, annotators.to_dict(orient="records"))
          )
      metrics = metrics.dropna()

      # ``rates`` extracts metrics between 09:00 and 18:00, i.e. nine hours.
      # The ``*_ph`` columns are presumably per-hour rates (TODO confirm), so
      # multiplying by 9 turns them into counts over that window.
      hours = 9
      speakers = ("CHI", "OCH", "FEM", "MAL")

      params = []
      for speaker in speakers:
          data = metrics[f"voc_{speaker.lower()}_ph"] * hours

          # NOTE(review): with ``floc=0`` scipy's gamma fit appears to reject
          # samples containing values <= 0 -- confirm no child has a zero rate.
          a, loc, scale = gamma.fit(data, floc=0)
          x = np.linspace(0, data.max(), 100)
          y = gamma.pdf(x, a, loc, scale)

          # A fresh figure per speaker replaces the cla()/clf() reset of the
          # implicit global figure; it is closed once saved.
          fig, ax = plt.subplots()
          ax.hist(data, bins=10, density=True)
          ax.plot(x, y)
          fig.savefig(f"output/dist_{speaker}.png")
          plt.close(fig)

          params.append({"alpha": a, "beta": 1 / scale, "speaker": speaker})

      pd.DataFrame(params).set_index("speaker").to_csv("output/speech_dist.csv")


  if __name__ == "__main__":
      main()