messages.py 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
#!/usr/bin/env python
  2. # -*- coding: utf8 -*-
  3. import csv
  4. # -----------------------------------------------------------------------------
  5. # File: messages.py (as part of project URUMETRICS)
  6. # Created: 29/07/2022 15:35
  7. # Last Modified: 29/07/2022 15:35
  8. # -----------------------------------------------------------------------------
  9. # Author: William N. Havard
  10. # Postdoctoral Researcher
  11. #
  12. # Mail : william.havard@ens.fr / william.havard@gmail.com
  13. #
  14. # Institution: ENS / Laboratoire de Sciences Cognitives et Psycholinguistique
  15. #
  16. # ------------------------------------------------------------------------------
  17. # Description:
  18. # •
  19. # -----------------------------------------------------------------------------
  20. import logging
  21. import os
  22. import math
  23. from datetime import datetime
  24. import pandas as pd
  25. import yaml
  26. from ChildProject.annotations import AnnotationManager
  27. from ChildProject.projects import ChildProject
  28. logger = logging.getLogger(__name__)
  29. def _read_yaml(yaml_path):
  30. with open(yaml_path, 'r') as in_yaml:
  31. data = yaml.load(in_yaml, Loader=yaml.FullLoader)
  32. return data
  33. def get_metrics(project_path, metrics_file):
  34. """
  35. given a dataset and an output metrics file
  36. return a merge of the metrics and recording info, as well as a list of all the metrics labels
  37. :param project_path: path to the dataset
  38. :type project_path: str
  39. :param metrics_file: path to the metrics csv file
  40. :type metrics_file: str
  41. :return: dataframe of metrics, existing metrics columns (columns of the dataframe minus metadata columns)
  42. :rtype: pandas.Dataframe, list[str]
  43. """
  44. project = ChildProject(project_path)
  45. am = AnnotationManager(project)
  46. am.read()
  47. # Read metrics file and get metrics columns
  48. metrics = pd.read_csv(metrics_file)
  49. metrics_columns = list(set(metrics.columns) - set(['recording_filename', 'child_id']))
  50. # Merge with recordings to get date_iso
  51. metrics_recordings = pd.merge(metrics, project.recordings, on='recording_filename', suffixes=('', '_drop'))
  52. metrics_recordings.drop([col for col in metrics_recordings.columns if 'drop' in col], axis=1, inplace=True)
  53. # Handle file with the same child_id that have the same date -> keep the longest one
  54. metrics_recordings = (metrics_recordings.groupby(['child_id', 'date_iso'], as_index=False)
  55. # Keep only the first segment for each candidate speaker
  56. .apply(lambda rows: (rows.sort_values(by='start_time', ascending=False) # take last instead
  57. .head(n=1))))
  58. return metrics_recordings, metrics_columns
  59. def fill_template(template_key, messages, metrics_evolution, date, start_date, end_date):
  60. """
  61. given the full list of templates, a template key and a measure of the evolution of the metric,
  62. returns the wanted template filled with the correct evolution indication
  63. :param template_key: positive or negative evolution of metrics, used to select a master template
  64. :type template_key: [bool, bool]
  65. :param messages: dictionary of templates (taken from yaml file)
  66. :type messages: dict
  67. :param metrics_evolution: list of tuple containing each metric and its evolution
  68. :type metrics_evolution: [(str, float, bool),(str, float, bool)]
  69. :param date: date for wich to fill the template
  70. :type date: datetime.datetime
  71. :param start_date: date of the start of the experiment
  72. :type start_date: datetime.datetime
  73. :param duration_experiment: total duration of the experiment in days
  74. :type duration_experiment: int
  75. :return: filled template
  76. :rtype: str
  77. """
  78. template = messages['templates']['_{}_{}'.format(*template_key)]
  79. for positivity_item_index, (positivity_item, _, positivity_direction) in enumerate(metrics_evolution, 1):
  80. template = template.replace('#{}'.format(positivity_item_index),
  81. messages['metrics'][positivity_item][positivity_direction])
  82. # message_variables = [msg for msg in messages if msg.startswith('_')]
  83. # for message_variable in message_variables:
  84. # message_variable = message_variable[1:]
  85. # template = template.replace('#{}'.format(message_variable),
  86. # messages['_{}'.format(message_variable)])
  87. # template = template.capitalize()
  88. for msg_key in messages['others']['fixed']:
  89. template = template.replace('#{}'.format(msg_key),messages['others']['fixed'][msg_key])
  90. if end_date <= start_date : raise ValueError('start_date {} should be before end_date {}'.format(start_date,end_date))
  91. total_days = end_date - start_date
  92. time_elapsed = date.date() - start_date.date()
  93. for msg_key in messages['others']['variable']:
  94. nb_keys = len(messages['others']['variable'][msg_key].keys())
  95. if nb_keys == 0 : raise ValueError('Message must have at least one version')
  96. msg_fraction = 1 / nb_keys
  97. time_fraction = time_elapsed.days / total_days.days
  98. index = math.ceil(time_fraction / msg_fraction)
  99. if index < 1 :index = 1 #if before the start of the experiment, use the first message
  100. if index > nb_keys : index = nb_keys #if after the end of the experiment, use last message
  101. #index = str(index)
  102. if index not in messages['others']['variable'][msg_key].keys() : raise ValueError('Could not find message : <others: variable : {} : {}> in yaml. Messages listed in a variable sentence shoud be 1, 2, ...'.format(msg_key,index))
  103. template = template.replace('#{}'.format(msg_key), messages['others']['variable'][msg_key][index])
  104. return template
  105. def fill_default(messages, date, start_date, end_date):
  106. """
  107. given the full list of templates, the date to consider and the dates of the experiment,
  108. takes the default messages template and fills it according to the keywords
  109. :param messages: dictionary of templates (taken from yaml file)
  110. :type messages: dict
  111. :param date: date for wich to fill the template
  112. :type date: datetime.datetime
  113. :param start_date: date of the start of the experiment
  114. :type start_date: datetime.datetime
  115. :param duration_experiment: total duration of the experiment in days
  116. :type duration_experiment: int
  117. :return: filled default message
  118. :rtype: str
  119. """
  120. template = messages['others']['fixed']['default']
  121. for msg_key in messages['others']['fixed']:
  122. if msg_key == 'default': continue #skip default message as it is our base template (could lead to infinite loop of filling templates otherwise)
  123. template = template.replace('#{}'.format(msg_key),messages['others']['fixed'][msg_key])
  124. if end_date <= start_date : raise ValueError('start_date {} should be before end_date {}'.format(start_date,end_date))
  125. total_days = end_date - start_date
  126. time_elapsed = date.date() - start_date.date()
  127. for msg_key in messages['others']['variable']:
  128. nb_keys = len(messages['others']['variable'][msg_key].keys())
  129. if nb_keys == 0 : raise ValueError('Message must have at least one version')
  130. msg_fraction = 1 / nb_keys
  131. time_fraction = time_elapsed.days / total_days.days
  132. index = math.ceil(time_fraction / msg_fraction)
  133. if index < 1 :index = 1 #if before the start of the experiment, use the first message
  134. if index > nb_keys : index = nb_keys #if after the end of the experiment, use last message
  135. #index = str(index)
  136. if index not in messages['others']['variable'][msg_key].keys() : raise ValueError('Could not find message : <others: variable : {} : {}> in yaml. Messages listed in a variable sentence shoud be 1, 2, ...'.format(msg_key,index))
  137. template = template.replace('#{}'.format(msg_key), messages['others']['variable'][msg_key][index])
  138. return template
  139. def build_messages(metrics_recordings, metrics_columns, message_file_path, date):
  140. """
  141. from a datadrame of computed metrics and a date, computes the evolution of all metrics,
  142. computes the metric evolution between the considered date and the previous one,
  143. then builds the messages to output based on those evolution
  144. :param metrics_recordings: Dataframe of the metrics computed for the dataset
  145. :type metrics_recordings: pandas.Dataframe
  146. :param metrics_columns: list of all metrics labels present in the file
  147. :type metrics_columns: list[str]
  148. :param message_file_path: path to the yaml where templates are stored
  149. :type message_file_path: str
  150. :param date: date for which to build the messages, row belonging to another date will not generate messages
  151. :type date: str
  152. :return: dataframe containing messages associated with a recording filename
  153. :rtype: pandas.Dataframe
  154. """
  155. og_date = date
  156. try:
  157. date_time = datetime.strptime(date, "%Y%m%d")
  158. date = date_time.strftime("%Y-%m-%d")
  159. except:
  160. raise ValueError('--date format should be YYYYMMDD without any separators.')
  161. # Get metrics of interest and messages
  162. metric_messages = _read_yaml(message_file_path)
  163. metrics_of_interest = [item for item in list(metric_messages['metrics'].keys())]
  164. experiment_start_date = datetime.strptime(metric_messages['start_date'], "%Y-%m-%d")
  165. experiment_end_date = datetime.strptime(metric_messages['end_date'], "%Y-%m-%d")
  166. # Keep only rows for which the date is below or equal to the one we want
  167. metrics_recordings = metrics_recordings[metrics_recordings['date_iso'] <= date]
  168. # Generate messages
  169. output_messages = []
  170. metrics_grouped = metrics_recordings.groupby('child_id', as_index=False)
  171. for _ignored_child_id, metrics_grouped_item in metrics_grouped:
  172. sorted_metrics_grouped_items = metrics_grouped_item.sort_values(by=['date_iso', 'imported_at'],
  173. ascending=False)
  174. # If the first row is not the desired date, skip as no message was/will be generated for this family as
  175. # this recording is too old
  176. #if sorted_metrics_grouped_items.iloc[0]['date_iso'] != date:
  177. # continue
  178. #tmp fix where the date iso of the recording os not the date is was submitted, in this case,
  179. #we want to compute according to the reception date and note the recording date_iso
  180. if sorted_metrics_grouped_items.iloc[0]['recording_filename'].split('/')[0] != og_date:
  181. continue
  182. # Only one audio (first week), generated default message
  183. if len(metrics_grouped_item) == 1:
  184. recording_filename = metrics_grouped_item.iloc[0]['recording_filename']
  185. message = fill_default(metric_messages, date_time, experiment_start_date, experiment_end_date)
  186. #message = metric_messages['others']['fixed']['default']
  187. # More than one audio file: generate a message
  188. else:
  189. todays_row = sorted_metrics_grouped_items.iloc[0]
  190. previous_row = sorted_metrics_grouped_items.iloc[1]
  191. # Compute the difference between the two sets of metrics
  192. diff_metrics = (todays_row[metrics_columns] - previous_row[metrics_columns])[metrics_of_interest]
  193. diff_metrics = diff_metrics.to_dict()
  194. metrics_evolution = [(metric, diff_metrics[metric], diff_metrics[metric] > 0)
  195. for metric in metrics_of_interest]
  196. # Message sorting
  197. metrics_evolution = sorted(metrics_evolution, key=lambda tup: (abs(tup[1]), tup[2]))[:2]
  198. template_key = list([tpl_key for (_, _, tpl_key) in metrics_evolution])
  199. recording_filename = todays_row['recording_filename']
  200. message = fill_template(template_key, metric_messages, metrics_evolution, date_time, experiment_start_date, experiment_end_date)
  201. output_messages.append({'recording_filename': recording_filename,
  202. 'message': message})
  203. df_out = pd.DataFrame(output_messages)
  204. return df_out
  205. def generate_messages(project_path, metrics_file, message_definition, date):
  206. """
  207. given a path to a dataset, to a metrics file and a message definition yaml file,
  208. creates all the messages for a given date, then stores it in the dataset (extra/messages/generated/messages_YYYYMMDD.csv)
  209. :param project_path: path to childproject dataset
  210. :type project_path: str
  211. :param metrics_file: path to the extracted metrics
  212. :type metrics_file: str
  213. :param message_definition: path to the file defining the messages templates
  214. :type message_definition: str
  215. :param date: date in format YYYYMMDD for which to generate messages
  216. :type date: str
  217. """
  218. message_out_path = os.path.join(project_path, 'extra', 'messages', 'generated', 'messages_{}.csv'.format(date))
  219. message_out_dir = os.path.dirname(message_out_path)
  220. if not os.path.exists(message_out_dir):
  221. os.makedirs(message_out_dir)
  222. # Make sure we have all the files we need
  223. metrics_recordings, metrics_columns = get_metrics(project_path, metrics_file)
  224. messages = build_messages(metrics_recordings, metrics_columns, message_definition, date)
  225. if not os.path.exists(message_out_path):
  226. if len(messages):
  227. messages.to_csv(message_out_path, index=False, quoting=csv.QUOTE_NONNUMERIC,sep=";")
  228. logger.info('{} messages generated.'.format(len(messages)))
  229. else:
  230. logger.warning('No message needs to be generated for date {}.'.format(date))
  231. else:
  232. raise IOError('File {} already exists!'.format(message_out_path))
  233. def main(project_path, **kwargs):
  234. project_path = os.path.abspath(project_path)
  235. expected_metrics_file = os.path.join(project_path, 'extra', 'metrics', 'metrics.csv')
  236. expected_message_definition = os.path.join(project_path, 'extra', 'messages', 'definition', 'metrics_messages.yaml')
  237. assert os.path.exists(expected_metrics_file) and os.path.exists(expected_message_definition), \
  238. ValueError('Expected metrics ({}) and/or message definition file ({}) not found. Are you sure to be running this '
  239. 'command from the root of the data set?'.format(expected_metrics_file, expected_message_definition))
  240. generate_messages(project_path=project_path, metrics_file=expected_metrics_file,
  241. message_definition=expected_message_definition, **kwargs)
  242. def _parse_args(argv):
  243. import argparse
  244. parser = argparse.ArgumentParser(description='Generate feedback messages.')
  245. parser.add_argument('--project-path', required=False, type=str, default='',
  246. help="Path to a ChildProject/datalad project (useful for debugging purposes).")
  247. parser.add_argument('--date', type=str, default=datetime.now().strftime("%Y%m%d"),
  248. help='Date for which to generate messages.')
  249. args = parser.parse_args(argv)
  250. return vars(args)
  251. if __name__ == '__main__':
  252. import sys
  253. pgrm_name, argv = sys.argv[0], sys.argv[1:]
  254. args = _parse_args(argv)
  255. logging.basicConfig(level=logging.INFO)
  256. try:
  257. main(**args)
  258. sys.exit(0)
  259. except Exception as e:
  260. logger.exception(e)
  261. sys.exit(1)