Bladeren bron

Merge branch 'multiple_nb_per_child'

William N. Havard 1 jaar geleden
bovenliggende
commit
b0ca60170f
1 gewijzigde bestanden met toevoegingen van 31 en 3 verwijderingen
  1. 31 3
      import_data/import_recordings.py

+ 31 - 3
import_data/import_recordings.py

@@ -69,7 +69,29 @@ def _get_children(children_path):
     return data
 
 
-def _build_recording_metadata(recordings_path, recording, experiment, recording_device_type):
+#ac2pl
+def _get_correspondance(correspondance_path):
+    """
+    Returns a dict of correspondances across child ID (phone numbers), mapping each value of the first column of
+    `correspondance.csv` to the value of its second column (both as strings), or an empty dict if the file cannot be read
+    :param correspondance_path: Path to the `correspondance.csv` metadata file
+    :type correspondance_path: str
+    :return: dict of correspondances across child ID (phone numbers) or empty dict
+    :rtype: dict
+    """
+    try:
+        data = pd.read_csv(correspondance_path)
+    except:
+        columns = ['new_number', 'original_number']
+        data = pd.DataFrame(columns=columns)
+
+    # Change to string
+    data = data.astype({cname:'string' for cname in data.columns})
+
+    return dict(data.values.tolist())
+
+
+def _build_recording_metadata(recordings_path, recording, experiment, recording_device_type, correspondance):
     """
     Return the metadata corresponding to a given file (date, time, duration, etc.)
     :param recordings_path: path to the directory storing the WAV files
@@ -86,7 +108,7 @@ def _build_recording_metadata(recordings_path, recording, experiment, recording_
     raw_filename, _ = os.path.splitext(os.path.basename(recording))
     try:
         child_id_, *experiment_stage, date_iso_, start_time_ = raw_filename.split('_')
-        child_id = 'chi_{}'.format(child_id_) # coerce ID to be a string (prevent later mistakes)
+        child_id = 'chi_{}'.format(correspondance.get(child_id_, child_id_)) # coerce ID to be a string (prevents later mistakes)
         date_iso = datetime.strptime(date_iso_, '%Y%m%d').strftime('%Y-%m-%d')
         start_time = datetime.strptime(start_time_, '%H%M%S').strftime('%H:%M:%S')
         session_id = '{}_{}'.format(child_id, date_iso_)
@@ -121,9 +143,13 @@ def import_recordings(project_path, experiment, recording_device_type):
     :rtype: None
     """
     recordings_metadata_path = os.path.join(project_path, 'metadata', 'recordings.csv')
+    correspondance_metadata_path = os.path.join(project_path, 'metadata', 'correspondance.csv')
+
     recordings = _get_recordings(recordings_metadata_path)
     recordings_count = len(recordings)
 
+    correspondance = _get_correspondance(correspondance_metadata_path)
+
     recordings_path = os.path.join(project_path, 'recordings', 'raw')
     recording_file_list = walk_dir(recordings_path, ext='wav', return_full_path=False)
 
@@ -131,7 +157,8 @@ def import_recordings(project_path, experiment, recording_device_type):
         if recording_file in recordings['recording_filename'].values: continue
 
         recording_metadata = _build_recording_metadata(recordings_path, recording_file,
-                                                       experiment, recording_device_type)
+                                                       experiment, recording_device_type,
+                                                       correspondance)
         # Add new recordings only
         if not recording_metadata:
             continue
@@ -164,6 +191,7 @@ def import_children(project_path, experiment):
     child_id_recordings = set(recordings['child_id'])
     missing_children = child_id_recordings - set(children['child_id'])
 
+
     for child_id in missing_children:
         child_metadata = {
             'experiment': experiment,