# check_for_mishaps_ninox_file.py (2.5 KB)
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Mon Jun 24 08:37:33 2024
  4. @author: arefk
  5. """
  6. import os
  7. import pandas as pd
  8. import numpy as np
  9. # Get the directory where the code file is located
  10. code_dir = os.path.dirname(os.path.abspath(__file__))
  11. # Get the parent directory of the code directory
  12. parent_dir = os.path.dirname(code_dir)
  13. # Specify the input file path relative to the code file
  14. input_file_path = os.path.join(parent_dir, 'input', 'Ninox_data_overview_all.csv')
  15. final_csv = os.path.join(parent_dir, 'input', 'Ninox_data_overview_all_precheck.csv')
  16. # Read the CSV file into a Pandas DataFrame
  17. df = pd.read_csv(input_file_path)
  18. # Column names that need to be processed
  19. columns_to_process = [
  20. 'RightHindlimbDrop_RB', 'Speed_RB', 'Distance_RB',
  21. 'PercentRightPawDragPerTouch_CT', 'PercentLeftTouch_CT',
  22. 'TotalNumberOfTouches_CT', 'NumberOfFootFaultsDividedByTotalNumber_GW'
  23. ]
  24. columns_short_name = [
  25. 'HLD_RB_count', 'Speed_RB_count', 'Distance_RB_count',
  26. 'PawDrang_CT_count', 'PercentLeftTouch_CT_count',
  27. 'TotalNumberOfTouches_CT_count', 'FootFaults_GW_count'
  28. ]
  29. # Initialize new columns in the DataFrame
  30. df['NumberOfTimepoints'] = 0
  31. for col in columns_short_name:
  32. df[col] = 0
  33. df['TimepointEqualsElements'] = False
  34. # Iterate through each unique StudyID
  35. for ss in df["StudyID"].unique():
  36. temp_df = df[df["StudyID"] == ss]
  37. idx = temp_df.index
  38. # Calculate the number of time points
  39. time_points = temp_df["TimePointsBehavior"].values[0].split(", ")
  40. num_timepoints = len(time_points)
  41. # Update the NumberOfTimepoints column
  42. df.loc[idx, 'NumberOfTimepoints'] = num_timepoints
  43. # Check the number of elements in each column to process
  44. all_counts_match = True
  45. for col, short_col in zip(columns_to_process, columns_short_name):
  46. value = temp_df[col].values[0]
  47. if isinstance(value, str):
  48. elements = value.split(", ")
  49. element_count = len(elements)
  50. else:
  51. element_count = 0 # or handle NaN differently if needed
  52. # Update the corresponding count column
  53. df.loc[idx, short_col] = element_count
  54. # Check if the count matches the number of time points
  55. if element_count != num_timepoints:
  56. all_counts_match = False
  57. # Update the TimepointEqualsElements column
  58. df.loc[idx, 'TimepointEqualsElements'] = all_counts_match
  59. # Save the processed DataFrame to a new CSV file
  60. df.to_csv(final_csv, index=False)