ztransform_data.py 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Tue Jun 25 14:30:18 2024
  4. @author: arefks
  5. """
  6. import os
  7. import pandas as pd
  8. from scipy.stats import zscore
  9. # Get the directory where the code file is located
  10. code_dir = os.path.dirname(os.path.abspath(__file__))
  11. # Get the parent directory of the code directory
  12. parent_dir = os.path.dirname(code_dir)
  13. # Specify the input file path relative to the code file
  14. input_file_path = os.path.join(parent_dir, 'output', 'Merged_behaviour_data.csv')
  15. output_file_path = os.path.join(parent_dir, 'output', 'Merged_behaviour_data_ztansform.csv')
  16. # Read the CSV file into a Pandas DataFrame
  17. df = pd.read_csv(input_file_path)
  18. # Define the columns of interest
  19. columns_of_interest = ['C_PawDragPercent', 'GW_FootFault', 'RB_HindlimbDrop']
  20. # Convert specified columns to float and handle conversion errors
  21. for col in columns_of_interest:
  22. try:
  23. df[col] = df[col].astype(float)
  24. except ValueError:
  25. print(f"Conversion error in column: {col}")
  26. # Identify non-convertible values and print them
  27. non_convertible = df[col].apply(pd.to_numeric, errors='coerce').isna()
  28. print(f"Non-convertible values in {col}:\n", df.loc[non_convertible, col])
  29. # Convert the column with errors='coerce' to set non-convertible values to NaN
  30. df[col] = pd.to_numeric(df[col], errors='coerce')
  31. # Replace outliers with NaN based on IQR rule for each tt, gg, cc
  32. for tt in df["TimePointMerged"].unique():
  33. for gg in df["Group"].unique():
  34. df_temp = df[(df["TimePointMerged"] == tt) & (df["Group"] == gg)].copy()
  35. for cc in columns_of_interest:
  36. Q1 = df_temp[cc].quantile(0.25)
  37. Q3 = df_temp[cc].quantile(0.75)
  38. IQR = Q3 - Q1
  39. lower_bound = Q1 - 1.5 * IQR
  40. upper_bound = Q3 + 1.5 * IQR
  41. df.loc[df_temp.index, cc] = df_temp[cc].apply(lambda x: x if lower_bound <= x <= upper_bound else float('nan'))
  42. # Apply Z-transformation normalization to the entire cohort for each type of score
  43. for cc in columns_of_interest:
  44. # Z-transformation
  45. df[f"Z_{cc}"] = zscore(df[cc], nan_policy="omit")
  46. # Calculate the DeficitScore as the average of all Z-scored tests in columns_of_interest
  47. df["DeficitScore"] = df[[f"Z_{col}" for col in columns_of_interest]].mean(axis=1)
  48. # Adjust DeficitScore to ensure all values are non-negative
  49. df["DeficitScore"] = abs(df["DeficitScore"].min()) + df["DeficitScore"]
  50. # Save the DataFrame to a CSV file
  51. df.to_csv(output_file_path, index=False)
  52. print("Normalization applied and results saved to:", output_file_path)