12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
- # -*- coding: utf-8 -*-
- """
- Created on Tue Jun 25 14:30:18 2024
- @author: arefks
- """
- import os
- import pandas as pd
- from scipy.stats import zscore
# Resolve every path relative to this file so the script does not depend on
# the current working directory.
code_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(code_dir)
input_file_path = os.path.join(parent_dir, 'output', 'Merged_behaviour_data.csv')
output_file_path = os.path.join(parent_dir, 'output', 'Merged_behaviour_data_ztansform.csv')

# Columns that are cleaned, z-scored and averaged into the DeficitScore.
columns_of_interest = ['C_PawDragPercent', 'GW_FootFault', 'RB_HindlimbDrop']


def _coerce_columns_to_float(df, columns):
    """Convert each of *columns* to float in place, reporting bad cells.

    Cells that cannot be parsed as numbers are printed and replaced by NaN.
    """
    for col in columns:
        try:
            df[col] = df[col].astype(float)
        except ValueError:
            print(f"Conversion error in column: {col}")
            coerced = pd.to_numeric(df[col], errors='coerce')
            # Report only cells that held a value and failed to parse; cells
            # that were already missing are not conversion errors.
            non_convertible = coerced.isna() & df[col].notna()
            print(f"Non-convertible values in {col}:\n", df.loc[non_convertible, col])
            df[col] = coerced
    return df


def _mask_outliers_iqr(df, columns, group_keys=("TimePointMerged", "Group"), whisker=1.5):
    """Replace outliers with NaN in place, judged separately per group.

    Within every existing combination of *group_keys*, a value is kept only
    if it lies inside ``[Q1 - whisker * IQR, Q3 + whisker * IQR]`` (bounds
    inclusive). Rows whose group key is missing are left untouched because
    ``groupby`` drops missing keys by default.
    """
    for _, group in df.groupby(list(group_keys)):
        for col in columns:
            values = group[col]
            q1 = values.quantile(0.25)
            q3 = values.quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - whisker * iqr
            upper_bound = q3 + whisker * iqr
            # NaN never satisfies `between`, so missing values stay missing.
            inside = values.between(lower_bound, upper_bound)
            df.loc[group.index, col] = values.where(inside)
    return df


def _add_z_scores(df, columns):
    """Add a ``Z_<col>`` column per entry of *columns*, z-scored over all rows."""
    for col in columns:
        df[f"Z_{col}"] = zscore(df[col], nan_policy="omit")
    return df


def _add_deficit_score(df, columns):
    """Add ``DeficitScore``: the row-wise mean of the ``Z_<col>`` columns.

    The score is shifted so that its minimum becomes zero when that minimum
    is negative.
    """
    score = df[[f"Z_{col}" for col in columns]].mean(axis=1)
    lowest = score.min()
    # Shift only when needed. The earlier `abs(min) + score` form gave the
    # same result for a negative minimum but inflated every score when the
    # minimum was already positive.
    if lowest < 0:
        score = score - lowest
    df["DeficitScore"] = score
    return df


def normalize_behaviour_scores(df, columns=None):
    """Return a cleaned, z-scored copy of *df* with a ``DeficitScore`` column.

    Steps: coerce *columns* to float, replace per-(TimePointMerged, Group)
    IQR outliers with NaN, z-score each column over the whole frame, then
    average the z-scores per row.

    Args:
        df: frame containing ``TimePointMerged``, ``Group`` and *columns*.
        columns: score columns to process; defaults to ``columns_of_interest``.

    Returns:
        A new DataFrame; the input frame is not modified.
    """
    if columns is None:
        columns = columns_of_interest
    result = df.copy()
    _coerce_columns_to_float(result, columns)
    _mask_outliers_iqr(result, columns)
    _add_z_scores(result, columns)
    _add_deficit_score(result, columns)
    return result


def main():
    """Read the merged behaviour CSV, normalize it and write the result."""
    data = normalize_behaviour_scores(pd.read_csv(input_file_path))
    data.to_csv(output_file_path, index=False)
    print("Normalization applied and results saved to:", output_file_path)
    return data


if __name__ == "__main__":
    # Keep `df` as a module-level name when run as a script (e.g. for
    # inspection in an interactive session afterwards).
    df = main()
|