doi
/
2024_Kalantari_PRR
forked from Aswendt_Lab/2024_Kalantari_PRR


			
			
				
					
						
							12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667
							# -*- coding: utf-8 -*-
"""
Created on Tue Jun 25 14:30:18 2024

@author: arefks
"""

import os
import pandas as pd
from scipy.stats import zscore

# Resolve every location relative to this script so the pipeline does not
# depend on the current working directory.
script_path = os.path.abspath(__file__)
code_dir = os.path.dirname(script_path)

# The folder that contains the code directory
parent_dir = os.path.dirname(code_dir)

# Input and result tables both live in "<parent>/output"
output_dir = os.path.join(parent_dir, 'output')
input_file_path = os.path.join(output_dir, 'Merged_behaviour_data.csv')
output_file_path = os.path.join(output_dir, 'Merged_behaviour_data_ztansform.csv')

# Load the merged behaviour table
df = pd.read_csv(input_file_path)

# Behavioural score columns that are cleaned, z-scored and averaged below
columns_of_interest = ['C_PawDragPercent', 'GW_FootFault', 'RB_HindlimbDrop']

# Make sure every score column is float. A plain cast is tried first; if it
# fails, the offending entries are reported and replaced by NaN.
for col in columns_of_interest:
    try:
        df[col] = df[col].astype(float)
    except ValueError:
        print(f"Conversion error in column: {col}")
        # Coerce the whole column once; entries that cannot be parsed as a
        # number become NaN.
        coerced = pd.to_numeric(df[col], errors='coerce')
        # Report only entries that held a value which failed to parse. Cells
        # that were already missing are not conversion failures and would
        # otherwise clutter the report.
        non_convertible = coerced.isna() & df[col].notna()
        print(f"Non-convertible values in {col}:\n", df.loc[non_convertible, col])
        # Keep the coerced column (non-convertible values are now NaN)
        df[col] = coerced

# Blank out outliers (set them to NaN) using the 1.5 * IQR fence. The fence is
# computed separately for every (TimePointMerged, Group) subset and for every
# score column, so a value is only judged against its own time point and group.
for _, group_rows in df.groupby(["TimePointMerged", "Group"], sort=False):
    for score_col in columns_of_interest:
        values = group_rows[score_col]
        q1, q3 = values.quantile(0.25), values.quantile(0.75)
        whisker = 1.5 * (q3 - q1)
        # Inclusive on both ends; NaN entries compare False and stay NaN
        within_fence = values.between(q1 - whisker, q3 + whisker)
        df.loc[group_rows.index, score_col] = values.where(within_fence)

# Standardise each score over the entire cohort (all groups and time points
# pooled). NaN entries are left out of the mean/SD computation.
z_columns = []
for score_col in columns_of_interest:
    z_name = f"Z_{score_col}"
    df[z_name] = zscore(df[score_col], nan_policy="omit")
    z_columns.append(z_name)

# DeficitScore is the row-wise mean of the z-scored tests
df["DeficitScore"] = df[z_columns].mean(axis=1)

# Shift the scores up by the magnitude of their minimum so that none is
# negative (with a negative minimum the smallest score becomes 0).
score_offset = abs(df["DeficitScore"].min())
df["DeficitScore"] = df["DeficitScore"] + score_offset

# Write the table, now including the Z_* columns and DeficitScore, to disk
# without the row index.
df.to_csv(path_or_buf=output_file_path, index=False)

# Tell the user where the result ended up
print("Normalization applied and results saved to:", output_file_path)