123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305 |
- # -*- coding: utf-8 -*-
- """
- Created on Fri May 31 17:09:18 2024
- @author: arefks
- """
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from scipy.spatial.distance import pdist
from scipy.stats import linregress
# Resolve every path relative to this script so the project can be moved
# around without editing hard-coded locations.
code_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(code_dir)

input_file_path = os.path.join(parent_dir, 'output', 'behavior_data_processed.csv')
outfile_path = os.path.join(parent_dir, "output", "figures")
input_file_path_new = os.path.join(parent_dir, 'output', 'behavior_data_processed_prr.csv')

# Load the processed behaviour table and shift all scores upward by the
# magnitude of the most negative score, making every shifted score >= 0.
df = pd.read_csv(input_file_path)
df["averageScore_shifted"] = df["averageScore"] + abs(df["averageScore"].min())
# Restrict to stroke animals that carry a cluster assignment ("S" marks sham).
df_stroke = df[(df["Group"] == "Stroke") & (df["2 Cluster"] != "S")]

# Cohort-wide fallback means (kept from the original script; not used below).
average_baseline_incase_of_nan = df_stroke[df_stroke["TimePoint_merged"] == 0]["averageScore_shifted"].mean()
average_28_incase_of_nan = df_stroke[df_stroke["TimePoint_merged"] == 28]["averageScore_shifted"].mean()

# One record per stroke subject with baseline (day 0), day-3 and day-28
# shifted scores, the derived initial impairment, the observed change and
# the PRR-predicted change (70% of the initial impairment minus a fixed
# offset).
proportional_recovery_list = []
for ss in df_stroke["StudyID"].unique():
    df_temp = df[df["StudyID"] == ss]
    day3_vals = df_temp[df_temp["TimePoint_merged"] == 3]["averageScore_shifted"].values
    baseline_vals = df_temp[df_temp["TimePoint_merged"] == 0]["averageScore_shifted"].values
    day28_vals = df_temp[df_temp["TimePoint_merged"] == 28]["averageScore_shifted"].values
    # NOTE(review): the original reads the SECOND row's label (.values[1]);
    # kept as-is, but confirm this should not be .values[0].
    cluster = df_temp["3 Cluster"].values[1]

    # BUGFIX: the original called np.isnan() on the raw array, which raises
    # (or is ambiguous) when the subject has no baseline row at all.
    if baseline_vals.size == 0 or np.isnan(baseline_vals[0]):
        continue
    average_score_baseline = baseline_vals[0]

    # Day-28 score, falling back to day 21 when day 28 was not measured;
    # skip the subject if neither time point exists.
    if day28_vals.size > 0:
        average_score_day28 = day28_vals[0]
    else:
        day21_vals = df_temp[df_temp["TimePoint_merged"] == 21]["averageScore_shifted"].values
        if day21_vals.size == 0:
            continue
        average_score_day28 = day21_vals[0]

    # BUGFIX: the original left the day-3 value as an (empty) array when the
    # row was missing, so the arithmetic/comparisons below raised; skip.
    if day3_vals.size == 0:
        continue
    average_score_day3 = day3_vals[0]

    initial_impairment = average_score_day3 - average_score_baseline
    if np.isnan(initial_impairment) or initial_impairment <= 0:
        continue

    # Observed change, sign-flipped so that recovery is positive, and the
    # proportional-recovery-rule prediction (slope 0.7, offset -0.4).
    observed_change = -(average_score_day28 - initial_impairment)
    error = -0.4
    estimated_change = 0.7 * initial_impairment + error

    proportional_recovery_list.append({
        "StudyID": ss,
        "initial impairment": initial_impairment,
        'change predicted': estimated_change,
        'change observed': observed_change,
        "P3": average_score_day3,
        "BL": average_score_baseline,
        "P28": average_score_day28,
        "cluster": cluster,
    })
# Materialise the per-subject records as a DataFrame and inspect it.
proportional_recovery = pd.DataFrame(proportional_recovery_list)
print(proportional_recovery)

# Tertile labels (equal-frequency bins) for initial impairment and for the
# observed change.
tertile_specs = {
    'cluster ii': ('initial impairment', ['Low ii', 'Medium ii', 'High ii']),
    'cluster co': ('change observed', ['Low co', 'Medium co', 'High co']),
}
for new_col, (src_col, labels) in tertile_specs.items():
    proportional_recovery[new_col] = pd.qcut(proportional_recovery[src_col], 3, labels=labels)

# Every subject starts out assigned to its own regression line.
proportional_recovery['linecluster'] = "own_regression"
# Fit one regression over the whole cohort; its residuals feed an IQR-based
# outlier flagging (the 'outliers' frame is only used for plotting later).
x_initial = proportional_recovery['initial impairment']
y_initial = proportional_recovery['change observed']
slope_initial, intercept_initial, r_value_initial, p_value_initial, std_err_initial = linregress(x_initial, y_initial)

regression_line_initial = slope_initial * x_initial + intercept_initial
residuals_initial = y_initial - regression_line_initial

# Tukey's fences: anything beyond 1.5 * IQR from the quartiles is an outlier.
Q1_initial, Q3_initial = np.percentile(residuals_initial, [25, 75])
IQR_initial = Q3_initial - Q1_initial
threshold_initial = 1.5 * IQR_initial

is_outlier = (residuals_initial < Q1_initial - threshold_initial) | (residuals_initial > Q3_initial + threshold_initial)
outliers = proportional_recovery[is_outlier]
R = 15  # number of refinement iterations
# Iteratively: fit a line to the 'own_regression' points, then re-assign
# every subject to whichever line is closer — the fitted line or the PRR
# line (fixed slope 0.7, shared intercept). On the last pass, PRR-cluster
# points whose residual from the PRR line is an IQR outlier are relabelled
# "outlier", and the figure is saved.
for cc in range(R):
    own = proportional_recovery[proportional_recovery["linecluster"] == "own_regression"]
    x = own['initial impairment']
    y = own['change observed']
    slope, intercept, r_value, p_value, std_err = linregress(x, y)
    intercept = round(intercept, 2)  # rounded for stable labels/plots
    regression_line = slope * x + intercept

    # Vertical distance of every subject to each candidate line.
    dist_to_regression = np.abs(proportional_recovery['change observed']
                                - (slope * proportional_recovery['initial impairment'] + intercept))
    dist_to_PRR = np.abs(proportional_recovery['change observed']
                         - (0.7 * proportional_recovery['initial impairment'] + intercept))
    # Re-assign each subject to the closest line.
    proportional_recovery['linecluster'] = np.where(dist_to_regression < dist_to_PRR,
                                                    'own_regression', 'PRR')

    # ---- plotting ----
    cm = 1 / 2.54  # inches -> cm conversion factor
    plt.figure(figsize=(18 * cm, 9 * cm), dpi=300)
    sns.scatterplot(x='initial impairment', y='change observed', hue='linecluster',
                    data=proportional_recovery, palette='viridis', marker='o')
    sns.scatterplot(x=outliers['initial impairment'], y=outliers['change observed'],
                    color='red', marker='x', s=100, label='Outliers')
    sns.lineplot(x=x, y=regression_line, color='red',
                 label=f'Linear fit: y = {slope:.2f}x + {intercept:.2f}')
    plt.plot([x.min(), x.max()], [0.7 * x.min() + intercept, 0.7 * x.max() + intercept],
             color='gray', linestyle="--", linewidth=1, label=f'y = 0.7x + {intercept}')
    plt.xlabel('initial impairment (P3 - BL)', fontsize=10, fontweight='bold')
    plt.ylabel('change observed (P28 - ii)', fontsize=10, fontweight='bold')
    plt.legend(frameon=False, fontsize=7)
    plt.grid(False)
    plt.xticks(fontsize=10, fontweight='bold')
    plt.yticks(fontsize=10, fontweight='bold')
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    if cc == (R - 1):
        # Final pass: flag IQR outliers among the PRR-cluster points,
        # measured against the PRR line itself.
        final_slope = 0.7
        final_intercept = round(intercept, 2)
        prr_rows = proportional_recovery[proportional_recovery["linecluster"] == "PRR"]
        prr_line = final_slope * prr_rows['initial impairment'] + final_intercept
        final_residuals = prr_rows['change observed'] - prr_line

        Q1_final = np.percentile(final_residuals, 25)
        Q3_final = np.percentile(final_residuals, 75)
        IQR_final = Q3_final - Q1_final
        threshold_final = 1.5 * IQR_final

        # BUGFIX: the original wrote `A | B & (linecluster == "PRR")`; `&`
        # binds tighter than `|`, so the low-residual condition bypassed the
        # PRR restriction, and the partial-index mask could not be aligned
        # with the full frame. Masking the PRR subset directly is both
        # correct and alignment-safe.
        outlier_mask = ((final_residuals < Q1_final - threshold_final)
                        | (final_residuals > Q3_final + threshold_final))
        new_outliers = prr_rows[outlier_mask]
        proportional_recovery.loc[new_outliers.index, 'linecluster'] = "outlier"

        plt.legend(frameon=False, fontsize=8)
        plt.savefig(os.path.join(outfile_path, "ppr_rule.png"), dpi=300)
        plt.savefig(os.path.join(outfile_path, "ppr_rule.svg"))
        plt.show()

plt.tight_layout()  # adjust layout to prevent clipping of labels
plt.show()
# Report how many subjects ended up following the proportional recovery rule.
labels = proportional_recovery["linecluster"]
ppr_count = int((labels == "PRR").sum())
own_regression_count = int((labels == "own_regression").sum())
print("Count of subjects in PRR cluster:", ppr_count)
print("Count of subjects in own_regression cluster:", own_regression_count)
print("percent of subjects that follow the PRR: %", ppr_count / (own_regression_count + ppr_count) * 100)

# Carry the cluster assignments back onto the full behaviour table by StudyID.
df = df.merge(
    proportional_recovery[['StudyID', 'linecluster', 'cluster ii', 'cluster co']],
    on='StudyID', how='left',
)

# Make 'S' a legal category before assigning it to the sham rows.
for col in ('linecluster', 'cluster ii', 'cluster co'):
    if col not in df.columns:
        continue
    df[col] = df[col].astype('category')
    if 'S' not in df[col].cat.categories:
        df[col] = df[col].cat.add_categories('S')

# All sham animals get the sentinel label 'S'.
df.loc[df['Group'] == "Sham", ['linecluster', 'cluster ii', 'cluster co']] = "S"

print(df)
df.to_csv(input_file_path_new, index=False)
#%%
# Hierarchical clustering (Ward linkage) of the subjects in the
# (change predicted, change observed) plane.
# BUGFIX: linkage/pdist/dendrogram/fcluster were never imported in the
# original file (NameError at runtime); they now come from scipy at the top.
Z = linkage(pdist(proportional_recovery[['change predicted', 'change observed']]), method='ward')

# BUGFIX (consistency): 1 inch = 2.54 cm — the original used 1/2.53 here
# while the rest of the file uses 1/2.54.
cm = 1 / 2.54
plt.figure(figsize=(9 * cm, 9 * cm), dpi=300)
dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.grid(False)
plt.show()

# Elbow plot: merge distance against the resulting number of clusters.
num_samples = len(Z) + 1  # a linkage of n samples has n-1 merge rows
distances = Z[:, 2]       # third linkage column = merge distance
num_clusters = range(1, num_samples)
plt.figure(figsize=(10 * cm, 7 * cm), dpi=300)
plt.plot(num_clusters, distances[::-1], marker='o')  # reversed so x = cluster count
plt.title('Number of Clusters vs. Distance')
plt.xlabel('Number of Clusters')
plt.ylabel('Distance')
plt.grid(True)
plt.show()

# Cut the dendrogram at a fixed distance threshold and store the labels.
max_d = 10
clusters = fcluster(Z, max_d, criterion='distance')
proportional_recovery['Cluster'] = clusters
# Fit 'change predicted' as a linear function of 'change observed' inside
# each hierarchical cluster and print the resulting model.
# BUGFIX: the original used sm.add_constant/sm.OLS but statsmodels was never
# imported anywhere in the file, so this section raised a NameError.
# scipy.stats.linregress (already imported at the top) produces the
# identical slope/intercept for a single-predictor OLS fit; the statsmodels
# summary table is replaced by a compact line with the key statistics.
for cluster in proportional_recovery['Cluster'].unique():
    cluster_data = proportional_recovery[proportional_recovery['Cluster'] == cluster]
    fit = linregress(cluster_data['change observed'], cluster_data['change predicted'])
    intercept = fit.intercept
    slope = fit.slope

    # Print cluster information and fit statistics
    print(f"Cluster {cluster}:")
    print(f"  slope={slope:.4f}  intercept={intercept:.4f}  "
          f"r^2={fit.rvalue ** 2:.4f}  p={fit.pvalue:.4g}  stderr={fit.stderr:.4f}")

    # Print the mathematical model
    print(f"The mathematical model for Cluster {cluster} is: y = {intercept:.4f} + {slope:.4f} * x")
# Predicted vs. observed change, coloured by hierarchical cluster, with the
# identity line y = x as the perfect-prediction reference.
plt.figure(figsize=(18 * cm, 18 * cm), dpi=300)
sns.scatterplot(data=proportional_recovery, x='change observed', y='change predicted',
                hue='Cluster', palette='tab10')
plt.title('Change Predicted vs Change Observed by Cluster')
plt.xlabel('Change Observed')
plt.ylabel('Change Predicted')
plt.legend(title='Cluster')

# Span the identity line over the combined range of both columns.
observed = proportional_recovery['change observed']
predicted = proportional_recovery['change predicted']
min_val = min(observed.min(), predicted.min())
max_val = max(observed.max(), predicted.max())
plt.plot([min_val, max_val], [min_val, max_val], color='black', linestyle='--')
plt.show()
# Per-subject recovery ratio: observed change divided by the initial
# impairment (P3 - BL); a zero denominator produces inf, filtered below.
proportional_recovery["New_PR"] = (
    proportional_recovery["change observed"]
    / (proportional_recovery["P3"] - proportional_recovery["BL"])
)

# Mean ratio within cluster 1, keeping only finite values.
in_cluster_one = proportional_recovery["Cluster"] == 1
is_finite = np.isfinite(proportional_recovery["New_PR"])
valid_new_pr = proportional_recovery[in_cluster_one & is_finite]
mean_new_pr = valid_new_pr["New_PR"].mean()
print(mean_new_pr)
|