123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305 |
- # -*- coding: utf-8 -*-
- """
- Created on Fri May 31 17:09:18 2024
- @author: arefks
- """
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.cluster.hierarchy import dendrogram, fcluster, linkage
from scipy.spatial.distance import pdist
from scipy.stats import linregress
# Resolve every path relative to this script so the project can be moved
# around without editing hard-coded locations.
code_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(code_dir)

input_file_path = os.path.join(parent_dir, 'output', 'behavior_data_processed.csv')
outfile_path = os.path.join(parent_dir, "output", "figures")
input_file_path_new = os.path.join(parent_dir, 'output', 'behavior_data_processed_prr.csv')

# Load the processed behaviour table and shift all scores upward by the
# magnitude of the most negative score, making every shifted score >= 0.
df = pd.read_csv(input_file_path)
df["averageScore_shifted"] = df["averageScore"] + abs(df["averageScore"].min())
# Restrict to stroke animals that carry a cluster assignment ("S" marks sham).
df_stroke = df[(df["Group"] == "Stroke") & (df["2 Cluster"] != "S")]

# Cohort-wide fallback means (kept from the original script; not used below).
average_baseline_incase_of_nan = df_stroke[df_stroke["TimePoint_merged"] == 0]["averageScore_shifted"].mean()
average_28_incase_of_nan = df_stroke[df_stroke["TimePoint_merged"] == 28]["averageScore_shifted"].mean()

# One record per stroke subject with baseline (day 0), day-3 and day-28
# shifted scores, the derived initial impairment, the observed change and
# the PRR-predicted change (70% of the initial impairment minus a fixed
# offset).
proportional_recovery_list = []
for ss in df_stroke["StudyID"].unique():
    df_temp = df[df["StudyID"] == ss]
    day3_vals = df_temp[df_temp["TimePoint_merged"] == 3]["averageScore_shifted"].values
    baseline_vals = df_temp[df_temp["TimePoint_merged"] == 0]["averageScore_shifted"].values
    day28_vals = df_temp[df_temp["TimePoint_merged"] == 28]["averageScore_shifted"].values
    # NOTE(review): the original reads the SECOND row's label (.values[1]);
    # kept as-is, but confirm this should not be .values[0].
    cluster = df_temp["3 Cluster"].values[1]

    # BUGFIX: the original called np.isnan() on the raw array, which raises
    # (or is ambiguous) when the subject has no baseline row at all.
    if baseline_vals.size == 0 or np.isnan(baseline_vals[0]):
        continue
    average_score_baseline = baseline_vals[0]

    # Day-28 score, falling back to day 21 when day 28 was not measured;
    # skip the subject if neither time point exists.
    if day28_vals.size > 0:
        average_score_day28 = day28_vals[0]
    else:
        day21_vals = df_temp[df_temp["TimePoint_merged"] == 21]["averageScore_shifted"].values
        if day21_vals.size == 0:
            continue
        average_score_day28 = day21_vals[0]

    # BUGFIX: the original left the day-3 value as an (empty) array when the
    # row was missing, so the arithmetic/comparisons below raised; skip.
    if day3_vals.size == 0:
        continue
    average_score_day3 = day3_vals[0]

    initial_impairment = average_score_day3 - average_score_baseline
    if np.isnan(initial_impairment) or initial_impairment <= 0:
        continue

    # Observed change, sign-flipped so that recovery is positive, and the
    # proportional-recovery-rule prediction (slope 0.7, offset -0.4).
    observed_change = -(average_score_day28 - initial_impairment)
    error = -0.4
    estimated_change = 0.7 * initial_impairment + error

    proportional_recovery_list.append({
        "StudyID": ss,
        "initial impairment": initial_impairment,
        'change predicted': estimated_change,
        'change observed': observed_change,
        "P3": average_score_day3,
        "BL": average_score_baseline,
        "P28": average_score_day28,
        "cluster": cluster,
    })
# Materialise the per-subject records as a DataFrame and inspect it.
proportional_recovery = pd.DataFrame(proportional_recovery_list)
print(proportional_recovery)

# Tertile labels (equal-frequency bins) for initial impairment and for the
# observed change.
tertile_specs = {
    'cluster ii': ('initial impairment', ['Low ii', 'Medium ii', 'High ii']),
    'cluster co': ('change observed', ['Low co', 'Medium co', 'High co']),
}
for new_col, (src_col, labels) in tertile_specs.items():
    proportional_recovery[new_col] = pd.qcut(proportional_recovery[src_col], 3, labels=labels)

# Every subject starts out assigned to its own regression line.
proportional_recovery['linecluster'] = "own_regression"
# Fit one regression over the whole cohort; its residuals feed an IQR-based
# outlier flagging (the 'outliers' frame is only used for plotting later).
x_initial = proportional_recovery['initial impairment']
y_initial = proportional_recovery['change observed']
slope_initial, intercept_initial, r_value_initial, p_value_initial, std_err_initial = linregress(x_initial, y_initial)

regression_line_initial = slope_initial * x_initial + intercept_initial
residuals_initial = y_initial - regression_line_initial

# Tukey's fences: anything beyond 1.5 * IQR from the quartiles is an outlier.
Q1_initial, Q3_initial = np.percentile(residuals_initial, [25, 75])
IQR_initial = Q3_initial - Q1_initial
threshold_initial = 1.5 * IQR_initial

is_outlier = (residuals_initial < Q1_initial - threshold_initial) | (residuals_initial > Q3_initial + threshold_initial)
outliers = proportional_recovery[is_outlier]
R = 15  # number of refinement iterations
# Iteratively: fit a line to the 'own_regression' points, then re-assign
# every subject to whichever line is closer — the fitted line or the PRR
# line (fixed slope 0.7, shared intercept). On the last pass, PRR-cluster
# points whose residual from the PRR line is an IQR outlier are relabelled
# "outlier", and the figure is saved.
for cc in range(R):
    own = proportional_recovery[proportional_recovery["linecluster"] == "own_regression"]
    x = own['initial impairment']
    y = own['change observed']
    slope, intercept, r_value, p_value, std_err = linregress(x, y)
    intercept = round(intercept, 2)  # rounded for stable labels/plots
    regression_line = slope * x + intercept

    # Vertical distance of every subject to each candidate line.
    dist_to_regression = np.abs(proportional_recovery['change observed']
                                - (slope * proportional_recovery['initial impairment'] + intercept))
    dist_to_PRR = np.abs(proportional_recovery['change observed']
                         - (0.7 * proportional_recovery['initial impairment'] + intercept))
    # Re-assign each subject to the closest line.
    proportional_recovery['linecluster'] = np.where(dist_to_regression < dist_to_PRR,
                                                    'own_regression', 'PRR')

    # ---- plotting ----
    cm = 1 / 2.54  # inches -> cm conversion factor
    plt.figure(figsize=(18 * cm, 9 * cm), dpi=300)
    sns.scatterplot(x='initial impairment', y='change observed', hue='linecluster',
                    data=proportional_recovery, palette='viridis', marker='o')
    sns.scatterplot(x=outliers['initial impairment'], y=outliers['change observed'],
                    color='red', marker='x', s=100, label='Outliers')
    sns.lineplot(x=x, y=regression_line, color='red',
                 label=f'Linear fit: y = {slope:.2f}x + {intercept:.2f}')
    plt.plot([x.min(), x.max()], [0.7 * x.min() + intercept, 0.7 * x.max() + intercept],
             color='gray', linestyle="--", linewidth=1, label=f'y = 0.7x + {intercept}')
    plt.xlabel('initial impairment (P3 - BL)', fontsize=10, fontweight='bold')
    plt.ylabel('change observed (P28 - ii)', fontsize=10, fontweight='bold')
    plt.legend(frameon=False, fontsize=7)
    plt.grid(False)
    plt.xticks(fontsize=10, fontweight='bold')
    plt.yticks(fontsize=10, fontweight='bold')
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)

    if cc == (R - 1):
        # Final pass: flag IQR outliers among the PRR-cluster points,
        # measured against the PRR line itself.
        final_slope = 0.7
        final_intercept = round(intercept, 2)
        prr_rows = proportional_recovery[proportional_recovery["linecluster"] == "PRR"]
        prr_line = final_slope * prr_rows['initial impairment'] + final_intercept
        final_residuals = prr_rows['change observed'] - prr_line

        Q1_final = np.percentile(final_residuals, 25)
        Q3_final = np.percentile(final_residuals, 75)
        IQR_final = Q3_final - Q1_final
        threshold_final = 1.5 * IQR_final

        # BUGFIX: the original wrote `A | B & (linecluster == "PRR")`; `&`
        # binds tighter than `|`, so the low-residual condition bypassed the
        # PRR restriction, and the partial-index mask could not be aligned
        # with the full frame. Masking the PRR subset directly is both
        # correct and alignment-safe.
        outlier_mask = ((final_residuals < Q1_final - threshold_final)
                        | (final_residuals > Q3_final + threshold_final))
        new_outliers = prr_rows[outlier_mask]
        proportional_recovery.loc[new_outliers.index, 'linecluster'] = "outlier"

        plt.legend(frameon=False, fontsize=8)
        plt.savefig(os.path.join(outfile_path, "ppr_rule.png"), dpi=300)
        plt.savefig(os.path.join(outfile_path, "ppr_rule.svg"))
        plt.show()

plt.tight_layout()  # adjust layout to prevent clipping of labels
plt.show()
# Report how many subjects ended up following the proportional recovery rule.
labels = proportional_recovery["linecluster"]
ppr_count = int((labels == "PRR").sum())
own_regression_count = int((labels == "own_regression").sum())
print("Count of subjects in PRR cluster:", ppr_count)
print("Count of subjects in own_regression cluster:", own_regression_count)
print("percent of subjects that follow the PRR: %", ppr_count / (own_regression_count + ppr_count) * 100)

# Carry the cluster assignments back onto the full behaviour table by StudyID.
df = df.merge(
    proportional_recovery[['StudyID', 'linecluster', 'cluster ii', 'cluster co']],
    on='StudyID', how='left',
)

# Make 'S' a legal category before assigning it to the sham rows.
for col in ('linecluster', 'cluster ii', 'cluster co'):
    if col not in df.columns:
        continue
    df[col] = df[col].astype('category')
    if 'S' not in df[col].cat.categories:
        df[col] = df[col].cat.add_categories('S')

# All sham animals get the sentinel label 'S'.
df.loc[df['Group'] == "Sham", ['linecluster', 'cluster ii', 'cluster co']] = "S"

print(df)
df.to_csv(input_file_path_new, index=False)
#%%
# Hierarchical clustering (Ward linkage) of the subjects in the
# (change predicted, change observed) plane.
# BUGFIX: linkage/pdist/dendrogram/fcluster were never imported in the
# original file (NameError at runtime); they now come from scipy at the top.
Z = linkage(pdist(proportional_recovery[['change predicted', 'change observed']]), method='ward')

# BUGFIX (consistency): 1 inch = 2.54 cm — the original used 1/2.53 here
# while the rest of the file uses 1/2.54.
cm = 1 / 2.54
plt.figure(figsize=(9 * cm, 9 * cm), dpi=300)
dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.grid(False)
plt.show()

# Elbow plot: merge distance against the resulting number of clusters.
num_samples = len(Z) + 1  # a linkage of n samples has n-1 merge rows
distances = Z[:, 2]       # third linkage column = merge distance
num_clusters = range(1, num_samples)
plt.figure(figsize=(10 * cm, 7 * cm), dpi=300)
plt.plot(num_clusters, distances[::-1], marker='o')  # reversed so x = cluster count
plt.title('Number of Clusters vs. Distance')
plt.xlabel('Number of Clusters')
plt.ylabel('Distance')
plt.grid(True)
plt.show()

# Cut the dendrogram at a fixed distance threshold and store the labels.
max_d = 10
clusters = fcluster(Z, max_d, criterion='distance')
proportional_recovery['Cluster'] = clusters
# Fit 'change predicted' as a linear function of 'change observed' inside
# each hierarchical cluster and print the resulting model.
# BUGFIX: the original used sm.add_constant/sm.OLS but statsmodels was never
# imported anywhere in the file, so this section raised a NameError.
# scipy.stats.linregress (already imported at the top) produces the
# identical slope/intercept for a single-predictor OLS fit; the statsmodels
# summary table is replaced by a compact line with the key statistics.
for cluster in proportional_recovery['Cluster'].unique():
    cluster_data = proportional_recovery[proportional_recovery['Cluster'] == cluster]
    fit = linregress(cluster_data['change observed'], cluster_data['change predicted'])
    intercept = fit.intercept
    slope = fit.slope

    # Print cluster information and fit statistics
    print(f"Cluster {cluster}:")
    print(f"  slope={slope:.4f}  intercept={intercept:.4f}  "
          f"r^2={fit.rvalue ** 2:.4f}  p={fit.pvalue:.4g}  stderr={fit.stderr:.4f}")

    # Print the mathematical model
    print(f"The mathematical model for Cluster {cluster} is: y = {intercept:.4f} + {slope:.4f} * x")
# Predicted vs. observed change, coloured by hierarchical cluster, with the
# identity line y = x as the perfect-prediction reference.
plt.figure(figsize=(18 * cm, 18 * cm), dpi=300)
sns.scatterplot(data=proportional_recovery, x='change observed', y='change predicted',
                hue='Cluster', palette='tab10')
plt.title('Change Predicted vs Change Observed by Cluster')
plt.xlabel('Change Observed')
plt.ylabel('Change Predicted')
plt.legend(title='Cluster')

# Span the identity line over the combined range of both columns.
observed = proportional_recovery['change observed']
predicted = proportional_recovery['change predicted']
min_val = min(observed.min(), predicted.min())
max_val = max(observed.max(), predicted.max())
plt.plot([min_val, max_val], [min_val, max_val], color='black', linestyle='--')
plt.show()
# Per-subject recovery ratio: observed change divided by the initial
# impairment (P3 - BL); a zero denominator produces inf, filtered below.
proportional_recovery["New_PR"] = (
    proportional_recovery["change observed"]
    / (proportional_recovery["P3"] - proportional_recovery["BL"])
)

# Mean ratio within cluster 1, keeping only finite values.
in_cluster_one = proportional_recovery["Cluster"] == 1
is_finite = np.isfinite(proportional_recovery["New_PR"])
valid_new_pr = proportional_recovery[in_cluster_one & is_finite]
mean_new_pr = valid_new_pr["New_PR"].mean()
print(mean_new_pr)
|