proportional_recovery_plot_revised.py 13 KB

# -*- coding: utf-8 -*-
"""
Created on Fri May 31 17:09:18 2024
@author: arefks
"""
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import linregress
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import pdist
# Get the directory where the code file is located
code_dir = os.path.dirname(os.path.abspath(__file__))
# Get the parent directory of the code directory
parent_dir = os.path.dirname(code_dir)
# Specify the input and output paths relative to the code file
input_file_path = os.path.join(parent_dir, 'output', 'behavior_data_processed.csv')
outfile_path = os.path.join(parent_dir, 'output', 'figures')
input_file_path_new = os.path.join(parent_dir, 'output', 'behavior_data_processed_prr.csv')
# Read the CSV file into a pandas DataFrame
df = pd.read_csv(input_file_path)
  22. df["averageScore_shifted"] = df["averageScore"] + np.abs(np.min(df["averageScore"]))
# Filter the DataFrame for the stroke group, excluding "S" in "2 Cluster"
df_stroke = df[(df["Group"] == "Stroke") & (df["2 Cluster"] != "S")]
# Group-level averages, kept as fallbacks in case of NaN
average_baseline_incase_of_nan = df_stroke[df_stroke["TimePoint_merged"] == 0]["averageScore_shifted"].mean()
average_28_incase_of_nan = df_stroke[df_stroke["TimePoint_merged"] == 28]["averageScore_shifted"].mean()
# Initialize an empty list to store proportional recovery data
proportional_recovery_list = []
# Loop through unique StudyIDs in the stroke group
for ss in df_stroke["StudyID"].unique():
    df_temp = df[df["StudyID"] == ss]
    average_score_day3 = df_temp[df_temp["TimePoint_merged"] == 3]["averageScore_shifted"].values
    average_score_baseline = df_temp[df_temp["TimePoint_merged"] == 0]["averageScore_shifted"].values
    average_score_day28 = df_temp[df_temp["TimePoint_merged"] == 28]["averageScore_shifted"].values
    cluster = df_temp["3 Cluster"].values[1]
    # Skip subjects with a missing baseline score
    if len(average_score_baseline) == 0 or np.isnan(average_score_baseline[0]):
        continue
    average_score_baseline = average_score_baseline[0]
    # Fall back to the day-21 score if no day-28 score is available
    if len(average_score_day28) > 0:
        average_score_day28 = average_score_day28[0]
    else:
        average_score_day28 = df_temp[df_temp["TimePoint_merged"] == 21]["averageScore_shifted"].values
        if len(average_score_day28) > 0:
            average_score_day28 = average_score_day28[0]
        else:
            continue
    # Skip subjects with no day-3 score
    if len(average_score_day3) == 0:
        continue
    average_score_day3 = average_score_day3[0]
    initial_impairment = average_score_day3 - average_score_baseline
    if initial_impairment <= 0 or np.isnan(initial_impairment):
        continue
    # Calculate the observed change
    observed_change = -(average_score_day28 - initial_impairment)
    error = -0.4
    estimated_change = 0.7 * initial_impairment + error
    proportional_recovery_list.append({"StudyID": ss, "initial impairment": initial_impairment,
                                       "change predicted": estimated_change, "change observed": observed_change,
                                       "P3": average_score_day3, "BL": average_score_baseline,
                                       "P28": average_score_day28, "cluster": cluster})
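# The 0.7 factor above encodes the proportional recovery rule (PRR) from the
# stroke-recovery literature: patients are predicted to recover roughly 70% of
# their initial impairment, i.e. change_predicted ≈ 0.7 * initial_impairment
# (here with a small constant offset, `error`).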
# Convert the list to a DataFrame
proportional_recovery = pd.DataFrame(proportional_recovery_list)
# Output the proportional recovery DataFrame
print(proportional_recovery)
# Divide the data into three groups based on "initial impairment" and "change observed"
proportional_recovery['cluster ii'] = pd.qcut(proportional_recovery['initial impairment'], 3, labels=['Low ii', 'Medium ii', 'High ii'])
proportional_recovery['cluster co'] = pd.qcut(proportional_recovery['change observed'], 3, labels=['Low co', 'Medium co', 'High co'])
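# pd.qcut bins by rank into equal-sized tertiles, so each Low/Medium/High
# label covers roughly one third of the subjects.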
# Initialize 'linecluster' to 'own_regression'
proportional_recovery['linecluster'] = "own_regression"
# Perform an initial linear regression on the whole dataset to determine outliers
x_initial = proportional_recovery['initial impairment']
y_initial = proportional_recovery['change observed']
slope_initial, intercept_initial, r_value_initial, p_value_initial, std_err_initial = linregress(x_initial, y_initial)
# Define the initial regression line
regression_line_initial = slope_initial * x_initial + intercept_initial
# Calculate residuals for the whole dataset
residuals_initial = y_initial - regression_line_initial
# Calculate the interquartile range (IQR) of the residuals
Q1_initial = np.percentile(residuals_initial, 25)
Q3_initial = np.percentile(residuals_initial, 75)
IQR_initial = Q3_initial - Q1_initial
# Set a threshold for identifying outliers using the IQR method
threshold_initial = 1.5 * IQR_initial
# Identify outliers for the whole dataset
outliers = proportional_recovery[(residuals_initial < Q1_initial - threshold_initial) | (residuals_initial > Q3_initial + threshold_initial)]
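# This is Tukey's fence: residuals outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
# are flagged as outliers.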
R = 15
# Iteratively refine the line assignment over R passes
for cc in range(R):
    # Perform linear regression on the 'own_regression' points
    x = proportional_recovery[proportional_recovery["linecluster"] == "own_regression"]['initial impairment']
    y = proportional_recovery[proportional_recovery["linecluster"] == "own_regression"]['change observed']
    slope, intercept, r_value, p_value, std_err = linregress(x, y)
    intercept = round(intercept, 2)  # Round the intercept to two decimal points
    # Define the regression line
    regression_line = slope * x + intercept
    # Calculate vertical distances to the regression line and to the PRR line
    dist_to_regression = np.abs(proportional_recovery['change observed'] - (slope * proportional_recovery['initial impairment'] + intercept))
    dist_to_PRR = np.abs(proportional_recovery['change observed'] - (0.7 * proportional_recovery['initial impairment'] + intercept))
    # Assign each point to the closest line
    proportional_recovery['linecluster'] = np.where(dist_to_regression < dist_to_PRR, 'own_regression', 'PRR')
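    # This fit-then-reassign loop works like a two-line k-means: alternately
    # refit the free line to its current points, then relabel every point by
    # whichever line (free fit vs. fixed 0.7 slope) is vertically closer.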
    # Set figure size
    cm = 1 / 2.54  # Conversion factor from centimetres to inches
    plt.figure(figsize=(18 * cm, 9 * cm), dpi=300)  # 18 cm by 9 cm
    # Plot the results
    sns.scatterplot(x='initial impairment', y='change observed', hue='linecluster', data=proportional_recovery, palette='viridis', marker='o')
    sns.scatterplot(x=outliers['initial impairment'], y=outliers['change observed'], color='red', marker='x', s=100, label='Outliers')
    sns.lineplot(x=x, y=regression_line, color='red', label=f'Linear fit: y = {slope:.2f}x + {intercept:.2f}')
    plt.plot([x.min(), x.max()], [0.7 * x.min() + intercept, 0.7 * x.max() + intercept], color='gray', linestyle="--", linewidth=1, label=f'y = 0.7x + {intercept}')
    # Set axis labels
    plt.xlabel('initial impairment (P3 - BL)', fontsize=10, fontweight='bold')
    plt.ylabel('change observed (P28 - ii)', fontsize=10, fontweight='bold')
    # Set legend properties
    plt.legend(frameon=False, fontsize=7)  # No legend border, smaller font size
    # Remove the grid
    plt.grid(False)
    # Set font size and weight for tick labels
    plt.xticks(fontsize=10, fontweight='bold')
    plt.yticks(fontsize=10, fontweight='bold')
    # Turn off the top and right plot frames
    ax = plt.gca()
    ax.spines['right'].set_visible(False)
    ax.spines['top'].set_visible(False)
    if cc == (R - 1):
        # On the final pass, determine outliers among the PRR points using
        # residuals from the fixed-slope PRR line
        final_slope = 0.7
        final_intercept = round(intercept, 2)  # Round the intercept to two decimal points
        prr_points = proportional_recovery[proportional_recovery["linecluster"] == "PRR"]
        # Calculate the PRR line and the residuals from it
        prr_line = final_slope * prr_points['initial impairment'] + final_intercept
        final_residuals = prr_points['change observed'] - prr_line
        # Calculate the interquartile range (IQR) of the residuals
        Q1_final = np.percentile(final_residuals, 25)
        Q3_final = np.percentile(final_residuals, 75)
        IQR_final = Q3_final - Q1_final
        # Set a threshold for identifying new outliers using the IQR method
        threshold_final = 1.5 * IQR_final
        # Identify new outliers based on the PRR rule
        new_outliers = prr_points[
            (final_residuals < Q1_final - threshold_final) | (final_residuals > Q3_final + threshold_final)
        ]
        # Update the "linecluster" column to "outlier" for the new outliers
        proportional_recovery.loc[new_outliers.index, 'linecluster'] = "outlier"
        #sns.scatterplot(x=new_outliers['initial impairment'], y=new_outliers['change observed'], color='blue', marker='d', s=100, label='New Outliers')
        plt.legend(frameon=False, fontsize=8)  # No legend border, smaller font size
        plt.tight_layout()  # Adjust layout before saving to prevent clipping of labels
        # Save the plot at the specified outfile_path
        plt.savefig(os.path.join(outfile_path, "ppr_rule.png"), dpi=300)
        plt.savefig(os.path.join(outfile_path, "ppr_rule.svg"))
    # Show the plot for this iteration
    plt.tight_layout()
    plt.show()
# Calculate the percentage of subjects that follow the PRR
ppr_count = len(proportional_recovery[proportional_recovery["linecluster"] == "PRR"])
own_regression_count = len(proportional_recovery[proportional_recovery["linecluster"] == "own_regression"])
print("Count of subjects in PRR cluster:", ppr_count)
print("Count of subjects in own_regression cluster:", own_regression_count)
print(f"Percent of subjects that follow the PRR: {ppr_count / (own_regression_count + ppr_count) * 100:.1f}%")
# Map 'linecluster' and the tertile clusters from proportional_recovery to df based on 'StudyID'
df = df.merge(proportional_recovery[['StudyID', 'linecluster', 'cluster ii', 'cluster co']], on='StudyID', how='left')
# Add 'S' to the categories of the categorical columns
for col in ['linecluster', 'cluster ii', 'cluster co']:
    if col in df.columns:
        df[col] = df[col].astype('category')
        if 'S' not in df[col].cat.categories:
            df[col] = df[col].cat.add_categories('S')
# Set values to 'S' for the 'Sham' group
sham_indices = df['Group'] == "Sham"
df.loc[sham_indices, ['linecluster', 'cluster ii', 'cluster co']] = "S"
# Display the updated dataframe and save it
print(df)
df.to_csv(input_file_path_new, index=False)
#%%
# Perform hierarchical clustering using Ward's method
Z = linkage(pdist(proportional_recovery[['change predicted', 'change observed']]), method='ward')
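# Ward's method merges, at each step, the pair of clusters whose union gives
# the smallest increase in total within-cluster variance.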
cm = 1 / 2.54  # Conversion factor from centimetres to inches
# Plot the dendrogram
plt.figure(figsize=(9 * cm, 9 * cm), dpi=300)
dendrogram(Z)
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Sample index')
plt.ylabel('Distance')
plt.grid(False)
plt.show()
# Extract the merge distances from the linkage matrix
num_samples = len(Z) + 1  # Number of samples
distances = Z[:, 2]
num_clusters = range(1, num_samples)
# Plot the number of clusters against the distances
plt.figure(figsize=(10 * cm, 7 * cm), dpi=300)
plt.plot(num_clusters, distances[::-1], marker='o')  # Reverse the distances for correct alignment
plt.title('Number of Clusters vs. Distance')
plt.xlabel('Number of Clusters')
plt.ylabel('Distance')
plt.grid(True)
plt.show()
max_d = 10
# Form flat clusters by cutting the dendrogram at distance max_d
clusters = fcluster(Z, max_d, criterion='distance')
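# With criterion='distance', every merge above height max_d is undone, so the
# resulting clusters are the dendrogram subtrees below the cut.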
# Add cluster labels to the DataFrame
proportional_recovery['Cluster'] = clusters
# Perform a linear regression within each cluster
for cluster in proportional_recovery['Cluster'].unique():
    cluster_data = proportional_recovery[proportional_recovery['Cluster'] == cluster]
    X = cluster_data[['change observed']]
    X = sm.add_constant(X)  # Add a constant for the intercept
    y = cluster_data['change predicted']
    model = sm.OLS(y, X).fit()
    # Print cluster information and the model summary
    print(f"Cluster {cluster}:")
    print(model.summary())
    # Get the model coefficients
    intercept = model.params['const']
    slope = model.params['change observed']
    # Print the mathematical model
    print(f"The mathematical model for Cluster {cluster} is: y = {intercept:.4f} + {slope:.4f} * x")
# Plot 'change predicted' vs 'change observed' with hue representing the clusters
plt.figure(figsize=(18 * cm, 18 * cm), dpi=300)
sns.scatterplot(data=proportional_recovery, x='change observed', y='change predicted', hue='Cluster', palette='tab10')
plt.title('Change Predicted vs Change Observed by Cluster')
plt.xlabel('Change Observed')
plt.ylabel('Change Predicted')
plt.legend(title='Cluster')
# Add a y = x reference line
max_val = max(proportional_recovery['change observed'].max(), proportional_recovery['change predicted'].max())
min_val = min(proportional_recovery['change observed'].min(), proportional_recovery['change predicted'].min())
plt.plot([min_val, max_val], [min_val, max_val], color='black', linestyle='--')
plt.show()
# Calculate the 'New_PR' column
proportional_recovery["New_PR"] = proportional_recovery["change observed"] / (proportional_recovery["P3"] - proportional_recovery["BL"])
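# New_PR is the observed change as a fraction of the initial impairment
# (P3 - BL); under the proportional recovery rule it should sit near 0.7.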
# Filter out 'inf' values
valid_new_pr = proportional_recovery[(proportional_recovery["Cluster"] == 1) & (np.isfinite(proportional_recovery["New_PR"]))]
# Calculate the mean of the filtered values
mean_new_pr = valid_new_pr["New_PR"].mean()
print("Mean New_PR in Cluster 1:", mean_new_pr)