clusterBoxPlot_with_statistics.py 5.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. import os
  2. import pandas as pd
  3. import seaborn as sns
  4. import matplotlib.pyplot as plt
  5. import statsmodels.api as sm
  6. import statsmodels.formula.api as smf
  7. from statsmodels.stats.multicomp import MultiComparison
  8. import itertools
  9. import scipy.stats
  10. import statsmodels.stats.multitest
  11. # Get the directory where the code file is located
  12. code_dir = os.path.dirname(os.path.abspath(__file__))
  13. # Get the parent directory of the code directory
  14. parent_dir = os.path.dirname(code_dir)
  15. # Create a new folder for cluster analysis
  16. cluster_analysis_dir = os.path.join(parent_dir, 'output', 'cluster_analysis')
  17. os.makedirs(cluster_analysis_dir, exist_ok=True)
  18. # Specify the input file path relative to the code file
  19. input_file_path = os.path.join(parent_dir, 'output', 'Merged_behaviour_data_ztransform_kmeans_prr_clustered.csv')
  20. # Read the CSV file into a Pandas DataFrame
  21. df = pd.read_csv(input_file_path)
  22. df = df[df['linefit_cluster'] != "outlier"]
  23. # Define the clusters
  24. clusters = ['linefit_cluster', "2 Cluster", "3 Cluster", "4 Cluster", "5 Cluster", "cluster co"]
  25. # Create a list to store the results
  26. results = []
  27. def perform_sidac_test(df, cluster, score_column):
  28. sidak_results = []
  29. for time_point in df["TimePointMerged"].unique():
  30. df_time = df[df["TimePointMerged"] == time_point]
  31. group_combinations = list(itertools.combinations(df_time[cluster].unique(), 2))
  32. for group1, group2 in group_combinations:
  33. group1_data = df_time[df_time[cluster] == group1][score_column]
  34. group2_data = df_time[df_time[cluster] == group2][score_column]
  35. t_stat, p_val = scipy.stats.ttest_ind(group1_data, group2_data)
  36. sidak_p_val = statsmodels.stats.multitest.multipletests([p_val], method='sidak')[1][0]
  37. sidak_results.append({'Group1': group1, 'Group2': group2, 'TimePoint': time_point, 'PValue': sidak_p_val})
  38. return sidak_results
  39. Scores = ["DeficitScore", "C_PawDragPercent", "GW_FootFault", "RB_HindlimbDrop"]
  40. # Loop through each score and fit the mixed-effects model
  41. for score in Scores:
  42. for cluster in clusters:
  43. # Sort the data by the current score within each TimePointMerged
  44. df_sorted = df.sort_values(by=['TimePointMerged', score], ascending=[True, True])
  45. df_sorted = df_sorted.dropna(subset=[score])
  46. df_sorted = df_sorted.dropna(subset=['linefit_cluster'])
  47. # Fit the mixed-effects model
  48. model = smf.mixedlm(f"{score} ~ TimePointMerged", df_sorted, groups=df_sorted[cluster])
  49. result = model.fit()
  50. # Perform Tukey's HSD test for post hoc analysis
  51. tukey_results = {}
  52. for group in df_sorted[cluster].unique():
  53. df_group = df_sorted[df_sorted[cluster] == group]
  54. mc = MultiComparison(df_group[score], df_group['TimePointMerged'])
  55. tukey_result = mc.tukeyhsd()
  56. tukey_results[group] = tukey_result
  57. # Perform the Sidak multiple comparisons for group and time effect
  58. sidak_results = perform_sidac_test(df_sorted, cluster, score)
  59. # Create a combined log file for mixed model, Tukey, and Sidak results
  60. log_file_combined = os.path.join(cluster_analysis_dir, f'{cluster}_{score}_analysis_log.txt')
  61. with open(log_file_combined, 'w') as log:
  62. log.write(f'Model Name: {cluster}\n')
  63. log.write(str(result.summary()) + '\n')
  64. for group, tukey_result in tukey_results.items():
  65. log.write(f'Posthoc Analysis for {group}:\n')
  66. log.write(str(tukey_result.summary()) + '\n')
  67. log.write("Sidak Multiple Comparisons:\n")
  68. log.write("Group1\tGroup2\tTimePoint\tPValue\tSignificantDifference\n")
  69. for result in sidak_results:
  70. significant = 'Yes' if result['PValue'] < 0.05 else 'No'
  71. log.write(f"{result['Group1']}\t{result['Group2']}\t{result['TimePoint']}\t{result['PValue']}\t{significant}\n")
  72. # Append the results to the list
  73. results.append({'Cluster': cluster, 'Score': score, 'Mixed_Model': result, 'Tukey_HSD': tukey_results, 'Sidak_Comparisons': sidak_results})
  74. cm = 1 / 2.54
  75. plt.figure(figsize=(18*cm, 6*cm))
  76. # Define the properties for the outliers
  77. flierprops = dict(marker='o', markerfacecolor='b', markersize=3, linestyle='none')
  78. g = sns.catplot(
  79. data=df_sorted, x=cluster, y=score, hue="TimePointMerged",
  80. palette="Set2", kind="box", height=5*cm, aspect=3.6, errorbar=('ci', 95),
  81. flierprops=flierprops, legend=False
  82. )
  83. g.despine(left=True)
  84. g.set_axis_labels(cluster, score)
  85. g.set_titles(f'Boxplot for {cluster} - {score}')
  86. # Move the legend outside of the plot
  87. plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
  88. # Save the plot to a file
  89. plot_file = os.path.join(cluster_analysis_dir, f'{cluster}_{score}_boxplot.png')
  90. plt.savefig(plot_file, bbox_inches='tight', dpi=300)
  91. plt.close()