123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114 |
- import os
- import pandas as pd
- import seaborn as sns
- import matplotlib.pyplot as plt
- import statsmodels.api as sm
- import statsmodels.formula.api as smf
- from statsmodels.stats.multicomp import MultiComparison
- import itertools
- import scipy.stats
- import statsmodels.stats.multitest
- # Get the directory where the code file is located
- code_dir = os.path.dirname(os.path.abspath(__file__))
- # Get the parent directory of the code directory
- parent_dir = os.path.dirname(code_dir)
- # Create a new folder for cluster analysis
- cluster_analysis_dir = os.path.join(parent_dir, 'output', 'cluster_analysis')
- os.makedirs(cluster_analysis_dir, exist_ok=True)
- # Specify the input file path relative to the code file
- input_file_path = os.path.join(parent_dir, 'output', 'Merged_behaviour_data_ztransform_kmeans_prr_clustered.csv')
- # Read the CSV file into a Pandas DataFrame
- df = pd.read_csv(input_file_path)
- df = df[df['linefit_cluster'] != "outlier"]
- # Define the clusters
- clusters = ['linefit_cluster', "2 Cluster", "3 Cluster", "4 Cluster", "5 Cluster", "cluster co"]
- # Create a list to store the results
- results = []
- def perform_sidac_test(df, cluster, score_column):
- sidak_results = []
- for time_point in df["TimePointMerged"].unique():
- df_time = df[df["TimePointMerged"] == time_point]
- group_combinations = list(itertools.combinations(df_time[cluster].unique(), 2))
- for group1, group2 in group_combinations:
- group1_data = df_time[df_time[cluster] == group1][score_column]
- group2_data = df_time[df_time[cluster] == group2][score_column]
- t_stat, p_val = scipy.stats.ttest_ind(group1_data, group2_data)
- sidak_p_val = statsmodels.stats.multitest.multipletests([p_val], method='sidak')[1][0]
- sidak_results.append({'Group1': group1, 'Group2': group2, 'TimePoint': time_point, 'PValue': sidak_p_val})
- return sidak_results
- Scores = ["DeficitScore", "C_PawDragPercent", "GW_FootFault", "RB_HindlimbDrop"]
- # Loop through each score and fit the mixed-effects model
- for score in Scores:
- for cluster in clusters:
- # Sort the data by the current score within each TimePointMerged
- df_sorted = df.sort_values(by=['TimePointMerged', score], ascending=[True, True])
- df_sorted = df_sorted.dropna(subset=[score])
- df_sorted = df_sorted.dropna(subset=['linefit_cluster'])
-
- # Fit the mixed-effects model
- model = smf.mixedlm(f"{score} ~ TimePointMerged", df_sorted, groups=df_sorted[cluster])
- result = model.fit()
-
- # Perform Tukey's HSD test for post hoc analysis
- tukey_results = {}
- for group in df_sorted[cluster].unique():
- df_group = df_sorted[df_sorted[cluster] == group]
- mc = MultiComparison(df_group[score], df_group['TimePointMerged'])
- tukey_result = mc.tukeyhsd()
- tukey_results[group] = tukey_result
-
- # Perform the Sidak multiple comparisons for group and time effect
- sidak_results = perform_sidac_test(df_sorted, cluster, score)
-
- # Create a combined log file for mixed model, Tukey, and Sidak results
- log_file_combined = os.path.join(cluster_analysis_dir, f'{cluster}_{score}_analysis_log.txt')
- with open(log_file_combined, 'w') as log:
- log.write(f'Model Name: {cluster}\n')
- log.write(str(result.summary()) + '\n')
-
- for group, tukey_result in tukey_results.items():
- log.write(f'Posthoc Analysis for {group}:\n')
- log.write(str(tukey_result.summary()) + '\n')
-
- log.write("Sidak Multiple Comparisons:\n")
- log.write("Group1\tGroup2\tTimePoint\tPValue\tSignificantDifference\n")
- for result in sidak_results:
- significant = 'Yes' if result['PValue'] < 0.05 else 'No'
- log.write(f"{result['Group1']}\t{result['Group2']}\t{result['TimePoint']}\t{result['PValue']}\t{significant}\n")
-
- # Append the results to the list
- results.append({'Cluster': cluster, 'Score': score, 'Mixed_Model': result, 'Tukey_HSD': tukey_results, 'Sidak_Comparisons': sidak_results})
-
- cm = 1 / 2.54
-
- plt.figure(figsize=(18*cm, 6*cm))
-
- # Define the properties for the outliers
- flierprops = dict(marker='o', markerfacecolor='b', markersize=3, linestyle='none')
-
- g = sns.catplot(
- data=df_sorted, x=cluster, y=score, hue="TimePointMerged",
- palette="Set2", kind="box", height=5*cm, aspect=3.6, errorbar=('ci', 95),
- flierprops=flierprops, legend=False
- )
-
- g.despine(left=True)
- g.set_axis_labels(cluster, score)
- g.set_titles(f'Boxplot for {cluster} - {score}')
-
- # Move the legend outside of the plot
- plt.legend(loc='upper left', bbox_to_anchor=(1, 1))
-
- # Save the plot to a file
- plot_file = os.path.join(cluster_analysis_dir, f'{cluster}_{score}_boxplot.png')
- plt.savefig(plot_file, bbox_inches='tight', dpi=300)
- plt.close()
|