import os import pandas as pd import seaborn as sns import matplotlib.pyplot as plt import statsmodels.formula.api as smf from statsmodels.stats.multicomp import MultiComparison from itertools import combinations import statsmodels import scipy # Set up warning suppression for convergence import warnings warnings.simplefilter(action='ignore') # Define the PlotBoxplot function def PlotBoxplot(filtered_df, qq, mm, output_folder): """ Plot boxplots for filtered dataframe and save as images. Parameters: filtered_df (DataFrame): Filtered dataframe based on "Qtype" and "mask_name". qq (str): Qtype value. mm (str): Mask_name value. output_folder (str): Path to the output folder to save images. """ # Set the color palette to Set2 sns.set_palette("Set2") # Set the figure size to 18 cm in width plt.figure(figsize=(18/2.54, 4)) # Convert 18 cm to inches # Create a boxplot sns.boxplot(data=filtered_df, x="merged_timepoint", y="Value", hue="Group", fliersize=3 ,flierprops={"marker": "o"}) # Set title and labels plt.title(f'{mm}') plt.xlabel('Timepoint') plt.ylabel(qq) # Show legend without title and frame plt.legend(title=None, frameon=False) # Save the plot as an image output_path = os.path.join(output_folder, f'{qq}_{mm}_mixedlm_significant.png') plt.savefig(output_path, dpi=100) # Close the plot to avoid displaying it plt.close() # Get the directory where the code file is located code_dir = os.path.dirname(os.path.abspath(__file__)) # Get the parent directory of the code directory parent_dir = os.path.dirname(code_dir) # Create a new folder for mixed model analysis mixed_model_dir = os.path.join(parent_dir, 'output', "Final_Quantitative_output", 'mixed_model_analysis') os.makedirs(mixed_model_dir, exist_ok=True) # Step 4: Save the resulting dataframe to a CSV file input_file_path = os.path.join(parent_dir, 'output', "Final_Quantitative_output", 'Quantitative_results_from_dwi_processing.csv') df = pd.read_csv(input_file_path) qc_csv = os.path.join(parent_dir,"input","AIDAqc_ouptut_for_data","Voting_remapped.csv") df_qc = pd.read_csv(qc_csv) df_qc_5 = df_qc[(df_qc["Voting outliers (from 5)"]>3) & (df_qc["sequence_type"]=="diff")] # Filtering the dataframe based on conditions df_f0 = df[(df["dialation_amount"] == 3) & (~df["merged_timepoint"].isin([42, 56]))] # Merge df_f1 and df_qc_5 on "subjectID" and "merged_timepoint" merged_df = pd.merge(df_f0, df_qc_5, on=["subjectID", "merged_timepoint"], how="left", indicator=True) # Drop the entries that are present in both df_f1 and df_qc_5 df_f1 = merged_df[merged_df["_merge"] == "left_only"].drop(columns=["_merge"]) # Drop unnecessary columns with NaN values df_f1 = df_f1.dropna(axis=1, how="all") # Create lists to store results and terminal output results = [] # Iterate over unique values of "Qtype" and "mask_name" for qq in df["Qtype"].unique(): for mm in df["mask_name"].unique(): # Filter the dataframe based on current "Qtype" and "mask_name" filtered_df = df_f1[(df_f1["Qtype"] == qq) & (df_f1["mask_name"] == mm)] # Fit the mixed-effects model model_name = f"{qq}_{mm}" md = smf.mixedlm("Value ~ merged_timepoint", filtered_df, groups=filtered_df["Group"]) mdf = md.fit(method=["lbfgs"]) # Log file for each combination of qq and mm log_file = os.path.join(mixed_model_dir, f"{model_name}_log.txt") # Check if the p-value is significant if mdf.pvalues['merged_timepoint'] < 0.05: with open(log_file, 'w') as log: # Print model name to log file log.write(f"Model Name: {model_name}\n") # Print the model summary to log file log.write(str(mdf.summary()) + '\n') # Plot and save boxplots for significant models PlotBoxplot(filtered_df, qq, mm, mixed_model_dir) # Perform Tukey's HSD test for post hoc analysis for gg in filtered_df["Group"].unique(): filtered_df2 = filtered_df[filtered_df["Group"]==gg] mc = MultiComparison(filtered_df2["Value"], filtered_df2["merged_timepoint"]) tukey_result = mc.tukeyhsd() # Print the summary of the test to log file log.write(f"Posthoc Analysis for {gg}:\n") log.write(str(tukey_result.summary()) + '\n') # Perform the Sidak multiple comparisons for the group effect group_combinations = combinations(filtered_df["Group"].unique(), 2) log.write("Sidak Multiple Comparisons:\n") log.write("Group1\tGroup2\tTimePoint\tPValue\tSignificantDifference\n") for group1, group2 in group_combinations: for time_point in filtered_df["merged_timepoint"].unique(): group1_data = filtered_df[(filtered_df["Group"] == group1) & (filtered_df["merged_timepoint"] == time_point)]["Value"] group2_data = filtered_df[(filtered_df["Group"] == group2) & (filtered_df["merged_timepoint"] == time_point)]["Value"] sidak_result = statsmodels.stats.multitest.multipletests(scipy.stats.ttest_ind(group1_data, group2_data)[1], method='sidak') if sidak_result[0]: log.write(f"{group1}\t{group2}\t{time_point}\t{sidak_result[1]}\tYes\n") else: log.write(f"{group1}\t{group2}\t{time_point}\t{sidak_result[1]}\tNo\n") # Append the results to the list results.append({'Qtype': qq, 'mask_name': mm, 'p_value': mdf.pvalues['merged_timepoint']}) # Create a DataFrame from results results_df = pd.DataFrame(results) # Save results_df as CSV in mixed_model_analysis folder output_file_path = os.path.join(mixed_model_dir, 'mixed_model_results.csv') results_df.to_csv(output_file_path, index=False) # Print the table print(results_df)