ttest_BL_P28_differences.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114
  1. # -*- coding: utf-8 -*-
  2. """
  3. Created on Mon Oct 28 12:21:08 2024
  4. @author: arefk
  5. """
  6. import os
  7. import pandas as pd
  8. import numpy as np
  9. from scipy.stats import shapiro, ttest_ind, mannwhitneyu
  10. from tqdm import tqdm
  11. from concurrent.futures import ThreadPoolExecutor, as_completed
  12. # Get the directory where the code file is located
  13. code_dir = os.path.dirname(os.path.abspath(__file__))
  14. parent_dir = os.path.dirname(code_dir)
  15. # Load the CSV data
  16. input_file_path = os.path.join(parent_dir, 'output', "Quantitative_outputs",'Quantitative_results_from_dwi_processing_only_in_stroke_affected_slices.csv')
  17. df = pd.read_csv(input_file_path, low_memory=False)
  18. # Initialize an empty list to store results
  19. results = []
  20. # Get unique values of masks, qtypes, groups, and dilation amounts
  21. unique_masks = df['mask_name'].unique()
  22. unique_qtypes = df['Qtype'].unique()
  23. unique_groups = df['Group'].unique()
  24. unique_dilations = df['dialation_amount'].unique()
  25. # Prepare the combinations for parallel processing
  26. combinations = [(mask, qtype, group, dilation) for mask in unique_masks for qtype in unique_qtypes for group in unique_groups for dilation in unique_dilations]
  27. # Function to process each combination
  28. def process_combination(mask, qtype, group, dilation):
  29. result = None
  30. # Filter data for timepoints 0 and 28 separately
  31. df_timepoint_0 = df[(df['merged_timepoint'] == 0) &
  32. (df['Group'] == group) &
  33. (df['mask_name'] == mask) &
  34. (df['Qtype'] == qtype) &
  35. (df['dialation_amount'] == dilation)]
  36. df_timepoint_28 = df[(df['merged_timepoint'] == 28) &
  37. (df['Group'] == group) &
  38. (df['mask_name'] == mask) &
  39. (df['Qtype'] == qtype) &
  40. (df['dialation_amount'] == dilation)]
  41. # Drop NaN values for the 'Value' column
  42. timepoint_0_values = df_timepoint_0['Value'].dropna()
  43. timepoint_28_values = df_timepoint_28['Value'].dropna()
  44. # Filter data after dropping NaN values to get subjects with non-null values
  45. df_timepoint_0_filtered = df_timepoint_0[df_timepoint_0['Value'].notna()]
  46. df_timepoint_28_filtered = df_timepoint_28[df_timepoint_28['Value'].notna()]
  47. # Only proceed if there are more than 8 subjects in either timepoint after dropping NaNs
  48. if len(df_timepoint_0_filtered['subjectID'].unique()) > 8 and len(df_timepoint_28_filtered['subjectID'].unique()) > 8:
  49. # Check if we have enough values to perform statistical tests
  50. if len(timepoint_0_values) > 0 and len(timepoint_28_values) > 0:
  51. # Perform Shapiro-Wilk normality test
  52. shapiro_timepoint_0_p = shapiro(timepoint_0_values)[1]
  53. shapiro_timepoint_28_p = shapiro(timepoint_28_values)[1]
  54. # Check if data is normally distributed
  55. if shapiro_timepoint_0_p < 0.05 or shapiro_timepoint_28_p < 0.05:
  56. # Use Mann-Whitney U test if data is not normally distributed
  57. stat, p_value = mannwhitneyu(timepoint_0_values, timepoint_28_values, alternative='two-sided')
  58. else:
  59. # Use Welch's t-test if data is normally distributed
  60. stat, p_value = ttest_ind(timepoint_0_values, timepoint_28_values, equal_var=False)
  61. # Store the result
  62. result = {
  63. 'mask_name': mask,
  64. 'Qtype': qtype,
  65. 'Group': group,
  66. 'dialation_amount': dilation,
  67. 'Pvalue': p_value
  68. }
  69. return result
  70. # Parallel processing using ThreadPoolExecutor with 6 workers
  71. with ThreadPoolExecutor(max_workers=6) as executor:
  72. futures = {executor.submit(process_combination, mask, qtype, group, dilation): (mask, qtype, group, dilation) for mask, qtype, group, dilation in combinations}
  73. # Iterate through completed tasks with progress bar
  74. with tqdm(total=len(futures), desc="Processing combinations in parallel") as pbar:
  75. for future in as_completed(futures):
  76. combination = futures[future]
  77. try:
  78. result = future.result()
  79. if result:
  80. results.append(result)
  81. except Exception as e:
  82. print(f"Error processing combination {combination}: {e}")
  83. finally:
  84. pbar.update(1)
  85. # Convert results to a DataFrame
  86. results_df = pd.DataFrame(results)
  87. # Define output path for the new CSV
  88. output_file_path = os.path.join(parent_dir, 'output', "Quantitative_outputs", 'Significance_timepoint_0_vs_28_only_in_stroke_slices.csv')
  89. # Save results to CSV
  90. results_df.to_csv(output_file_path, index=False)
  91. print(f"Significance analysis results saved to {output_file_path}")