# -*- coding: utf-8 -*-
"""
Created on Mon Oct 28 12:21:08 2024
@author: arefk

Compare the 'Value' measurements at merged timepoint 0 against merged
timepoint 3 for every mask / Qtype / Group / dilation combination found in the
stroke-affected-slices CSV, and write the resulting p-values to a new CSV.
"""
- import os
- import pandas as pd
- import numpy as np
- from scipy.stats import shapiro, ttest_ind, mannwhitneyu
- from tqdm import tqdm
- from concurrent.futures import ThreadPoolExecutor, as_completed
# Locate the project root: the directory one level above this script's folder.
code_dir = os.path.dirname(os.path.abspath(__file__))
parent_dir = os.path.dirname(code_dir)

# Read the quantitative results table that the whole analysis works on.
input_file_path = os.path.join(
    parent_dir,
    'output',
    "Quantitative_outputs",
    'Quantitative_results_from_dwi_processing_only_in_stroke_affected_slices.csv',
)
df = pd.read_csv(input_file_path, low_memory=False)

# Accumulates one result row per combination that yields a p-value.
results = []

# Distinct values of each grouping column present in the table.
unique_masks, unique_qtypes, unique_groups, unique_dilations = (
    df[column].unique()
    for column in ('mask_name', 'Qtype', 'Group', 'dialation_amount')
)

# Full cross product of the grouping values; each tuple is one unit of work.
combinations = [
    (mask, qtype, group, dilation)
    for mask in unique_masks
    for qtype in unique_qtypes
    for group in unique_groups
    for dilation in unique_dilations
]
- # Function to process each combination
def process_combination(mask, qtype, group, dilation, data=None):
    """Compare 'Value' between merged timepoints 0 and 3 for one combination.

    Rows are selected where 'mask_name', 'Qtype', 'Group' and
    'dialation_amount' equal the given arguments; rows whose 'Value' is NaN
    are ignored. If both timepoints have more than 8 distinct 'subjectID'
    entries, a Shapiro-Wilk test is run on each timepoint's values. When
    either Shapiro p-value is below 0.05 the two samples are compared with a
    two-sided Mann-Whitney U test, otherwise with Welch's t-test.

    Args:
        mask: Value to match in the 'mask_name' column.
        qtype: Value to match in the 'Qtype' column.
        group: Value to match in the 'Group' column.
        dilation: Value to match in the 'dialation_amount' column.
        data: Table to analyse. Defaults to the module-level ``df`` so that
            existing four-argument calls keep working.

    Returns:
        A dict with keys 'mask_name', 'Qtype', 'Group', 'dialation_amount'
        and 'Pvalue', or None when either timepoint has 8 or fewer distinct
        subjects with a non-NaN value.
    """
    min_subjects = 8  # strictly more than this many subjects are required
    alpha = 0.05      # Shapiro-Wilk threshold for rejecting normality

    if data is None:
        data = df

    # Build the combination filter once; both timepoints share it.
    matches_combination = (
        (data['Group'] == group)
        & (data['mask_name'] == mask)
        & (data['Qtype'] == qtype)
        & (data['dialation_amount'] == dilation)
    )
    usable = data[matches_combination & data['Value'].notna()]

    rows_t0 = usable[usable['merged_timepoint'] == 0]
    rows_t3 = usable[usable['merged_timepoint'] == 3]

    # BOTH timepoints must have enough distinct subjects. dropna=False keeps
    # the count identical to len(unique()), which also counts a NaN subjectID.
    enough_subjects = (
        rows_t0['subjectID'].nunique(dropna=False) > min_subjects
        and rows_t3['subjectID'].nunique(dropna=False) > min_subjects
    )
    if not enough_subjects:
        return None

    values_t0 = rows_t0['Value']
    values_t3 = rows_t3['Value']

    # Normality check decides between the rank-based and the parametric test.
    shapiro_p_t0 = shapiro(values_t0)[1]
    shapiro_p_t3 = shapiro(values_t3)[1]

    if shapiro_p_t0 < alpha or shapiro_p_t3 < alpha:
        # At least one sample deviates from normality: Mann-Whitney U.
        _, p_value = mannwhitneyu(values_t0, values_t3, alternative='two-sided')
    else:
        # Both samples look normal: Welch's t-test (unequal variances).
        _, p_value = ttest_ind(values_t0, values_t3, equal_var=False)

    return {
        'mask_name': mask,
        'Qtype': qtype,
        'Group': group,
        'dialation_amount': dilation,
        'Pvalue': p_value,
    }
# Run every combination through a pool of 6 worker threads.
with ThreadPoolExecutor(max_workers=6) as executor:
    # Map each future back to its combination so failures can be reported.
    futures = {
        executor.submit(process_combination, *combo): combo
        for combo in combinations
    }
    # Advance the progress bar as tasks finish, in completion order.
    with tqdm(total=len(futures), desc="Processing combinations in parallel") as pbar:
        for future in as_completed(futures):
            combination = futures[future]
            try:
                result = future.result()
            except Exception as e:
                # Report the failing combination and keep processing the rest.
                print(f"Error processing combination {combination}: {e}")
            else:
                # None means the combination did not qualify for testing.
                if result is not None:
                    results.append(result)
            finally:
                pbar.update(1)

# Fix the column set explicitly so that a run in which no combination
# qualifies still produces a CSV with a header row instead of an empty file.
results_df = pd.DataFrame(
    results,
    columns=['mask_name', 'Qtype', 'Group', 'dialation_amount', 'Pvalue'],
)

# The results are written next to the input table.
output_file_path = os.path.join(
    parent_dir,
    'output',
    "Quantitative_outputs",
    'Significance_timepoint_0_vs_3_only_in_stroke_slices.csv',
)
results_df.to_csv(output_file_path, index=False)
print(f"Significance analysis results saved to {output_file_path}")
|