Lucas Backes 3 weeks ago
parent
commit
b8a507aea3

+ 11 - 0
brainage/__init__.py

@@ -0,0 +1,11 @@
+from .calculate_features import calculate_voxelwise_features, calculate_parcelwise_features
+from .create_splits import stratified_splits
+from .xgboost_adapted import XGBoostAdapted
+from .zscore import ZScoreSubwise, ZScore
+from .create_splits import repeated_stratified_splits
+from .read_data import read_data_cross_site
+from .read_data import read_data
+from .define_models import define_models
+from sklearn.linear_model import LinearRegression
+from .performance_metric import performance_metric
+

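For reference, a minimal sketch of the names this __init__.py re-exports at the package level (illustrative only, assuming the package is importable as brainage):

    from brainage import (
        calculate_voxelwise_features, calculate_parcelwise_features,
        stratified_splits, repeated_stratified_splits,
        read_data, read_data_cross_site, define_models,
        performance_metric, XGBoostAdapted, ZScore, ZScoreSubwise,
    )
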
BIN
brainage/__pycache__/__init__.cpython-311.pyc


BIN
brainage/__pycache__/__init__.cpython-312.pyc


BIN
brainage/__pycache__/__init__.cpython-39.pyc


BIN
brainage/__pycache__/calculate_features.cpython-311.pyc


BIN
brainage/__pycache__/calculate_features.cpython-312.pyc


BIN
brainage/__pycache__/calculate_features.cpython-39.pyc


BIN
brainage/__pycache__/create_splits.cpython-311.pyc


BIN
brainage/__pycache__/create_splits.cpython-39.pyc


BIN
brainage/__pycache__/define_models.cpython-311.pyc


BIN
brainage/__pycache__/define_models.cpython-39.pyc


BIN
brainage/__pycache__/performance_metric.cpython-311.pyc


BIN
brainage/__pycache__/performance_metric.cpython-39.pyc


BIN
brainage/__pycache__/read_data.cpython-311.pyc


BIN
brainage/__pycache__/read_data.cpython-39.pyc


BIN
brainage/__pycache__/xgboost_adapted.cpython-311.pyc


BIN
brainage/__pycache__/xgboost_adapted.cpython-39.pyc


BIN
brainage/__pycache__/zscore.cpython-311.pyc


BIN
brainage/__pycache__/zscore.cpython-39.pyc


+ 188 - 0
brainage/calculate_features.py

@@ -0,0 +1,188 @@
+import os.path
+import nilearn
+from nilearn import image
+import numpy as np
+import pandas as pd
+import nibabel as nib
+import nibabel.processing as npr
+
+def subsample_img(img, f):
+    """Reduce resample_to_img features of a 3D array by a given factor f."""
+
+    data = img.get_fdata()
+    mask = np.zeros(img.shape)
+    mask[::f, ::f, ::f] = 1
+    data = data * mask
+    return nib.Nifti1Image(data, img.affine, img.header)
+
+def binarize_3d(img, threshold):
+    """binarize 3D spatial image"""
+    return nib.Nifti1Image(
+        np.where(img.get_fdata() > threshold, 1, 0), img.affine, img.header
+    )
+
+def calculate_voxelwise_features(phenotype_file, mask_file, smooth_fwhm, resample_size):
+    """Calculate voxelwise features for the subjects
+
+    Args:
+        phenotype_file (csv or txt): A csv or text file with the paths to the subject images
+        mask_file (nii): The GM mask file used to extract features
+        smooth_fwhm (int): Smooth images by applying a Gaussian filter with the given FWHM (mm)
+        resample_size (int): Resample images to the given voxel size (mm)
+
+    Returns:
+        data_resampled (dataframe): pandas dataframe of features (N subjects by M features)
+    """    
+
+    phenotype = pd.read_csv(phenotype_file, header=None)
+    
+    # don't need this anymore
+    # filename, file_extension = os.path.splitext(phenotype_file)
+    # if file_extension == ".txt":
+    #     phenotype = pd.read_csv(phenotype_file, header=None)
+    # elif file_extension == ".csv":
+    #     phenotype = pd.read_csv(phenotype_file, sep=",", header=None)
+    # else:
+    #     raise ValueError("Wrong file. Please input either a csv or text file")
+
+    print(phenotype.shape)
+    print(phenotype.head())
+
+#    phenotype = phenotype.iloc[0:15]
+
+    data_resampled = np.array([])  # array to collect resampled features from the subjects' MRIs
+    count = 0
+    for index, row in phenotype.iterrows():  # iterate over each row
+        sub_file = row.values[0]
+
+        if os.path.exists(sub_file):
+            print(f"\n-----Processing subject number {count}------")
+            sub_img = nib.load(sub_file)  # load subject image
+            mask_img = nib.load(mask_file)  # load mask image
+            print("Subject and mask image loaded")
+            print("sub affine original \n", sub_img.affine, sub_img.shape)
+            print("mask affine original \n", mask_img.affine, mask_img.shape)
+
+            print("Perform smoothing")
+            sub_img = image.smooth_img(
+                sub_img, smooth_fwhm
+            )  # smooth the image with the given FWHM
+
+            print("Perform resampling")
+            # trying to match Gaser
+            mask_img_rs = npr.resample_to_output(
+                mask_img, [resample_size] * len(mask_img.shape), order=1
+            )  # resample mask
+            print(
+                "mask affine after resampling\n",
+                mask_img_rs.affine,
+                mask_img_rs.shape,
+            )
+
+            sub_img_rs = image.resample_to_img(
+                sub_img, mask_img_rs, interpolation="linear"
+            )  # resample subject
+            print(
+                "sub affine after resampling\n",
+                sub_img_rs.affine,
+                sub_img_rs.shape,
+            )
+
+            binary_mask_img_rs = binarize_3d(mask_img_rs, 0.5)  # binarize the mask
+            mask_rs = binary_mask_img_rs.get_fdata().astype(bool)
+
+            sub_data_rs = sub_img_rs.get_fdata()[
+                mask_rs
+            ]  # extract voxel using the binarized mask
+            sub_data_rs = sub_data_rs.reshape(1, -1)
+
+            if data_resampled.size == 0:
+                data_resampled = sub_data_rs
+            else:
+                data_resampled = np.concatenate((data_resampled, sub_data_rs), axis=0)
+            count = count + 1
+            print(data_resampled.shape)
+
+    print("\n *** Feature extraction done ***")
+
+    # convert to a dataframe and rename the columns
+    data_resampled = pd.DataFrame(data_resampled)
+    data_resampled.rename(columns=lambda X: "f_" + str(X), inplace=True)
+    print('Feature names:', data_resampled.columns)
+
+    print(f"The size of the feature space is {data_resampled.shape}")
+
+    return data_resampled
+
+
+
+def calculate_parcelwise_features(phenotype_file, mask_dir, num_parcels):
+    """Calculate parcelwise features for the subjects
+
+    Args:
+        phenotype_file (csv or text): A csv or text file with the paths to the subject images
+        mask_dir (nii): The parcellation (atlas) file used to extract parcelwise features
+        num_parcels (int): Number of parcels in the atlas
+    
+    Returns:
+        data_parcels (dataframe): pandas dataframe of features (N subjects by M parcels)
+    """    
+
+    phenotype = pd.read_csv(phenotype_file, header=None)
+
+    # filename, file_extension = os.path.splitext(phenotype_file)
+
+    # if file_extension == '.txt':
+    #     phenotype = pd.read_csv(phenotype_file, header=None)
+    # elif file_extension == '.csv':
+    #     phenotype = pd.read_csv(phenotype_file, sep=',', header=None)
+    # else:
+    #     raise ValueError("Wrong file. Please input either a csv or text file")
+
+    print(phenotype.shape)
+    print(phenotype.head())
+#    phenotype = phenotype.iloc[0:15]
+
+    data_parcels = []  # list to collect parcelwise features from the subjects' MRIs
+    count = 0
+
+    for index, row in phenotype.iterrows(): # iterate over each row
+        sub_file = row.values[0]
+
+        if os.path.exists(sub_file):
+            print(f'\nProcessing subject number {count}')
+            sub_img = nib.load(sub_file)  # load subject image
+            mask_img = nib.load(mask_dir)  # load mask image
+            print('Subject and mask image loaded')
+            print(sub_file, sub_img.affine, mask_img.affine)
+
+            sub_data = sub_img.get_fdata()
+            sub_data[sub_data == 0] = np.nan  # replace zeros with NaN
+            sub_data_parcels = []
+
+            if not np.array_equal(sub_img.affine, mask_img.affine):
+                mask_img = nilearn.image.resample_to_img(mask_img, sub_img, interpolation='linear')
+            else:
+                print("Subject and mask have same affine")
+
+            for num in range(1, int(num_parcels) + 1):
+                itemindex = np.where(mask_img.get_fdata() == num)  # get indices from the mask for a parcel
+                sub_mat = sub_data[itemindex]
+
+                if np.all(np.isnan(sub_mat)):
+                    sub_agg = 0
+                else:
+                    sub_agg = np.nanmean(sub_mat)  # average the voxels in the parcel to get mean GM volume
+                sub_data_parcels.append(sub_agg)
+
+            data_parcels.append(sub_data_parcels)
+            print(len(data_parcels))
+            count = count + 1
+
+    print('\n *** Feature extraction done ***')
+    data_parcels = pd.DataFrame(data_parcels)
+    data_parcels.rename(columns=lambda X: 'f_' + str(X), inplace=True)
+    print(data_parcels.columns)
+
+    print('final dataframe shape', data_parcels.shape)
+    return data_parcels

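A hedged usage sketch of the two feature extractors; the phenotype CSV, GM mask, and atlas paths below are hypothetical placeholders, and the smoothing/resampling values are only examples:

    from brainage import calculate_voxelwise_features, calculate_parcelwise_features

    # phenotype.csv: a single column, no header, each row the path to one subject's GM image
    voxel_df = calculate_voxelwise_features(
        phenotype_file='phenotype.csv',      # placeholder path
        mask_file='gm_mask.nii.gz',          # placeholder GM mask
        smooth_fwhm=4,                       # 4 mm Gaussian smoothing
        resample_size=8,                     # resample to 8 mm isotropic voxels
    )

    parcel_df = calculate_parcelwise_features(
        phenotype_file='phenotype.csv',
        mask_dir='atlas_parcels.nii.gz',     # placeholder parcellation atlas
        num_parcels=100,
    )
    voxel_df.to_pickle('voxelwise_features.pkl')
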
+ 98 - 0
brainage/create_splits.py

@@ -0,0 +1,98 @@
+#!/home/smore/.venvs/py3smore/bin/python3
+import math
+import pandas as pd
+from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
+
+
+# def create_splits(data_df, repeats):
+#     num_bins = math.ceil(len(data_df)/repeats) # calculate number of bins to be created
+#     print('num_bins', num_bins, len(data_df)/repeats)
+#
+#     qc = pd.cut(data_df.index, num_bins)
+#     df = pd.DataFrame({'bin': qc.codes})
+#
+#     max_num = max(df['bin'].value_counts())
+#     print(df['bin'].value_counts())
+#     print(max_num, 'max_num')
+#
+#     test_idx = {}
+#     for rpt_num in range(0, repeats):
+#         key = 'repeat_' + str(rpt_num)
+#         test_idx[key] = []
+#
+#     if repeats == max_num:
+#         for num in range(0, max_num):
+#             for bin_idx in df['bin'].unique():
+#                 test = df[df['bin'] == bin_idx]
+#                 if num < len(test):
+#                     key = 'repeat_' + str(num)
+#                     test_idx[key].append(test.index[num])
+#     return test_idx
+
+
+def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state):
+    """
+    :param bins_on: variable used to create bins
+    :param num_bins: num of bins/classes to create
+    :param data: data to create cv splits on
+    :param num_splits: number of cv splits to create
+    :param shuffle: shuffle the data or not
+    :param random_state: random seed to use if shuffle=True
+    :return: a dictionary mapping fold names ('repeat_0', ...) to test indices
+    """
+    qc = pd.cut(bins_on.tolist(), num_bins)  # divides data in bins
+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, qc.codes):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
+
+def stratified_splits_class(bins_on, data, num_splits, shuffle, random_state):
+    """
+    :param bins_on: variable used to create bins
+    :param data: data to create cv splits on
+    :param num_splits: number of cv splits to create
+    :param shuffle: shuffle the data or not
+    :param random_state: random seed to use if shuffle=True
+    :return: a dictionary mapping fold names ('repeat_0', ...) to test indices
+    """
+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, bins_on):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
+
+# def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): # useful for run_cross_validation()
+#     """
+#     :param bins_on: variable used to create bins
+#     :param num_bins: num of bins/classes to create
+#     :param data: data to create cv splits on
+#     :param num_splits: number of cv splits to create
+#     :param shuffle: shuffle the data or not
+#     :param random_state: random seed to use if shuffle=True
+#     :return: cv iterator
+#     """
+#     qc = pd.cut(bins_on.tolist(), num_bins)
+#     cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state).split(data, qc.codes)
+#     return cv
+
+
+def repeated_stratified_splits(bins_on, num_bins, data, num_splits, num_repeats, random_state):
+    """Same as stratified_splits, but using RepeatedStratifiedKFold (num_splits folds x num_repeats repeats)."""
+    qc = pd.cut(bins_on.tolist(), num_bins)
+    cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, qc.codes):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+

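A small self-contained sketch of how the split helpers can be called (synthetic ages, not project data):

    import numpy as np
    import pandas as pd
    from brainage import stratified_splits, repeated_stratified_splits

    rng = np.random.default_rng(0)
    data = pd.DataFrame({'age': rng.integers(18, 90, size=200)})

    # 5-fold CV stratified over 4 age bins; keys are 'repeat_0' ... 'repeat_4'
    test_idx = stratified_splits(bins_on=data['age'], num_bins=4, data=data,
                                 num_splits=5, shuffle=True, random_state=0)

    # 5 folds x 2 repeats; keys run 'repeat_0' ... 'repeat_9'
    rep_idx = repeated_stratified_splits(bins_on=data['age'], num_bins=4, data=data,
                                         num_splits=5, num_repeats=2, random_state=0)
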
+ 53 - 0
brainage/define_models.py

@@ -0,0 +1,53 @@
+import xgboost as xgb
+from skrvm import RVR
+from glmnet import ElasticNet
+import sklearn.gaussian_process as gp
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.decomposition import PCA
+from brainage import XGBoostAdapted
+from sklearn.feature_selection import VarianceThreshold
+    
+def define_models(rand_seed=None, var_threshold=0.0):
+    """Define all models and their parameter grids.
+
+    rand_seed and var_threshold feed the random_state and variancethreshold__threshold
+    entries of every parameter grid below; the defaults here are only placeholders.
+    """
+    rvr_linear = RVR()
+    rvr_poly = RVR()
+    kernel_ridge = KernelRidge()
+    lasso = ElasticNet(alpha=1, standardize=False)
+    elasticnet = ElasticNet(alpha=0.5, standardize=False)
+    ridge = ElasticNet(alpha=0, standardize=False)
+    xgb_model = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2)
+    pca = PCA(n_components=None)  # keeps at most min(n_samples, n_features) components
+
+
+    model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb_model]
+    model_para_list = [
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse',
+                    'rf__max_features': 0.33, 'rf__min_samples_leaf': 5,
+                    'rf__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear',
+                    'rvr__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold,
+                    'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0],
+                    'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5},
+
+                    {'variancethreshold__threshold': var_threshold,
+                    'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100,
+                    'gauss__normalize_y': True, 'gauss__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1,
+                    'rvr__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1,
+                    'xgboostadapted__max_depth': [1, 2, 3, 6, 8, 10, 12], 'xgboostadapted__n_estimators': 100,
+                    'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2],
+                    'xgboostadapted__random_seed': rand_seed, 'cv': 5}]  # 'search_params':{'n_jobs': 5}]
+                    
+    return model_list, model_para_list

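The parameter grids use pipeline-style double-underscore keys, which suggests the two lists are zipped and handed to a cross-validation runner elsewhere; a minimal sketch of that pairing, under that assumption (the seed and threshold values here are hypothetical):

    from brainage import define_models

    model_list, model_para_list = define_models(rand_seed=42, var_threshold=1e-5)

    # Each model is paired with its parameter dictionary by position.
    for model, params in zip(model_list, model_para_list):
        name = model if isinstance(model, str) else type(model).__name__
        print(name, '->', sorted(params.keys()))
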
+ 8 - 0
brainage/performance_metric.py

@@ -0,0 +1,8 @@
+from sklearn.metrics import mean_absolute_error, mean_squared_error
+import numpy as np
+
+def performance_metric(y_true, y_pred):
+    mae = round(mean_absolute_error(y_true, y_pred), 3)
+    mse = round(mean_squared_error(y_true, y_pred), 3)
+    corr = round(np.corrcoef(y_pred, y_true)[1, 0], 3)
+    return mae, mse, corr

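A quick sketch of the metric helper on toy numbers (the ages are made up):

    import numpy as np
    from brainage import performance_metric

    y_true = np.array([25, 40, 63, 71])
    y_pred = np.array([28, 38, 60, 75])
    mae, mse, corr = performance_metric(y_true, y_pred)
    print(mae, mse, corr)  # 3.0 9.5 0.986 (MAE and MSE in years, Pearson correlation)
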
+ 46 - 0
brainage/read_data.py

@@ -0,0 +1,46 @@
+import pickle
+import pandas as pd
+
+def read_data_cross_site(data_file, train_status, confounds):
+    """Read a pickled feature dataframe and prepare it for cross-site training or testing."""
+
+    data_df = pickle.load(open(data_file, 'rb'))
+    X = [col for col in data_df if col.startswith('f_')]
+    y = 'age'
+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
+
+    if confounds is not None:  # convert sites in numbers to perform confound removal
+        if train_status == 'train':
+            site_name = data_df['site'].unique()
+            if type(site_name[0]) == str:
+                site_dict = {k: idx for idx, k in enumerate(site_name)}
+                data_df['site'] = data_df['site'].replace(site_dict)
+
+        elif train_status == 'test':  # add site to the features & encode site as a number so a model trained with confound removal can predict
+            X.append(confounds)
+            site_name = data_df['site'].unique()[0]
+            if type(site_name) == str:
+                data_df['site'] = 10
+    return data_df, X, y
+    
+    
+    
+def read_data(features_file, demographics_file):
+    """Read pickled features and a demographics CSV and merge them into a single dataframe."""
+    data_df = pickle.load(open(features_file, 'rb')) # read the data
+    demo = pd.read_csv(demographics_file)     # read demographics file
+    data_df = pd.concat([demo[['site', 'subject', 'age', 'gender']], data_df], axis=1) # merge them
+
+    print('Data columns:', data_df.columns)
+    print('Data Index:', data_df.index)
+
+    X = [col for col in data_df if col.startswith('f_')]
+    y = 'age'
+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
+    data_df.sort_values(by='age', inplace=True, ignore_index=True)  # sort by age
+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
+    return data_df, X, y
+

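A hedged usage sketch of the readers; the pickle and CSV names are placeholders, and the expected columns are inferred from the code above:

    from brainage import read_data, read_data_cross_site

    # features_file: a pickled dataframe whose feature columns start with 'f_';
    # demographics_file: a CSV with 'site', 'subject', 'age' and 'gender' columns.
    data_df, X, y = read_data('voxelwise_features.pkl', 'demographics.csv')
    print(len(X), 'feature columns, target column:', y)

    # Cross-site variant: for 'train', site names are encoded as integers for confound removal.
    train_df, X_train, y_train = read_data_cross_site('train_data.pkl', 'train', confounds='site')
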
+ 46 - 0
brainage/xgboost_adapted.py

@@ -0,0 +1,46 @@
+from xgboost import XGBRegressor
+from sklearn.base import BaseEstimator
+from sklearn.model_selection import train_test_split
+import numpy as np
+
+class XGBoostAdapted(BaseEstimator):
+    """XGBRegressor wrapper that holds out part of the training data as an early-stopping eval set.
+
+    fit/score/predict expect pandas inputs, since they call .values on X and y.
+    """
+
+    def __init__(self, early_stopping_rounds=10, eval_metric=None, eval_set_percent=0.2, random_seed=None, n_jobs=1, max_depth=6, n_estimators=50, nthread=1, reg_alpha=0):
+        self.early_stopping_rounds = early_stopping_rounds
+        self.eval_metric = eval_metric
+        self.eval_set_percent = eval_set_percent
+        self.random_seed = random_seed
+        self.n_jobs = n_jobs
+        self.max_depth = max_depth
+        self.n_estimators = n_estimators
+        self.nthread = nthread
+        self.reg_alpha = reg_alpha
+
+            
+    def fit(self, X, y):
+        self._xgbregressor = XGBRegressor(n_jobs=self.n_jobs, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, reg_alpha=self.reg_alpha)
+
+        X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=self.eval_set_percent, random_state=self.random_seed)
+
+        eval_set = [(X_test, y_test)]
+
+        self._xgbregressor.fit(X_train, y_train, early_stopping_rounds=self.early_stopping_rounds, eval_metric=self.eval_metric, eval_set=eval_set)
+        
+        return self
+
+    def score(self, X, y, sample_weight=None):
+        return self._xgbregressor.score(X.values, y.values, sample_weight)
+
+    def predict(self, X):
+        return self._xgbregressor.predict(X.values)
+
+
+
+
+        
+        
+
+
+
+
+

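A usage sketch on synthetic data. Note that fit(), score() and predict() call .values on their inputs, so pandas objects are expected, and passing early_stopping_rounds/eval_metric to XGBRegressor.fit matches pre-2.0 xgboost APIs:

    import numpy as np
    import pandas as pd
    from brainage import XGBoostAdapted

    rng = np.random.default_rng(0)
    X = pd.DataFrame(rng.normal(size=(200, 10)), columns=[f'f_{i}' for i in range(10)])
    y = pd.Series(rng.integers(18, 90, size=200), name='age')

    model = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae',
                           eval_set_percent=0.2, random_seed=0)
    model.fit(X, y)       # internally holds out 20% of the rows as the eval set
    preds = model.predict(X)
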
+ 49 - 0
brainage/zscore.py

@@ -0,0 +1,49 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils import check_array
+from scipy.stats import zscore
+
+
+class ZScore(BaseEstimator, TransformerMixin):
+
+    def __init__(self, axis=0):
+        self.axis = axis
+
+    def fit(self, X, y=None):
+        X = check_array(X)
+        self.mean_ = np.mean(X, axis=self.axis)
+        self.std_ = np.std(X, axis=self.axis)
+        return self
+
+    def transform(self, X):
+        X = check_array(X)
+        mean = (
+            self.mean_.reshape(-1, 1)
+            if self.axis
+            else self.mean_
+        )
+
+        std = (
+            self.std_.reshape(-1, 1)
+            if self.axis
+            else self.std_
+        )
+        # print(f"{X.shape = }")
+        # print(f"{mean.shape = }")
+        # print(f"{std.shape = }")
+
+        return (X - mean) / std
+
+
+class ZScoreSubwise(BaseEstimator, TransformerMixin):
+
+    def __init__(self, axis=0):
+        self.axis = axis
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = check_array(X)
+        return zscore(X, axis=self.axis)
+
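A short sketch contrasting the two transformers on a toy matrix (values are arbitrary):

    import numpy as np
    from brainage import ZScore, ZScoreSubwise

    X = np.array([[1.0, 2.0, 3.0],
                  [4.0, 5.0, 6.0],
                  [7.0, 8.0, 9.0]])

    # Feature-wise z-scoring: mean/std learned per column on fit and reused on transform.
    Xz = ZScore(axis=0).fit(X).transform(X)

    # Subject-wise z-scoring: each row is standardized independently at transform time.
    Xz_sub = ZScoreSubwise(axis=1).fit(X).transform(X)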