3 weeks ago · b8a507aea3
--- a/brainage/__init__.py
+++ b/brainage/__init__.py
@@ -0,0 +1,11 @@
 
																+from .calculate_features import calculate_voxelwise_features, calculate_parcelwise_features
															
 
																+from .create_splits import stratified_splits
															
 
																+from .xgboost_adapted import XGBoostAdapted
															
 
																+from .zscore import ZScoreSubwise, ZScore
															
 
																+from .create_splits import repeated_stratified_splits
															
 
																+from .read_data import read_data_cross_site
															
 
																+from .read_data import read_data
															
 
																+from .define_models import define_models
															
 
																+from sklearn.linear_model import LinearRegression
															
 
																+from .performance_metric import performance_metric
															
 
																+
															
--- a/brainage/__pycache__/__init__.cpython-311.pyc
+++ b/brainage/__pycache__/__init__.cpython-311.pyc
--- a/brainage/__pycache__/__init__.cpython-312.pyc
+++ b/brainage/__pycache__/__init__.cpython-312.pyc
--- a/brainage/__pycache__/__init__.cpython-39.pyc
+++ b/brainage/__pycache__/__init__.cpython-39.pyc
--- a/brainage/__pycache__/calculate_features.cpython-311.pyc
+++ b/brainage/__pycache__/calculate_features.cpython-311.pyc
--- a/brainage/__pycache__/calculate_features.cpython-312.pyc
+++ b/brainage/__pycache__/calculate_features.cpython-312.pyc
--- a/brainage/__pycache__/calculate_features.cpython-39.pyc
+++ b/brainage/__pycache__/calculate_features.cpython-39.pyc
--- a/brainage/__pycache__/create_splits.cpython-311.pyc
+++ b/brainage/__pycache__/create_splits.cpython-311.pyc
--- a/brainage/__pycache__/create_splits.cpython-39.pyc
+++ b/brainage/__pycache__/create_splits.cpython-39.pyc
--- a/brainage/__pycache__/define_models.cpython-311.pyc
+++ b/brainage/__pycache__/define_models.cpython-311.pyc
--- a/brainage/__pycache__/define_models.cpython-39.pyc
+++ b/brainage/__pycache__/define_models.cpython-39.pyc
--- a/brainage/__pycache__/performance_metric.cpython-311.pyc
+++ b/brainage/__pycache__/performance_metric.cpython-311.pyc
--- a/brainage/__pycache__/performance_metric.cpython-39.pyc
+++ b/brainage/__pycache__/performance_metric.cpython-39.pyc
--- a/brainage/__pycache__/read_data.cpython-311.pyc
+++ b/brainage/__pycache__/read_data.cpython-311.pyc
--- a/brainage/__pycache__/read_data.cpython-39.pyc
+++ b/brainage/__pycache__/read_data.cpython-39.pyc
--- a/brainage/__pycache__/xgboost_adapted.cpython-311.pyc
+++ b/brainage/__pycache__/xgboost_adapted.cpython-311.pyc
--- a/brainage/__pycache__/xgboost_adapted.cpython-39.pyc
+++ b/brainage/__pycache__/xgboost_adapted.cpython-39.pyc
--- a/brainage/__pycache__/zscore.cpython-311.pyc
+++ b/brainage/__pycache__/zscore.cpython-311.pyc
--- a/brainage/__pycache__/zscore.cpython-39.pyc
+++ b/brainage/__pycache__/zscore.cpython-39.pyc
--- a/brainage/calculate_features.py
+++ b/brainage/calculate_features.py
@@ -0,0 +1,188 @@
 
																+import os.path
															
 
																+import nilearn
															
 
																+from nilearn import image
															
 
																+import numpy as np
															
 
																+import pandas as pd
															
 
																+import nibabel as nib
															
 
																+import nibabel.processing as npr
															
 
																+
															
 
																+def subsample_img(img, f):
															
 
																+    """Reduce resample_to_img features of a 3D array by a given factor f."""
															
 
																+
															
 
																+    data = img.get_fdata()
															
 
																+    mask = np.zeros(img.shape)
															
 
																+    mask[::f, ::f, ::f] = 1
															
 
																+    data = data * mask
															
 
																+    return nib.Nifti1Image(data, img.affine, img.header)
															
 
																+
															
 
																+def binarize_3d(img, threshold):
															
 
																+    """binarize 3D spatial image"""
															
 
																+    return nib.Nifti1Image(
															
 
																+        np.where(img.get_fdata() > threshold, 1, 0), img.affine, img.header
															
 
																+    )
															
 
																+
															
 
																+def calculate_voxelwise_features(phenotype_file, mask_file, smooth_fwhm, resample_size):
															
 
																+    """Calculate voxelwise features for the subjects
															
 
																+
															
 
																+    Args:
															
 
																+        phenotype_file (csv or txt): A csv or text file with path to subject images
															
 
																+        mask_file (nii): The GM mask file to be used to extract features
															
 
																+        smooth_fwhm (int): Smooth images by applying a Gaussian filter by given FWHM (mm)
															
 
																+        resample_size (int): Resample image to given voxel size
															
 
																+
															
 
																+    Returns:
															
 
																+        data_resampled (dataframe): pandas dataframe of features (N subjects by M features)
															
 
																+    """    
															
 
																+
															
 
																+    phenotype = pd.read_csv(phenotype_file, header=None)
															
 
																+    
															
 
																+    # don't need this anymore
															
 
																+    # filename, file_extension = os.path.splitext(phenotype_file)
															
 
																+    # if file_extension == ".txt":
															
 
																+    #     phenotype = pd.read_csv(phenotype_file, header=None)
															
 
																+    # elif file_extension == ".csv":
															
 
																+    #     phenotype = pd.read_csv(phenotype_file, sep=",", header=None)
															
 
																+    # else:
															
 
																+    #     raise ValueError("Wrong file. Please imput either a csv or text file")
															
 
																+
															
 
																+    print(phenotype.shape)
															
 
																+    print(phenotype.head())
															
 
																+
															
 
																+#    phenotype = phenotype.iloc[0:15]
															
 
																+
															
 
																+    data_resampled = np.array([])  # array to save resampled features from subjects mri
															
 
																+    count = 0
															
 
																+    for index, row in phenotype.iterrows():  # iterate over each row
															
 
																+        sub_file = row.values[0]
															
 
																+
															
 
																+        if os.path.exists(sub_file):
															
 
																+            print(f"\n-----Processing subject number {count}------")
															
 
																+            sub_img = nib.load(sub_file)  # load subject image
															
 
																+            mask_img = nib.load(mask_file)  # load mask image
															
 
																+            print("Subject and mask image loaded")
															
 
																+            print("sub affine original \n", sub_img.affine, sub_img.shape)
															
 
																+            print("mask affine original \n", mask_img.affine, mask_img.shape)
															
 
																+
															
 
																+            print("Perform smoothing")
															
 
																+            sub_img = image.smooth_img(
															
 
																+                sub_img, smooth_fwhm
															
 
																+            )  # smooth the image with 4 mm FWHM
															
 
																+
															
 
																+            print("Perform resampling")
															
 
																+            # trying to match Gaser
															
 
																+            mask_img_rs = npr.resample_to_output(
															
 
																+                mask_img, [resample_size] * len(mask_img.shape), order=1
															
 
																+            )  # resample mask
															
 
																+            print(
															
 
																+                "mask affine after resampling\n",
															
 
																+                mask_img_rs.affine,
															
 
																+                mask_img_rs.shape,
															
 
																+            )
															
 
																+
															
 
																+            sub_img_rs = image.resample_to_img(
															
 
																+                sub_img, mask_img_rs, interpolation="linear"
															
 
																+            )  # resample subject
															
 
																+            print(
															
 
																+                "sub affine after resampling\n",
															
 
																+                sub_img_rs.affine,
															
 
																+                sub_img_rs.shape,
															
 
																+            )
															
 
																+
															
 
																+            binary_mask_img_rs = binarize_3d(mask_img_rs, 0.5)  # binarize the mask
															
 
																+            mask_rs = binary_mask_img_rs.get_fdata().astype(bool)
															
 
																+
															
 
																+            sub_data_rs = sub_img_rs.get_fdata()[
															
 
																+                mask_rs
															
 
																+            ]  # extract voxel using the binarized mask
															
 
																+            sub_data_rs = sub_data_rs.reshape(1, -1)
															
 
																+
															
 
																+            if data_resampled.size == 0:
															
 
																+                data_resampled = sub_data_rs
															
 
																+            else:
															
 
																+                data_resampled = np.concatenate((data_resampled, sub_data_rs), axis=0)
															
 
																+            count = count + 1
															
 
																+            print(data_resampled.shape)
															
 
																+
															
 
																+    print("\n *** Feature extraction done ***")
															
 
																+
															
 
																+    # renaming the columns and convering to dataframe
															
 
																+    data_resampled = pd.DataFrame(data_resampled)
															
 
																+    data_resampled.rename(columns=lambda X: "f_" + str(X), inplace=True)
															
 
																+    print('Feature names:', data_resampled.columns)
															
 
																+
															
 
																+    print(f"The size of the feature space is {data_resampled.shape}")
															
 
																+
															
 
																+    return data_resampled
															
 
																+
															
 
																+
															
 
																+
															
 
																+def calculate_parcelwise_features(phenotype_file, mask_dir, num_parcels):
															
 
																+    """Calculate parcelwise features for the subjects
															
 
																+
															
 
																+    Args:
															
 
																+        phenotype_file (csv or text): A csv or text file with path to subject images
															
 
																+        mask_dir (_type_): The GM mask file to be used to extract features
															
 
																+        num_parcels (_type_): Number of parcels
															
 
																+    
															
 
																+    Returns:
															
 
																+        data_parcels (dataframe): pandas dataframe of features (N subjects by M parcels)
															
 
																+    """    
															
 
																+
															
 
																+    phenotype = pd.read_csv(phenotype_file, header=None)
															
 
																+
															
 
																+    # filename, file_extension = os.path.splitext(phenotype_file)
															
 
																+
															
 
																+    # if file_extension == '.txt':
															
 
																+    #     phenotype = pd.read_csv(phenotype_file, header=None)
															
 
																+    # elif file_extension == '.csv':
															
 
																+    #     phenotype = pd.read_csv(phenotype_file, sep=',', header=None)
															
 
																+    # else:
															
 
																+    #     raise ValueError("Wrong file. Please imput either a csv or text file")
															
 
																+
															
 
																+    print(phenotype.shape)
															
 
																+    print(phenotype.head())
															
 
																+#    phenotype = phenotype.iloc[0:15]
															
 
																+
															
 
																+    data_parcels = [] #np.array([])  # array to save resampled features from subjects mri
															
 
																+    count = 0
															
 
																+
															
 
																+    for index, row in phenotype.iterrows(): # iterate over each row
															
 
																+        sub_file = row.values[0]
															
 
																+
															
 
																+        if os.path.exists(sub_file):
															
 
																+            print(f'\nProcessing subject number {count}')
															
 
																+            sub_img = nib.load(sub_file)  # load subject image
															
 
																+            mask_img = nib.load(mask_dir)  # load mask image
															
 
																+            print ('Subject and mask image loaded')
															
 
																+            print(sub_file, sub_img.affine, mask_img.affine)
															
 
																+
															
 
																+            sub_data = sub_img.get_fdata()
															
 
																+            sub_data[sub_data == 0] = np.nan # replace zeros with Nan
															
 
																+            sub_data_parcels = []
															
 
																+
															
 
																+            if not np.array_equal(sub_img.affine, mask_img.affine):
															
 
																+                mask_img = nilearn.image.resample_to_img(mask_img, sub_img, interpolation='linear')
															
 
																+            else:
															
 
																+                print("Subject and mask have same affine")
															
 
																+
															
 
																+            for num in range(1, int(num_parcels) + 1):
															
 
																+                itemindex = np.where(mask_img.get_fdata() == num)  # get indices from the mask for a parcel
															
 
																+                sub_mat = sub_data[itemindex]
															
 
																+
															
 
																+                if np.all(np.isnan(sub_mat)):
															
 
																+                    sub_agg = 0
															
 
																+                else:
															
 
																+                    sub_agg = np.nanmean(sub_mat) # mean the data from the indices to get GM volume
															
 
																+                sub_data_parcels.append(sub_agg)
															
 
																+
															
 
																+            data_parcels.append(sub_data_parcels)
															
 
																+            print(len(data_parcels))
															
 
																+            count = count + 1
															
 
																+
															
 
																+    print('\n *** Feature extraction done ***')
															
 
																+    data_parcels = pd.DataFrame(data_parcels)
															
 
																+    data_parcels.rename(columns=lambda X :'f_' + str(X), inplace=True)
															
 
																+    print(data_parcels.columns)
															
 
																+
															
 
																+    print('final dataframe shape', data_parcels.shape)
															
 
																+    return data_parcels
															
--- a/brainage/create_splits.py
+++ b/brainage/create_splits.py
@@ -0,0 +1,98 @@
 
																+#!/home/smore/.venvs/py3smore/bin/python3
															
 
																+import math
															
 
																+import pandas as pd
															
 
																+from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
															
 
																+
															
 
																+
															
 
																+# def create_splits(data_df, repeats):
															
 
																+#     num_bins = math.ceil(len(data_df)/repeats) # calculate number of bins to be created
															
 
																+#     print('num_bins', num_bins, len(data_df)/repeats)
															
 
																+#
															
 
																+#     qc = pd.cut(data_df.index, num_bins)
															
 
																+#     df = pd.DataFrame({'bin': qc.codes})
															
 
																+#
															
 
																+#     max_num = max(df['bin'].value_counts())
															
 
																+#     print(df['bin'].value_counts())
															
 
																+#     print(max_num, 'max_num')
															
 
																+#
															
 
																+#     test_idx = {}
															
 
																+#     for rpt_num in range(0, repeats):
															
 
																+#         key = 'repeat_' + str(rpt_num)
															
 
																+#         test_idx[key] = []
															
 
																+#
															
 
																+#     if repeats == max_num:
															
 
																+#         for num in range(0, max_num):
															
 
																+#             for bin_idx in df['bin'].unique():
															
 
																+#                 test = df[df['bin'] == bin_idx]
															
 
																+#                 if num < len(test):
															
 
																+#                     key = 'repeat_' + str(num)
															
 
																+#                     test_idx[key].append(test.index[num])
															
 
																+#     return test_idx
															
 
																+
															
 
																+
															
 
																+def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state):
															
 
																+    """
															
 
																+    :param bins_on: variable used to create bins
															
 
																+    :param num_bins: num of bins/classes to create
															
 
																+    :param data: data to create cv splits on
															
 
																+    :param num_splits: number of cv splits to create
															
 
																+    :param shuffle: shuffle the data or not
															
 
																+    :param random_state: random seed to use if shuffle=True
															
 
																+    :return: a dictionary with index
															
 
																+    """
															
 
																+    qc = pd.cut(bins_on.tolist(), num_bins)  # divides data in bins
															
 
																+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
															
 
																+    test_idx = {}
															
 
																+    rpt_num = 0
															
 
																+    for train_index, test_index in cv.split(data, qc.codes):
															
 
																+        key = 'repeat_' + str(rpt_num)
															
 
																+        test_idx[key] = test_index
															
 
																+        rpt_num = rpt_num + 1
															
 
																+    return test_idx
															
 
																+
															
 
																+
															
 
																+def stratified_splits_class(bins_on, data, num_splits, shuffle, random_state):
															
 
																+    """
															
 
																+    :param bins_on: variable used to create bins
															
 
																+    :param data: data to create cv splits on
															
 
																+    :param num_splits: number of cv splits to create
															
 
																+    :param shuffle: shuffle the data or not
															
 
																+    :param random_state: random seed to use if shuffle=True
															
 
																+    :return: a dictionary with index
															
 
																+    """
															
 
																+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
															
 
																+    test_idx = {}
															
 
																+    rpt_num = 0
															
 
																+    for train_index, test_index in cv.split(data, bins_on):
															
 
																+        key = 'repeat_' + str(rpt_num)
															
 
																+        test_idx[key] = test_index
															
 
																+        rpt_num = rpt_num + 1
															
 
																+    return test_idx
															
 
																+
															
 
																+
															
 
																+# def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): # useful for run_cross_validation()
															
 
																+#     """
															
 
																+#     :param bins_on: variable used to create bins
															
 
																+#     :param num_bins: num of bins/classes to create
															
 
																+#     :param data: data to create cv splits on
															
 
																+#     :param num_splits: number of cv splits to create
															
 
																+#     :param shuffle: shuffle the data or not
															
 
																+#     :param random_state: random seed to use if shuffle=True
															
 
																+#     :return: cv iterator
															
 
																+#     """
															
 
																+#     qc = pd.cut(bins_on.tolist(), num_bins)
															
 
																+#     cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state).split(data, qc.codes)
															
 
																+#     return cv
															
 
																+
															
 
																+
															
 
																+def repeated_stratified_splits(bins_on, num_bins, data, num_splits, num_repeats, random_state):
															
 
																+    qc = pd.cut(bins_on.tolist(), num_bins)
															
 
																+    cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state)
															
 
																+    test_idx = {}
															
 
																+    rpt_num = 0
															
 
																+    for train_index, test_index in cv.split(data, qc.codes):
															
 
																+        key = 'repeat_' + str(rpt_num)
															
 
																+        test_idx[key] = test_index
															
 
																+        rpt_num = rpt_num + 1
															
 
																+    return test_idx
															
 
																+
															
--- a/brainage/define_models.py
+++ b/brainage/define_models.py
@@ -0,0 +1,53 @@
 
																+import xgboost as xgb
															
 
																+from skrvm import RVR
															
 
																+from glmnet import ElasticNet
															
 
																+import sklearn.gaussian_process as gp
															
 
																+from sklearn.kernel_ridge import KernelRidge
															
 
																+from sklearn.decomposition import PCA
															
 
																+from brainage import XGBoostAdapted
															
 
																+from sklearn.feature_selection import VarianceThreshold
															
 
																+    
															
 
																+def define_models():
															
 
																+    # Define all models and model parameters
															
 
																+    rvr_linear = RVR()
															
 
																+    rvr_poly = RVR()
															
 
																+    kernel_ridge = KernelRidge()
															
 
																+    lasso = ElasticNet(alpha=1, standardize=False)
															
 
																+    elasticnet = ElasticNet(alpha=0.5, standardize=False)
															
 
																+    ridge = ElasticNet(alpha=0, standardize=False)
															
 
																+    xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2)
															
 
																+    pca = PCA(n_components=None)  # max as many components as sample size
															
 
																+
															
 
																+
															
 
																+    model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb]
															
 
																+    model_para_list = [
															
 
																+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
															
 
																+
															
 
																+                    {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse',
															
 
																+                    'rf__max_features': 0.33, 'rf__min_samples_leaf': 5,
															
 
																+                    'rf__random_state': rand_seed},
															
 
																+
															
 
																+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear',
															
 
																+                    'rvr__random_state': rand_seed},
															
 
																+
															
 
																+                    {'variancethreshold__threshold': var_threshold,
															
 
																+                    'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0],
															
 
																+                    'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5},
															
 
																+
															
 
																+                    {'variancethreshold__threshold': var_threshold,
															
 
																+                    'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100,
															
 
																+                    'gauss__normalize_y': True, 'gauss__random_state': rand_seed},
															
 
																+
															
 
																+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
															
 
																+
															
 
																+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
															
 
																+
															
 
																+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1,
															
 
																+                    'rvr__random_state': rand_seed},
															
 
																+
															
 
																+                    {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1,
															
 
																+                    'xgboostadapted__max_depth': [1, 2, 3, 6, 8, 10, 12], 'xgboostadapted__n_estimators': 100,
															
 
																+                    'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2],
															
 
																+                    'xgboostadapted__random_seed': rand_seed, 'cv': 5}]  # 'search_params':{'n_jobs': 5}]
															
 
																+                    
															
 
																+    return model_list, model_para_list
															
--- a/brainage/performance_metric.py
+++ b/brainage/performance_metric.py
@@ -0,0 +1,8 @@
 
																+from sklearn.metrics import mean_absolute_error, mean_squared_error
															
 
																+import numpy as np
															
 
																+
															
 
																+def performance_metric(y_true, y_pred):
															
 
																+    mae = round(mean_absolute_error(y_true, y_pred), 3)
															
 
																+    mse = round(mean_squared_error(y_true, y_pred), 3)
															
 
																+    corr = round(np.corrcoef(y_pred, y_true)[1, 0], 3)
															
 
																+    return mae, mse, corr
															
--- a/brainage/read_data.py
+++ b/brainage/read_data.py
@@ -0,0 +1,46 @@
 
																+import pickle
															
 
																+import pandas as pd
															
 
																+
															
 
																+def read_data_cross_site(data_file, train_status, confounds):
															
 
																+    
															
 
																+    data_df = pickle.load(open(data_file, 'rb'))
															
 
																+    X = [col for col in data_df if col.startswith('f_')]
															
 
																+    y = 'age'
															
 
																+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
															
 
																+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
															
 
																+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
															
 
																+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
															
 
																+
															
 
																+    if confounds is not None:  # convert sites in numbers to perform confound removal
															
 
																+        if train_status == 'train':
															
 
																+            site_name = data_df['site'].unique()
															
 
																+            if type(site_name[0]) == str:
															
 
																+                site_dict = {k: idx for idx, k in enumerate(site_name)}
															
 
																+                data_df['site'] = data_df['site'].replace(site_dict)
															
 
																+
															
 
																+        elif train_status == 'test': # add site to features & convert site in a number to predict with model trained with  confound removal
															
 
																+            X.append(confounds)
															
 
																+            site_name = data_df['site'].unique()[0,]
															
 
																+            if type(site_name) == str:
															
 
																+                data_df['site'] = 10
															
 
																+    return data_df, X, y
															
 
																+    
															
 
																+    
															
 
																+    
															
 
																+def read_data(features_file, demographics_file):
															
 
																+    data_df = pickle.load(open(features_file, 'rb')) # read the data
															
 
																+    demo = pd.read_csv(demographics_file)     # read demographics file
															
 
																+    data_df = pd.concat([demo[['site', 'subject', 'age', 'gender']], data_df], axis=1) # merge them
															
 
																+
															
 
																+    print('Data columns:', data_df.columns)
															
 
																+    print('Data Index:', data_df.index)
															
 
																+
															
 
																+    X = [col for col in data_df if col.startswith('f_')]
															
 
																+    y = 'age'
															
 
																+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
															
 
																+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
															
 
																+    data_df.sort_values(by='age', inplace=True, ignore_index=True)  # sort by age
															
 
																+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
															
 
																+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
															
 
																+    return data_df, X, y
															
 
																+
															
--- a/brainage/xgboost_adapted.py
+++ b/brainage/xgboost_adapted.py
@@ -0,0 +1,46 @@
 
																+from xgboost import XGBRegressor
															
 
																+from sklearn.base import BaseEstimator
															
 
																+from sklearn.model_selection import train_test_split
															
 
																+import numpy as np
															
 
																+
															
 
																+class XGBoostAdapted(BaseEstimator):
															
 
																+
															
 
																+    def __init__(self, early_stopping_rounds=10, eval_metric=None, eval_set_percent=0.2, random_seed=None, n_jobs=1, max_depth=6, n_estimators=50, nthread=1, reg_alpha=0):
															
 
																+        self.early_stopping_rounds = early_stopping_rounds
															
 
																+        self.eval_metric = eval_metric
															
 
																+        self.eval_set_percent = eval_set_percent
															
 
																+        self.random_seed = random_seed
															
 
																+        self.n_jobs = n_jobs
															
 
																+        self.max_depth = max_depth
															
 
																+        self.n_estimators = n_estimators
															
 
																+        self.nthread = nthread
															
 
																+        self.reg_alpha = reg_alpha
															
 
																+
															
 
																+            
															
 
																+    def fit(self, X, y):
															
 
																+        self._xgbregressor = XGBRegressor(n_jobs=self.n_jobs, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, reg_alpha=self.reg_alpha)
															
 
																+
															
 
																+        X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=self.eval_set_percent, random_state=self.random_seed)
															
 
																+
															
 
																+        eval_set = [(X_test, y_test)]
															
 
																+
															
 
																+        self._xgbregressor.fit(X_train, y_train, early_stopping_rounds=self.early_stopping_rounds, eval_metric=self.eval_metric, eval_set=eval_set)
															
 
																+        
															
 
																+        return self
															
 
																+
															
 
																+    def score(self, X, y, sample_weight=None):
															
 
																+        return self._xgbregressor.score(X.values, y.values, sample_weight)
															
 
																+
															
 
																+    def predict(self, X):
															
 
																+        return self._xgbregressor.predict(X.values)
															
 
																+
															
 
																+
															
 
																+
															
 
																+
															
 
																+        
															
 
																+        
															
 
																+
															
 
																+
															
 
																+
															
 
																+
															
 
																+
															
--- a/brainage/zscore.py
+++ b/brainage/zscore.py
@@ -0,0 +1,49 @@
 
																+import numpy as np
															
 
																+from sklearn.base import BaseEstimator, TransformerMixin
															
 
																+from sklearn.utils import check_array
															
 
																+from scipy.stats import zscore
															
 
																+
															
 
																+
															
 
																+class ZScore(BaseEstimator, TransformerMixin):
															
 
																+
															
 
																+    def __init__(self, axis=0):
															
 
																+        self.axis = axis
															
 
																+
															
 
																+    def fit(self, X, y=None):
															
 
																+        X = check_array(X)
															
 
																+        self.mean_ = np.mean(X, axis=self.axis)
															
 
																+        self.std_ = np.std(X, axis=self.axis)
															
 
																+        return self
															
 
																+
															
 
																+    def transform(self, X):
															
 
																+        X = check_array(X)
															
 
																+        mean = (
															
 
																+            self.mean_.reshape(-1, 1)
															
 
																+            if self.axis
															
 
																+            else self.mean_
															
 
																+        )
															
 
																+
															
 
																+        std = (
															
 
																+            self.std_.reshape(-1, 1)
															
 
																+            if self.axis
															
 
																+            else self.std_
															
 
																+        )
															
 
																+        # print(f"{X.shape = }")
															
 
																+        # print(f"{mean.shape = }")
															
 
																+        # print(f"{std.shape = }")
															
 
																+
															
 
																+        return (X - mean) / std
															
 
																+
															
 
																+
															
 
																+class ZScoreSubwise(BaseEstimator, TransformerMixin):
															
 
																+
															
 
																+    def __init__(self, axis=0):
															
 
																+        self.axis = axis
															
 
																+
															
 
																+    def fit(self, X, y=None):
															
 
																+        return self
															
 
																+
															
 
																+    def transform(self, X):
															
 
																+        X = check_array(X)
															
 
																+        return zscore(X, axis=self.axis)
															
 
																+