
Upload files to ''

Lucas Backes 3 weeks ago
parent
commit
b8a507aea3

+ 11 - 0
brainage/__init__.py

@@ -0,0 +1,11 @@
+from .calculate_features import calculate_voxelwise_features, calculate_parcelwise_features
+from .create_splits import stratified_splits
+from .xgboost_adapted import XGBoostAdapted
+from .zscore import ZScoreSubwise, ZScore
+from .create_splits import repeated_stratified_splits
+from .read_data import read_data_cross_site
+from .read_data import read_data
+from .define_models import define_models
+from sklearn.linear_model import LinearRegression
+from .performance_metric import performance_metric
+

BIN
brainage/__pycache__/__init__.cpython-311.pyc


BIN
brainage/__pycache__/__init__.cpython-312.pyc


BIN
brainage/__pycache__/__init__.cpython-39.pyc


BIN
brainage/__pycache__/calculate_features.cpython-311.pyc


BIN
brainage/__pycache__/calculate_features.cpython-312.pyc


BIN
brainage/__pycache__/calculate_features.cpython-39.pyc


BIN
brainage/__pycache__/create_splits.cpython-311.pyc


BIN
brainage/__pycache__/create_splits.cpython-39.pyc


BIN
brainage/__pycache__/define_models.cpython-311.pyc


BIN
brainage/__pycache__/define_models.cpython-39.pyc


BIN
brainage/__pycache__/performance_metric.cpython-311.pyc


BIN
brainage/__pycache__/performance_metric.cpython-39.pyc


BIN
brainage/__pycache__/read_data.cpython-311.pyc


BIN
brainage/__pycache__/read_data.cpython-39.pyc


BIN
brainage/__pycache__/xgboost_adapted.cpython-311.pyc


BIN
brainage/__pycache__/xgboost_adapted.cpython-39.pyc


BIN
brainage/__pycache__/zscore.cpython-311.pyc


BIN
brainage/__pycache__/zscore.cpython-39.pyc


+ 188 - 0
brainage/calculate_features.py

@@ -0,0 +1,188 @@
+import os.path
+import nilearn
+from nilearn import image
+import numpy as np
+import pandas as pd
+import nibabel as nib
+import nibabel.processing as npr
+
+def subsample_img(img, f):
+    """Reduce resample_to_img features of a 3D array by a given factor f."""
+
+    data = img.get_fdata()
+    mask = np.zeros(img.shape)
+    mask[::f, ::f, ::f] = 1
+    data = data * mask
+    return nib.Nifti1Image(data, img.affine, img.header)
+
+def binarize_3d(img, threshold):
+    """binarize 3D spatial image"""
+    return nib.Nifti1Image(
+        np.where(img.get_fdata() > threshold, 1, 0), img.affine, img.header
+    )
+
+def calculate_voxelwise_features(phenotype_file, mask_file, smooth_fwhm, resample_size):
+    """Calculate voxelwise features for the subjects
+
+    Args:
+        phenotype_file (csv or txt): A csv or text file with paths to subject images
+        mask_file (nii): The GM mask file used to extract features
+        smooth_fwhm (int): Smooth images with a Gaussian filter of the given FWHM (mm)
+        resample_size (int): Resample images to the given voxel size (mm)
+
+    Returns:
+        data_resampled (dataframe): pandas dataframe of features (N subjects by M features)
+    """    
+
+    phenotype = pd.read_csv(phenotype_file, header=None)
+    
+    print(phenotype.shape)
+    print(phenotype.head())
+
+#    phenotype = phenotype.iloc[0:15]
+
+    data_resampled = np.array([])  # array to collect resampled features from the subjects' MRIs
+    count = 0
+    for index, row in phenotype.iterrows():  # iterate over each row
+        sub_file = row.values[0]
+
+        if os.path.exists(sub_file):
+            print(f"\n-----Processing subject number {count}------")
+            sub_img = nib.load(sub_file)  # load subject image
+            mask_img = nib.load(mask_file)  # load mask image
+            print("Subject and mask image loaded")
+            print("sub affine original \n", sub_img.affine, sub_img.shape)
+            print("mask affine original \n", mask_img.affine, mask_img.shape)
+
+            print("Perform smoothing")
+            sub_img = image.smooth_img(
+                sub_img, smooth_fwhm
+            )  # smooth the image with the given FWHM
+
+            print("Perform resampling")
+            # trying to match Gaser
+            mask_img_rs = npr.resample_to_output(
+                mask_img, [resample_size] * len(mask_img.shape), order=1
+            )  # resample mask
+            print(
+                "mask affine after resampling\n",
+                mask_img_rs.affine,
+                mask_img_rs.shape,
+            )
+
+            sub_img_rs = image.resample_to_img(
+                sub_img, mask_img_rs, interpolation="linear"
+            )  # resample subject
+            print(
+                "sub affine after resampling\n",
+                sub_img_rs.affine,
+                sub_img_rs.shape,
+            )
+
+            binary_mask_img_rs = binarize_3d(mask_img_rs, 0.5)  # binarize the mask
+            mask_rs = binary_mask_img_rs.get_fdata().astype(bool)
+
+            sub_data_rs = sub_img_rs.get_fdata()[
+                mask_rs
+            ]  # extract voxel using the binarized mask
+            sub_data_rs = sub_data_rs.reshape(1, -1)
+
+            if data_resampled.size == 0:
+                data_resampled = sub_data_rs
+            else:
+                data_resampled = np.concatenate((data_resampled, sub_data_rs), axis=0)
+            count = count + 1
+            print(data_resampled.shape)
+
+    print("\n *** Feature extraction done ***")
+
+    # convert to a dataframe and rename the columns
+    data_resampled = pd.DataFrame(data_resampled)
+    data_resampled.rename(columns=lambda X: "f_" + str(X), inplace=True)
+    print('Feature names:', data_resampled.columns)
+
+    print(f"The size of the feature space is {data_resampled.shape}")
+
+    return data_resampled
+
+
+
+def calculate_parcelwise_features(phenotype_file, mask_dir, num_parcels):
+    """Calculate parcelwise features for the subjects
+
+    Args:
+        phenotype_file (csv or txt): A csv or text file with paths to subject images
+        mask_dir (nii): Parcellation atlas (labelled GM mask) used to extract parcel-wise features
+        num_parcels (int): Number of parcels in the atlas
+    
+    Returns:
+        data_parcels (dataframe): pandas dataframe of features (N subjects by M parcels)
+    """    
+
+    phenotype = pd.read_csv(phenotype_file, header=None)
+
+    print(phenotype.shape)
+    print(phenotype.head())
+#    phenotype = phenotype.iloc[0:15]
+
+    data_parcels = []  # list to collect parcel-wise features from the subjects' MRIs
+    count = 0
+
+    for index, row in phenotype.iterrows(): # iterate over each row
+        sub_file = row.values[0]
+
+        if os.path.exists(sub_file):
+            print(f'\nProcessing subject number {count}')
+            sub_img = nib.load(sub_file)  # load subject image
+            mask_img = nib.load(mask_dir)  # load mask image
+            print ('Subject and mask image loaded')
+            print(sub_file, sub_img.affine, mask_img.affine)
+
+            sub_data = sub_img.get_fdata()
+            sub_data[sub_data == 0] = np.nan  # replace zeros with NaN
+            sub_data_parcels = []
+
+            if not np.array_equal(sub_img.affine, mask_img.affine):
+                mask_img = nilearn.image.resample_to_img(mask_img, sub_img, interpolation='linear')
+            else:
+                print("Subject and mask have same affine")
+
+            for num in range(1, int(num_parcels) + 1):
+                itemindex = np.where(mask_img.get_fdata() == num)  # get indices from the mask for a parcel
+                sub_mat = sub_data[itemindex]
+
+                if np.all(np.isnan(sub_mat)):
+                    sub_agg = 0
+                else:
+                    sub_agg = np.nanmean(sub_mat)  # average the parcel's voxels to get mean GM volume
+                sub_data_parcels.append(sub_agg)
+
+            data_parcels.append(sub_data_parcels)
+            print(len(data_parcels))
+            count = count + 1
+
+    print('\n *** Feature extraction done ***')
+    data_parcels = pd.DataFrame(data_parcels)
+    data_parcels.rename(columns=lambda X: 'f_' + str(X), inplace=True)
+    print(data_parcels.columns)
+
+    print('final dataframe shape', data_parcels.shape)
+    return data_parcels
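
A minimal usage sketch for the two feature extractors above; the file names, 4 mm smoothing and 8 mm resampling are illustrative assumptions, not values fixed by the module:

from brainage import calculate_voxelwise_features, calculate_parcelwise_features

# hypothetical inputs: a text file listing one NIfTI path per line, a GM mask and a parcellation atlas
voxel_df = calculate_voxelwise_features('subject_paths.txt', 'gm_mask.nii', smooth_fwhm=4, resample_size=8)
parcel_df = calculate_parcelwise_features('subject_paths.txt', 'atlas_parcels.nii', num_parcels=100)
print(voxel_df.shape, parcel_df.shape)  # (N subjects, M features) each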

+ 98 - 0
brainage/create_splits.py

@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import math
+import pandas as pd
+from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
+
+
+# def create_splits(data_df, repeats):
+#     num_bins = math.ceil(len(data_df)/repeats) # calculate number of bins to be created
+#     print('num_bins', num_bins, len(data_df)/repeats)
+#
+#     qc = pd.cut(data_df.index, num_bins)
+#     df = pd.DataFrame({'bin': qc.codes})
+#
+#     max_num = max(df['bin'].value_counts())
+#     print(df['bin'].value_counts())
+#     print(max_num, 'max_num')
+#
+#     test_idx = {}
+#     for rpt_num in range(0, repeats):
+#         key = 'repeat_' + str(rpt_num)
+#         test_idx[key] = []
+#
+#     if repeats == max_num:
+#         for num in range(0, max_num):
+#             for bin_idx in df['bin'].unique():
+#                 test = df[df['bin'] == bin_idx]
+#                 if num < len(test):
+#                     key = 'repeat_' + str(num)
+#                     test_idx[key].append(test.index[num])
+#     return test_idx
+
+
+def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state):
+    """
+    :param bins_on: variable used to create bins
+    :param num_bins: num of bins/classes to create
+    :param data: data to create cv splits on
+    :param num_splits: number of cv splits to create
+    :param shuffle: shuffle the data or not
+    :param random_state: random seed to use if shuffle=True
+    :return: a dictionary mapping 'repeat_<i>' to the test indices of fold i
+    """
+    qc = pd.cut(bins_on.tolist(), num_bins)  # divides data in bins
+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, qc.codes):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
+
+def stratified_splits_class(bins_on, data, num_splits, shuffle, random_state):
+    """
+    :param bins_on: variable used to create bins
+    :param data: data to create cv splits on
+    :param num_splits: number of cv splits to create
+    :param shuffle: shuffle the data or not
+    :param random_state: random seed to use if shuffle=True
+    :return: a dictionary mapping 'repeat_<i>' to the test indices of fold i
+    """
+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, bins_on):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
+
+# def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): # useful for run_cross_validation()
+#     """
+#     :param bins_on: variable used to create bins
+#     :param num_bins: num of bins/classes to create
+#     :param data: data to create cv splits on
+#     :param num_splits: number of cv splits to create
+#     :param shuffle: shuffle the data or not
+#     :param random_state: random seed to use if shuffle=True
+#     :return: cv iterator
+#     """
+#     qc = pd.cut(bins_on.tolist(), num_bins)
+#     cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state).split(data, qc.codes)
+#     return cv
+
+
+def repeated_stratified_splits(bins_on, num_bins, data, num_splits, num_repeats, random_state):
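+    """Like stratified_splits, but uses RepeatedStratifiedKFold and returns num_splits * num_repeats test-index sets."""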
+    qc = pd.cut(bins_on.tolist(), num_bins)
+    cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, qc.codes):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
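
A short sketch of how stratified_splits might be called, using a toy dataframe and illustrative values (age is assumed to be the binning variable):

import pandas as pd
from brainage import stratified_splits

df = pd.DataFrame({'age': range(18, 78), 'f_0': range(60)})  # toy data
test_idx = stratified_splits(bins_on=df['age'], num_bins=5, data=df,
                             num_splits=5, shuffle=True, random_state=0)
print(test_idx['repeat_0'])  # test indices of the first fold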

+ 53 - 0
brainage/define_models.py

@@ -0,0 +1,53 @@
+import xgboost as xgb
+from skrvm import RVR
+from glmnet import ElasticNet
+import sklearn.gaussian_process as gp
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.decomposition import PCA
+from brainage import XGBoostAdapted
+from sklearn.feature_selection import VarianceThreshold
+    
+def define_models(rand_seed, var_threshold):
+    # Define all models and model parameters; rand_seed and var_threshold are shared across the parameter grids below
+    rvr_linear = RVR()
+    rvr_poly = RVR()
+    kernel_ridge = KernelRidge()
+    lasso = ElasticNet(alpha=1, standardize=False)
+    elasticnet = ElasticNet(alpha=0.5, standardize=False)
+    ridge = ElasticNet(alpha=0, standardize=False)
+    xgboost_adapted = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2)
+    pca = PCA(n_components=None)  # keeps min(n_samples, n_features) components
+
+
+    model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgboost_adapted]
+    model_para_list = [
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse',
+                    'rf__max_features': 0.33, 'rf__min_samples_leaf': 5,
+                    'rf__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear',
+                    'rvr__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold,
+                    'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0],
+                    'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5},
+
+                    {'variancethreshold__threshold': var_threshold,
+                    'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100,
+                    'gauss__normalize_y': True, 'gauss__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1,
+                    'rvr__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1,
+                    'xgboostadapted__max_depth': [1, 2, 3, 6, 8, 10, 12], 'xgboostadapted__n_estimators': 100,
+                    'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2],
+                    'xgboostadapted__random_seed': rand_seed, 'cv': 5}]  # 'search_params':{'n_jobs': 5}]
+                    
+    return model_list, model_para_list
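
A minimal sketch of calling define_models, assuming the rand_seed/var_threshold parameters in the signature above and that the optional dependencies (skrvm, glmnet, xgboost) are installed; the values are illustrative:

from brainage import define_models

model_list, model_para_list = define_models(rand_seed=42, var_threshold=1e-5)
for model, params in zip(model_list, model_para_list):
    print(model, params['variancethreshold__threshold'])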

+ 8 - 0
brainage/performance_metric.py

@@ -0,0 +1,8 @@
+from sklearn.metrics import mean_absolute_error, mean_squared_error
+import numpy as np
+
+def performance_metric(y_true, y_pred):
+    mae = round(mean_absolute_error(y_true, y_pred), 3)
+    mse = round(mean_squared_error(y_true, y_pred), 3)
+    corr = round(np.corrcoef(y_pred, y_true)[1, 0], 3)
+    return mae, mse, corr
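
For example (toy values):

import numpy as np
from brainage import performance_metric

y_true = np.array([25, 40, 63, 71])
y_pred = np.array([27, 38, 60, 75])
mae, mse, corr = performance_metric(y_true, y_pred)
print(mae, mse, corr)  # mean absolute error, mean squared error, Pearson correlation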

+ 46 - 0
brainage/read_data.py

@@ -0,0 +1,46 @@
+import pickle
+import pandas as pd
+
+def read_data_cross_site(data_file, train_status, confounds):
+    """Read a pickled feature dataframe for cross-site prediction; return the dataframe, feature columns and target name."""
+    with open(data_file, 'rb') as f:
+        data_df = pickle.load(f)
+    X = [col for col in data_df if col.startswith('f_')]
+    y = 'age'
+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
+
+    if confounds is not None:  # convert sites in numbers to perform confound removal
+        if train_status == 'train':
+            site_name = data_df['site'].unique()
+            if type(site_name[0]) == str:
+                site_dict = {k: idx for idx, k in enumerate(site_name)}
+                data_df['site'] = data_df['site'].replace(site_dict)
+
+        elif train_status == 'test':  # add site to features & convert the site name to a number to predict with a model trained with confound removal
+            X.append(confounds)
+            site_name = data_df['site'].unique()[0]
+            if type(site_name) == str:
+                data_df['site'] = 10
+    return data_df, X, y
+    
+    
+    
+def read_data(features_file, demographics_file):
+    """Read pickled features and a demographics csv; return the merged dataframe, feature columns and target name."""
+    with open(features_file, 'rb') as f:
+        data_df = pickle.load(f)  # read the features
+    demo = pd.read_csv(demographics_file)  # read the demographics file
+    data_df = pd.concat([demo[['site', 'subject', 'age', 'gender']], data_df], axis=1)  # merge them
+
+    print('Data columns:', data_df.columns)
+    print('Data Index:', data_df.index)
+
+    X = [col for col in data_df if col.startswith('f_')]
+    y = 'age'
+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
+    data_df.sort_values(by='age', inplace=True, ignore_index=True)  # sort by age
+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
+    return data_df, X, y
+
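
A usage sketch with hypothetical file names; the pickle is expected to hold a dataframe of 'f_' feature columns and the csv the site/subject/age/gender columns:

from brainage import read_data, read_data_cross_site

data_df, X, y = read_data('features.pkl', 'demographics.csv')
test_df, X_test, y_test = read_data_cross_site('site_features.pkl', 'test', confounds='site')
print(len(X), y)  # number of 'f_' feature columns and the target name ('age')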

+ 46 - 0
brainage/xgboost_adapted.py

@@ -0,0 +1,46 @@
+from xgboost import XGBRegressor
+from sklearn.base import BaseEstimator
+from sklearn.model_selection import train_test_split
+import numpy as np
+
+class XGBoostAdapted(BaseEstimator):
+
+    def __init__(self, early_stopping_rounds=10, eval_metric=None, eval_set_percent=0.2, random_seed=None, n_jobs=1, max_depth=6, n_estimators=50, nthread=1, reg_alpha=0):
+        self.early_stopping_rounds = early_stopping_rounds
+        self.eval_metric = eval_metric
+        self.eval_set_percent = eval_set_percent
+        self.random_seed = random_seed
+        self.n_jobs = n_jobs
+        self.max_depth = max_depth
+        self.n_estimators = n_estimators
+        self.nthread = nthread
+        self.reg_alpha = reg_alpha
+
+            
+    def fit(self, X, y):
+        self._xgbregressor = XGBRegressor(n_jobs=self.n_jobs, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, reg_alpha=self.reg_alpha)
+
+        X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=self.eval_set_percent, random_state=self.random_seed)
+
+        eval_set = [(X_test, y_test)]
+
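+        # note: passing early_stopping_rounds/eval_metric to fit() assumes a pre-2.0 xgboost API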
+        self._xgbregressor.fit(X_train, y_train, early_stopping_rounds=self.early_stopping_rounds, eval_metric=self.eval_metric, eval_set=eval_set)
+        
+        return self
+
+    def score(self, X, y, sample_weight=None):
+        return self._xgbregressor.score(X.values, y.values, sample_weight)
+
+    def predict(self, X):
+        return self._xgbregressor.predict(X.values)
+
+
+
+
+        
+        
+
+
+
+
+
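
XGBoostAdapted behaves like a scikit-learn regressor whose fit() holds out part of the training data for early stopping; a toy sketch with synthetic data and illustrative parameters (assuming an xgboost version whose fit() still accepts early_stopping_rounds):

import numpy as np
import pandas as pd
from brainage import XGBoostAdapted

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 10)), columns=[f'f_{i}' for i in range(10)])
y = X['f_0'] * 10 + rng.normal(size=200) + 50  # synthetic age-like target

model = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', random_seed=0).fit(X, y)
print(model.predict(X.head()))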

+ 49 - 0
brainage/zscore.py

@@ -0,0 +1,49 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils import check_array
+from scipy.stats import zscore
+
+
+class ZScore(BaseEstimator, TransformerMixin):
+
+    def __init__(self, axis=0):
+        self.axis = axis
+
+    def fit(self, X, y=None):
+        X = check_array(X)
+        self.mean_ = np.mean(X, axis=self.axis)
+        self.std_ = np.std(X, axis=self.axis)
+        return self
+
+    def transform(self, X):
+        X = check_array(X)
+        mean = (
+            self.mean_.reshape(-1, 1)
+            if self.axis
+            else self.mean_
+        )
+
+        std = (
+            self.std_.reshape(-1, 1)
+            if self.axis
+            else self.std_
+        )
+        # print(f"{X.shape = }")
+        # print(f"{mean.shape = }")
+        # print(f"{std.shape = }")
+
+        return (X - mean) / std
+
+
+class ZScoreSubwise(BaseEstimator, TransformerMixin):
+
+    def __init__(self, axis=0):
+        self.axis = axis
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = check_array(X)
+        return zscore(X, axis=self.axis)
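
ZScore standardizes with the mean/std learned in fit, while ZScoreSubwise re-scores each sample at transform time; a quick comparison on toy data:

import numpy as np
from brainage import ZScore, ZScoreSubwise

X = np.array([[1., 2., 3.], [4., 5., 6.], [7., 8., 9.]])
print(ZScore(axis=0).fit_transform(X))         # each column scaled by its own mean/std
print(ZScoreSubwise(axis=1).fit_transform(X))  # each row z-scored independently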
+