Browse Source

Upload files to 'More'

Lucas Backes 3 weeks ago
parent
commit
9db26a0d29
35 changed files with 690 additions and 0 deletions
  1. 11 0
      More/brainage/__init__.py
  2. BIN
      More/brainage/__pycache__/__init__.cpython-311.pyc
  3. BIN
      More/brainage/__pycache__/__init__.cpython-312.pyc
  4. BIN
      More/brainage/__pycache__/__init__.cpython-39.pyc
  5. BIN
      More/brainage/__pycache__/calculate_features.cpython-311.pyc
  6. BIN
      More/brainage/__pycache__/calculate_features.cpython-312.pyc
  7. BIN
      More/brainage/__pycache__/calculate_features.cpython-39.pyc
  8. BIN
      More/brainage/__pycache__/create_splits.cpython-311.pyc
  9. BIN
      More/brainage/__pycache__/create_splits.cpython-39.pyc
  10. BIN
      More/brainage/__pycache__/define_models.cpython-311.pyc
  11. BIN
      More/brainage/__pycache__/define_models.cpython-39.pyc
  12. BIN
      More/brainage/__pycache__/performance_metric.cpython-311.pyc
  13. BIN
      More/brainage/__pycache__/performance_metric.cpython-39.pyc
  14. BIN
      More/brainage/__pycache__/read_data.cpython-311.pyc
  15. BIN
      More/brainage/__pycache__/read_data.cpython-39.pyc
  16. BIN
      More/brainage/__pycache__/xgboost_adapted.cpython-311.pyc
  17. BIN
      More/brainage/__pycache__/xgboost_adapted.cpython-39.pyc
  18. BIN
      More/brainage/__pycache__/zscore.cpython-311.pyc
  19. BIN
      More/brainage/__pycache__/zscore.cpython-39.pyc
  20. 188 0
      More/brainage/calculate_features.py
  21. 98 0
      More/brainage/create_splits.py
  22. 53 0
      More/brainage/define_models.py
  23. 8 0
      More/brainage/performance_metric.py
  24. 46 0
      More/brainage/read_data.py
  25. 46 0
      More/brainage/xgboost_adapted.py
  26. 49 0
      More/brainage/zscore.py
  27. 7 0
      More/entrypoint.sh
  28. 138 0
      More/predict_age_sing.py
  29. 9 0
      More/pyproject.toml
  30. 32 0
      More/setup.py
  31. 1 0
      More/trained_models/4sites.S0_R4.lasso.models
  32. 1 0
      More/trained_models/4sites.S4_R4.gauss.models
  33. 1 0
      More/trained_models/4sites.S4_R4_pca.gauss.models
  34. 1 0
      More/trained_models/9datasets.S4_R4.gauss.models
  35. 1 0
      More/trained_models/9datasets.S4_R4_pca.gauss.models

+ 11 - 0
More/brainage/__init__.py

@@ -0,0 +1,11 @@
+from .calculate_features import calculate_voxelwise_features, calculate_parcelwise_features
+from .create_splits import stratified_splits
+from .xgboost_adapted import XGBoostAdapted
+from .zscore import ZScoreSubwise, ZScore
+from .create_splits import repeated_stratified_splits
+from .read_data import read_data_cross_site
+from .read_data import read_data
+from .define_models import define_models
+from sklearn.linear_model import LinearRegression
+from .performance_metric import performance_metric
+
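
The __init__.py above defines the package's public surface (note that scikit-learn's LinearRegression is re-exported alongside the local modules). A minimal import sketch, assuming the package has been installed (e.g. with pip install -e More/); the commented file names are purely hypothetical:

from brainage import calculate_voxelwise_features, read_data, define_models, performance_metric

# data_df, X, y = read_data('features.S4_R4', 'demographics.csv')   # hypothetical paths
# model_list, model_para_list = define_models()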

BIN
More/brainage/__pycache__/__init__.cpython-311.pyc


BIN
More/brainage/__pycache__/__init__.cpython-312.pyc


BIN
More/brainage/__pycache__/__init__.cpython-39.pyc


BIN
More/brainage/__pycache__/calculate_features.cpython-311.pyc


BIN
More/brainage/__pycache__/calculate_features.cpython-312.pyc


BIN
More/brainage/__pycache__/calculate_features.cpython-39.pyc


BIN
More/brainage/__pycache__/create_splits.cpython-311.pyc


BIN
More/brainage/__pycache__/create_splits.cpython-39.pyc


BIN
More/brainage/__pycache__/define_models.cpython-311.pyc


BIN
More/brainage/__pycache__/define_models.cpython-39.pyc


BIN
More/brainage/__pycache__/performance_metric.cpython-311.pyc


BIN
More/brainage/__pycache__/performance_metric.cpython-39.pyc


BIN
More/brainage/__pycache__/read_data.cpython-311.pyc


BIN
More/brainage/__pycache__/read_data.cpython-39.pyc


BIN
More/brainage/__pycache__/xgboost_adapted.cpython-311.pyc


BIN
More/brainage/__pycache__/xgboost_adapted.cpython-39.pyc


BIN
More/brainage/__pycache__/zscore.cpython-311.pyc


BIN
More/brainage/__pycache__/zscore.cpython-39.pyc


+ 188 - 0
More/brainage/calculate_features.py

@@ -0,0 +1,188 @@
+import os.path
+import nilearn
+from nilearn import image
+import numpy as np
+import pandas as pd
+import nibabel as nib
+import nibabel.processing as npr
+
+def subsample_img(img, f):
+    """Reduce resample_to_img features of a 3D array by a given factor f."""
+
+    data = img.get_fdata()
+    mask = np.zeros(img.shape)
+    mask[::f, ::f, ::f] = 1
+    data = data * mask
+    return nib.Nifti1Image(data, img.affine, img.header)
+
+def binarize_3d(img, threshold):
+    """binarize 3D spatial image"""
+    return nib.Nifti1Image(
+        np.where(img.get_fdata() > threshold, 1, 0), img.affine, img.header
+    )
+
+def calculate_voxelwise_features(phenotype_file, mask_file, smooth_fwhm, resample_size):
+    """Calculate voxelwise features for the subjects
+
+    Args:
+        phenotype_file (csv or txt): A csv or text file with paths to the subject images (one per row)
+        mask_file (nii): The GM mask file to be used to extract features
+        smooth_fwhm (int): Smooth images by applying a Gaussian filter by given FWHM (mm)
+        resample_size (int): Resample image to given voxel size
+
+    Returns:
+        data_resampled (dataframe): pandas dataframe of features (N subjects by M features)
+    """    
+
+    phenotype = pd.read_csv(phenotype_file, header=None)
+    
+    # don't need this anymore
+    # filename, file_extension = os.path.splitext(phenotype_file)
+    # if file_extension == ".txt":
+    #     phenotype = pd.read_csv(phenotype_file, header=None)
+    # elif file_extension == ".csv":
+    #     phenotype = pd.read_csv(phenotype_file, sep=",", header=None)
+    # else:
+    #     raise ValueError("Wrong file. Please imput either a csv or text file")
+
+    print(phenotype.shape)
+    print(phenotype.head())
+
+#    phenotype = phenotype.iloc[0:15]
+
+    data_resampled = np.array([])  # array to save resampled features from subjects mri
+    count = 0
+    for index, row in phenotype.iterrows():  # iterate over each row
+        sub_file = row.values[0]
+
+        if os.path.exists(sub_file):
+            print(f"\n-----Processing subject number {count}------")
+            sub_img = nib.load(sub_file)  # load subject image
+            mask_img = nib.load(mask_file)  # load mask image
+            print("Subject and mask image loaded")
+            print("sub affine original \n", sub_img.affine, sub_img.shape)
+            print("mask affine original \n", mask_img.affine, mask_img.shape)
+
+            print("Perform smoothing")
+            sub_img = image.smooth_img(
+                sub_img, smooth_fwhm
+            )  # smooth the image with the given FWHM (mm)
+
+            print("Perform resampling")
+            # trying to match Gaser
+            mask_img_rs = npr.resample_to_output(
+                mask_img, [resample_size] * len(mask_img.shape), order=1
+            )  # resample mask
+            print(
+                "mask affine after resampling\n",
+                mask_img_rs.affine,
+                mask_img_rs.shape,
+            )
+
+            sub_img_rs = image.resample_to_img(
+                sub_img, mask_img_rs, interpolation="linear"
+            )  # resample subject
+            print(
+                "sub affine after resampling\n",
+                sub_img_rs.affine,
+                sub_img_rs.shape,
+            )
+
+            binary_mask_img_rs = binarize_3d(mask_img_rs, 0.5)  # binarize the mask
+            mask_rs = binary_mask_img_rs.get_fdata().astype(bool)
+
+            sub_data_rs = sub_img_rs.get_fdata()[
+                mask_rs
+            ]  # extract voxel using the binarized mask
+            sub_data_rs = sub_data_rs.reshape(1, -1)
+
+            if data_resampled.size == 0:
+                data_resampled = sub_data_rs
+            else:
+                data_resampled = np.concatenate((data_resampled, sub_data_rs), axis=0)
+            count = count + 1
+            print(data_resampled.shape)
+
+    print("\n *** Feature extraction done ***")
+
+    # convert to a dataframe and rename the columns
+    data_resampled = pd.DataFrame(data_resampled)
+    data_resampled.rename(columns=lambda X: "f_" + str(X), inplace=True)
+    print('Feature names:', data_resampled.columns)
+
+    print(f"The size of the feature space is {data_resampled.shape}")
+
+    return data_resampled
+
+
+
+def calculate_parcelwise_features(phenotype_file, mask_dir, num_parcels):
+    """Calculate parcelwise features for the subjects
+
+    Args:
+        phenotype_file (csv or text): A csv or text file with paths to the subject images (one per row)
+        mask_dir (nii): The parcellation (atlas) file whose integer labels define the parcels
+        num_parcels (int): Number of parcels in the atlas
+    
+    Returns:
+        data_parcels (dataframe): pandas dataframe of features (N subjects by M parcels)
+    """    
+
+    phenotype = pd.read_csv(phenotype_file, header=None)
+
+    # filename, file_extension = os.path.splitext(phenotype_file)
+
+    # if file_extension == '.txt':
+    #     phenotype = pd.read_csv(phenotype_file, header=None)
+    # elif file_extension == '.csv':
+    #     phenotype = pd.read_csv(phenotype_file, sep=',', header=None)
+    # else:
+    #     raise ValueError("Wrong file. Please imput either a csv or text file")
+
+    print(phenotype.shape)
+    print(phenotype.head())
+#    phenotype = phenotype.iloc[0:15]
+
+    data_parcels = [] #np.array([])  # array to save resampled features from subjects mri
+    count = 0
+
+    for index, row in phenotype.iterrows(): # iterate over each row
+        sub_file = row.values[0]
+
+        if os.path.exists(sub_file):
+            print(f'\nProcessing subject number {count}')
+            sub_img = nib.load(sub_file)  # load subject image
+            mask_img = nib.load(mask_dir)  # load mask image
+            print ('Subject and mask image loaded')
+            print(sub_file, sub_img.affine, mask_img.affine)
+
+            sub_data = sub_img.get_fdata()
+            sub_data[sub_data == 0] = np.nan # replace zeros with NaN
+            sub_data_parcels = []
+
+            if not np.array_equal(sub_img.affine, mask_img.affine):
+                mask_img = nilearn.image.resample_to_img(mask_img, sub_img, interpolation='linear')
+            else:
+                print("Subject and mask have same affine")
+
+            for num in range(1, int(num_parcels) + 1):
+                itemindex = np.where(mask_img.get_fdata() == num)  # get indices from the mask for a parcel
+                sub_mat = sub_data[itemindex]
+
+                if np.all(np.isnan(sub_mat)):
+                    sub_agg = 0
+                else:
+                    sub_agg = np.nanmean(sub_mat) # average the voxels in the parcel to get its mean GM value
+                sub_data_parcels.append(sub_agg)
+
+            data_parcels.append(sub_data_parcels)
+            print(len(data_parcels))
+            count = count + 1
+
+    print('\n *** Feature extraction done ***')
+    data_parcels = pd.DataFrame(data_parcels)
+    data_parcels.rename(columns=lambda X :'f_' + str(X), inplace=True)
+    print(data_parcels.columns)
+
+    print('final dataframe shape', data_parcels.shape)
+    return data_parcels
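
A minimal usage sketch for the two feature extractors above, assuming the phenotype CSV lists one image path per row; all file names here are illustrative only:

from brainage import calculate_voxelwise_features, calculate_parcelwise_features

# Voxelwise features: smooth by 4 mm FWHM, resample to 4 mm voxels, extract GM voxels within the mask
features_vox = calculate_voxelwise_features(
    'subject_paths.csv', 'brainmask_12.8.nii', smooth_fwhm=4, resample_size=4)

# Parcelwise features: mean GM value per atlas label (e.g. a hypothetical 400-parcel atlas)
features_parc = calculate_parcelwise_features(
    'subject_paths.csv', 'atlas_400parcels.nii', num_parcels=400)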

+ 98 - 0
More/brainage/create_splits.py

@@ -0,0 +1,98 @@
+#!/home/smore/.venvs/py3smore/bin/python3
+import math
+import pandas as pd
+from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
+
+
+# def create_splits(data_df, repeats):
+#     num_bins = math.ceil(len(data_df)/repeats) # calculate number of bins to be created
+#     print('num_bins', num_bins, len(data_df)/repeats)
+#
+#     qc = pd.cut(data_df.index, num_bins)
+#     df = pd.DataFrame({'bin': qc.codes})
+#
+#     max_num = max(df['bin'].value_counts())
+#     print(df['bin'].value_counts())
+#     print(max_num, 'max_num')
+#
+#     test_idx = {}
+#     for rpt_num in range(0, repeats):
+#         key = 'repeat_' + str(rpt_num)
+#         test_idx[key] = []
+#
+#     if repeats == max_num:
+#         for num in range(0, max_num):
+#             for bin_idx in df['bin'].unique():
+#                 test = df[df['bin'] == bin_idx]
+#                 if num < len(test):
+#                     key = 'repeat_' + str(num)
+#                     test_idx[key].append(test.index[num])
+#     return test_idx
+
+
+def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state):
+    """
+    :param bins_on: variable used to create bins
+    :param num_bins: num of bins/classes to create
+    :param data: data to create cv splits on
+    :param num_splits: number of cv splits to create
+    :param shuffle: shuffle the data or not
+    :param random_state: random seed to use if shuffle=True
+    :return: a dictionary with index
+    """
+    qc = pd.cut(bins_on.tolist(), num_bins)  # divides data in bins
+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, qc.codes):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
+
+def stratified_splits_class(bins_on, data, num_splits, shuffle, random_state):
+    """
+    :param bins_on: variable used to create bins
+    :param data: data to create cv splits on
+    :param num_splits: number of cv splits to create
+    :param shuffle: shuffle the data or not
+    :param random_state: random seed to use if shuffle=True
+    :return: a dictionary with index
+    """
+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, bins_on):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
+
+# def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): # useful for run_cross_validation()
+#     """
+#     :param bins_on: variable used to create bins
+#     :param num_bins: num of bins/classes to create
+#     :param data: data to create cv splits on
+#     :param num_splits: number of cv splits to create
+#     :param shuffle: shuffle the data or not
+#     :param random_state: random seed to use if shuffle=True
+#     :return: cv iterator
+#     """
+#     qc = pd.cut(bins_on.tolist(), num_bins)
+#     cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state).split(data, qc.codes)
+#     return cv
+
+
+def repeated_stratified_splits(bins_on, num_bins, data, num_splits, num_repeats, random_state):
+    qc = pd.cut(bins_on.tolist(), num_bins)
+    cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, qc.codes):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
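
A short sketch of calling the split helpers above: bins are formed on age, and each function returns a dictionary mapping 'repeat_<i>' to the test indices of that fold (the data here is synthetic and purely illustrative):

import pandas as pd
from brainage import stratified_splits, repeated_stratified_splits

data = pd.DataFrame({'age': range(18, 78)})

# 3 age bins, 5 stratified folds -> keys 'repeat_0' ... 'repeat_4'
splits = stratified_splits(data['age'], num_bins=3, data=data,
                           num_splits=5, shuffle=True, random_state=0)

# 5 folds repeated twice -> keys 'repeat_0' ... 'repeat_9'
rep_splits = repeated_stratified_splits(data['age'], num_bins=3, data=data,
                                        num_splits=5, num_repeats=2, random_state=0)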

+ 53 - 0
More/brainage/define_models.py

@@ -0,0 +1,53 @@
+import xgboost as xgb
+from skrvm import RVR
+from glmnet import ElasticNet
+import sklearn.gaussian_process as gp
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.decomposition import PCA
+from brainage import XGBoostAdapted
+from sklearn.feature_selection import VarianceThreshold
+    
+def define_models(rand_seed=None, var_threshold=0.0):
+    # Define all models and model parameters.
+    # rand_seed seeds the stochastic estimators below; var_threshold is the cutoff
+    # used by the VarianceThreshold step referenced in every parameter dictionary.
+    rvr_linear = RVR()
+    rvr_poly = RVR()
+    kernel_ridge = KernelRidge()
+    lasso = ElasticNet(alpha=1, standardize=False)
+    elasticnet = ElasticNet(alpha=0.5, standardize=False)
+    ridge = ElasticNet(alpha=0, standardize=False)
+    xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2)
+    pca = PCA(n_components=None)  # max as many components as sample size
+
+
+    model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb]
+    model_para_list = [
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse',
+                    'rf__max_features': 0.33, 'rf__min_samples_leaf': 5,
+                    'rf__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear',
+                    'rvr__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold,
+                    'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0],
+                    'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5},
+
+                    {'variancethreshold__threshold': var_threshold,
+                    'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100,
+                    'gauss__normalize_y': True, 'gauss__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1,
+                    'rvr__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1,
+                    'xgboostadapted__max_depth': [1, 2, 3, 6, 8, 10, 12], 'xgboostadapted__n_estimators': 100,
+                    'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2],
+                    'xgboostadapted__random_seed': rand_seed, 'cv': 5}]  # 'search_params':{'n_jobs': 5}]
+                    
+    return model_list, model_para_list
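
The dictionaries above use scikit-learn Pipeline-style step__param keys (e.g. 'variancethreshold__threshold', 'gauss__kernel'), and the list-valued entries look like hyperparameter grids for a downstream search. A sketch of that naming convention with plain scikit-learn, using illustrative values only:

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.kernel_ridge import KernelRidge

pipe = Pipeline([('variancethreshold', VarianceThreshold()),
                 ('kernelridge', KernelRidge())])
# step__param keys address the parameters of individual pipeline steps
pipe.set_params(variancethreshold__threshold=1e-5,
                kernelridge__kernel='polynomial', kernelridge__degree=2)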

+ 8 - 0
More/brainage/performance_metric.py

@@ -0,0 +1,8 @@
+from sklearn.metrics import mean_absolute_error, mean_squared_error
+import numpy as np
+
+def performance_metric(y_true, y_pred):
+    mae = round(mean_absolute_error(y_true, y_pred), 3)
+    mse = round(mean_squared_error(y_true, y_pred), 3)
+    corr = round(np.corrcoef(y_pred, y_true)[1, 0], 3)
+    return mae, mse, corr
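
Example call with made-up numbers; the helper returns mean absolute error, mean squared error, and the Pearson correlation between true and predicted ages, each rounded to three decimals:

from brainage import performance_metric

mae, mse, corr = performance_metric([70, 55, 32, 41], [68.2, 57.1, 30.5, 44.0])
print(mae, mse, corr)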

+ 46 - 0
More/brainage/read_data.py

@@ -0,0 +1,46 @@
+import pickle
+import pandas as pd
+
+def read_data_cross_site(data_file, train_status, confounds):
+    
+    data_df = pickle.load(open(data_file, 'rb'))
+    X = [col for col in data_df if col.startswith('f_')]
+    y = 'age'
+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
+
+    if confounds is not None:  # convert sites in numbers to perform confound removal
+        if train_status == 'train':
+            site_name = data_df['site'].unique()
+            if type(site_name[0]) == str:
+                site_dict = {k: idx for idx, k in enumerate(site_name)}
+                data_df['site'] = data_df['site'].replace(site_dict)
+
+        elif train_status == 'test':  # add site to the features & encode site as a number so a model trained with confound removal can predict
+            X.append(confounds)
+            site_name = data_df['site'].unique()[0]
+            if type(site_name) == str:
+                data_df['site'] = 10  # numeric placeholder site code for the test site
+    return data_df, X, y
+    
+    
+    
+def read_data(features_file, demographics_file):
+    data_df = pickle.load(open(features_file, 'rb')) # read the data
+    demo = pd.read_csv(demographics_file)     # read demographics file
+    data_df = pd.concat([demo[['site', 'subject', 'age', 'gender']], data_df], axis=1) # merge them
+
+    print('Data columns:', data_df.columns)
+    print('Data Index:', data_df.index)
+
+    X = [col for col in data_df if col.startswith('f_')]
+    y = 'age'
+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
+    data_df.sort_values(by='age', inplace=True, ignore_index=True)  # sort by age
+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
+    return data_df, X, y
+
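
A usage sketch under the assumptions this module encodes: the features file is a pickled DataFrame whose feature columns start with 'f_', and the demographics CSV provides 'site', 'subject', 'age' and 'gender' columns; for the cross-site variant the pickled DataFrame must already contain 'subject', 'age' and 'site'. File names are hypothetical:

from brainage import read_data, read_data_cross_site

data_df, X, y = read_data('features.S4_R4', 'demographics.csv')
print(len(X), 'feature columns; target column:', y)   # y is the string 'age'

test_df, X_test, y = read_data_cross_site('test_site.features', 'test', confounds='site')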

+ 46 - 0
More/brainage/xgboost_adapted.py

@@ -0,0 +1,46 @@
+from xgboost import XGBRegressor
+from sklearn.base import BaseEstimator
+from sklearn.model_selection import train_test_split
+import numpy as np
+
+class XGBoostAdapted(BaseEstimator):
+
+    def __init__(self, early_stopping_rounds=10, eval_metric=None, eval_set_percent=0.2, random_seed=None, n_jobs=1, max_depth=6, n_estimators=50, nthread=1, reg_alpha=0):
+        self.early_stopping_rounds = early_stopping_rounds
+        self.eval_metric = eval_metric
+        self.eval_set_percent = eval_set_percent
+        self.random_seed = random_seed
+        self.n_jobs = n_jobs
+        self.max_depth = max_depth
+        self.n_estimators = n_estimators
+        self.nthread = nthread
+        self.reg_alpha = reg_alpha
+
+            
+    def fit(self, X, y):
+        self._xgbregressor = XGBRegressor(n_jobs=self.n_jobs, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, reg_alpha=self.reg_alpha)
+
+        X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=self.eval_set_percent, random_state=self.random_seed)
+
+        eval_set = [(X_test, y_test)]
+
+        self._xgbregressor.fit(X_train, y_train, early_stopping_rounds=self.early_stopping_rounds, eval_metric=self.eval_metric, eval_set=eval_set)
+        
+        return self
+
+    def score(self, X, y, sample_weight=None):
+        return self._xgbregressor.score(X.values, y.values, sample_weight)
+
+    def predict(self, X):
+        return self._xgbregressor.predict(X.values)
+
+
+
+
+        
+        
+
+
+
+
+
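
A small sketch of fitting the wrapper above; it expects pandas inputs (it calls .values internally) and relies on xgboost releases whose XGBRegressor.fit() still accepts early_stopping_rounds and eval_metric (the 1.x API; in xgboost 2.x these moved to the constructor). The data below is random and purely illustrative:

import numpy as np
import pandas as pd
from brainage import XGBoostAdapted

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.random((100, 5)), columns=[f'f_{i}' for i in range(5)])
y = pd.Series(20 + 50 * rng.random(100), name='age')

model = XGBoostAdapted(eval_metric='mae', eval_set_percent=0.2, random_seed=0)
model.fit(X, y)            # holds out 20% internally as the early-stopping eval set
predictions = model.predict(X)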

+ 49 - 0
More/brainage/zscore.py

@@ -0,0 +1,49 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils import check_array
+from scipy.stats import zscore
+
+
+class ZScore(BaseEstimator, TransformerMixin):
+
+    def __init__(self, axis=0):
+        self.axis = axis
+
+    def fit(self, X, y=None):
+        X = check_array(X)
+        self.mean_ = np.mean(X, axis=self.axis)
+        self.std_ = np.std(X, axis=self.axis)
+        return self
+
+    def transform(self, X):
+        X = check_array(X)
+        mean = (
+            self.mean_.reshape(-1, 1)
+            if self.axis
+            else self.mean_
+        )
+
+        std = (
+            self.std_.reshape(-1, 1)
+            if self.axis
+            else self.std_
+        )
+        # print(f"{X.shape = }")
+        # print(f"{mean.shape = }")
+        # print(f"{std.shape = }")
+
+        return (X - mean) / std
+
+
+class ZScoreSubwise(BaseEstimator, TransformerMixin):
+
+    def __init__(self, axis=0):
+        self.axis = axis
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = check_array(X)
+        return zscore(X, axis=self.axis)
+
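
A short sketch contrasting the two transformers above: ZScore standardizes with the mean and standard deviation learned in fit() (feature-wise when axis=0), while ZScoreSubwise z-scores each sample on the fly via scipy. Random data for illustration only:

import numpy as np
from brainage import ZScore, ZScoreSubwise

rng = np.random.default_rng(0)
X_train, X_test = rng.random((20, 4)), rng.random((5, 4))

scaler = ZScore(axis=0).fit(X_train)
X_test_z = scaler.transform(X_test)                        # uses training mean/std per feature

X_test_rowwise = ZScoreSubwise(axis=1).transform(X_test)   # z-score within each row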

+ 7 - 0
More/entrypoint.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+cd /opt/scripts/
+echo "PYTHONPATH:$PYTHONPATH"
+ls /opt/src/brainage/
+
+python predict_age_sing.py --features_path $1 --data_dir $2 --subject_filepaths $3 --output_path $4 --output_prefix $5 --mask_file /opt/masks/brainmask_12.8.nii --smooth_fwhm $6  --resample_size $7 --model_file $8

+ 138 - 0
More/predict_age_sing.py

@@ -0,0 +1,138 @@
+#from read_data_mask_resampled import *
+import sys
+print("sys.path:", sys.path)
+
+from brainage.calculate_features import calculate_voxelwise_features
+from pathlib import Path
+import pandas as pd
+import argparse
+import pickle
+import os
+import re
+
+
+def model_pred(test_df, model_file, feature_space_str):
+    """This functions predicts age
+    Args:
+        test_df (dataframe): test data
+        model_file (pickle file): trained model file
+        feature_space_str (string): feature space name
+
+    Returns:
+        dataframe: predictions from the model
+    """    
+
+    model = pickle.load(open(model_file, 'rb')) # load model
+    pred = pd.DataFrame()
+    for key, model_value in model.items():
+        X = test_df.columns.tolist()
+        pre_X, pre_X2 = model_value.preprocess(test_df[X], test_df[X])  # preprocessed data
+        y_pred = model_value.predict(test_df).ravel()
+        print(y_pred.shape)
+        pred[feature_space_str + '+' + key] = y_pred
+    return pred
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--features_path", type=str, help="path to features dir")  # eg '../data/ADNI'
+    parser.add_argument("--data_dir", type=str, help="path to data dir")  #
+    parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI.paths_cat12.8.csv'
+    parser.add_argument("--output_path", type=str, help="path to output_dir")  # eg'../results/ADNI'
+    parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI'
+    parser.add_argument("--mask_file", type=str, help="path to GM mask nii file",
+                        default='../masks/brainmask_12.8.nii')
+    parser.add_argument("--smooth_fwhm", type=int, help="smoothing FWHM", default=4)
+    parser.add_argument("--resample_size", type=int, help="resampling kernel size", default=4)
+    parser.add_argument("--model_file", type=str, help="Trained model to be used to predict",
+                        default='../trained_models/4sites.S4_R4_pca.gauss.models')
+    # For testing
+    # python3 predict_age.py --features_path ../data/ADNI --subject_filepaths ../data/ADNI/ADNI.paths_cat12.8.csv --output_path ../results/ADNI --output_prefix ADNI --mask_file ../masks/brainmask_12.8.nii  --smooth_fwhm 4 --resample_size 4 --model_file ../trained_models/4sites.S4_R4_pca.gauss.models
+
+    args = parser.parse_args()
+    features_path = args.features_path
+    data_dir = args.data_dir
+    subject_filepaths = args.subject_filepaths
+    output_path = args.output_path
+    output_prefix = args.output_prefix
+    smooth_fwhm = args.smooth_fwhm
+    resample_size = args.resample_size
+    mask_file = args.mask_file
+    model_file = args.model_file
+
+    print('\nBrain-age trained model used: ', model_file)
+    print('Data directory (test data): ', data_dir)
+    print('Subjects filepaths (test data): ', subject_filepaths)
+    print('Directory to features path: ',  features_path)
+    print('Results directory: ', output_path)
+    print('Results filename prefix: ', output_prefix)
+    print('GM mask used: ', mask_file)
+    
+    # create full filename for the nii files of the subjects and save as csv in features_path
+    subject_filepaths_nii = pd.read_csv(subject_filepaths, header=None)
+    subject_filepaths_nii = data_dir + '/' + subject_filepaths_nii
+    print(subject_filepaths_nii)
+    subject_full_filepaths = os.path.join(features_path, 'subject_full_filepaths.csv')
+    print(subject_full_filepaths)
+    subject_filepaths_nii.to_csv(subject_full_filepaths, header=False, index=False)
+    
+    
+    # get feature space name from the model file entered and
+    # create feature space name using the input values (smoothing, resampling)
+    # match them: they should be same
+
+    # get feature space name from the model file entered in argument
+    pipeline_name1 = model_file.split('/')[-1]
+    feature_space = pipeline_name1.split('.')[1]
+    model_name = pipeline_name1.split('.')[2]
+    pipeline_name = feature_space + '.' + model_name
+    
+    # create feature space name using the input values (smoothing, resampling)
+    pca_string = re.findall(r"pca", feature_space)
+    if len(pca_string) == 1:
+        feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) + '_pca'
+    else:
+        feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size)
+
+    # match them: they should be same
+    assert(feature_space_str == feature_space), f"Mismatch in feature parameters entered ({feature_space_str}) & features used for model training ({feature_space})"
+
+    print('Feature space: ', feature_space)
+    print('Model name: ', model_name)
+
+    # Create directories; compute features if they don't exist
+    Path(output_path).mkdir(exist_ok=True, parents=True)
+    Path(features_path).mkdir(exist_ok=True, parents=True)
+    features_filename = str(output_prefix) + '.S' + str(smooth_fwhm) + '_R' + str(resample_size)
+    features_fullfile = os.path.join(features_path, features_filename)
+    print('\nfilename for features created: ', features_fullfile)
+
+    if os.path.isfile(features_fullfile): # check if features file exists
+        print('\n----File exists')
+        data_df = pickle.load(open(features_fullfile, 'rb'))
+        print('Features loaded')
+    else:
+        print('\n-----Extracting features')
+        # create features
+        data_df = calculate_voxelwise_features(subject_full_filepaths, mask_file, smooth_fwhm=smooth_fwhm, resample_size=resample_size)
+        # save features
+        pickle.dump(data_df, open(features_fullfile, "wb"), protocol=4)
+        data_df.to_csv(features_fullfile + '.csv', index=False)
+        print('Feature extraction done and saved')
+
+    # get predictions and save
+    try:
+        predictions_df = model_pred(data_df, model_file, feature_space_str)
+        # save predictions
+        predictions_filename = str(output_prefix) + '.' + pipeline_name + '.prediction.csv'
+        predictions_fullfile = os.path.join(output_path, predictions_filename)
+        print('\nfilename for predictions created: ', predictions_fullfile)
+        predictions_df.to_csv(predictions_fullfile, index=False)
+        print(predictions_df)
+
+    except FileNotFoundError:
+        print(f'{model_file} is not present')
+
+
+
+

+ 9 - 0
More/pyproject.toml

@@ -0,0 +1,9 @@
+[build-system]
+requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"]
+build-backend = "setuptools.build_meta"
+
+[tool.pytest.ini_options]
+addopts = "--cov=brainage"
+testpaths = [
+    "tests",
+]

+ 32 - 0
More/setup.py

@@ -0,0 +1,32 @@
+from setuptools import find_packages, setup
+
+# requirements = []
+# with open("requirements.txt", "r") as f:
+#    for line in f:
+#        requirements.append(line)
+
+setup(
+    name="brainage",
+    version="0.1.0",
+    description="Brainage prediction project",
+    url="https://github.com/juaml/brainage_estimation",
+    author="Applied Machine Learning FZJ",
+    packages=find_packages(),
+    # install_requires=requirements,
+    classifiers=[
+        "Development Status :: 1 - Planning",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: POSIX :: Linux",
+        "Programming Language :: Python :: 2",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.4",
+        "Programming Language :: Python :: 3.5",
+    ],
+    python_requires=">=3.6",
+    include_package_data=True,
+    package_data={"": ["data/*"]},
+)
+
+

+ 1 - 0
More/trained_models/4sites.S0_R4.lasso.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s21234411--36fff9c99dad205bc292616f9038f506

+ 1 - 0
More/trained_models/4sites.S4_R4.gauss.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s771535810--2fb9db5aabd52fabc2d01da1a89b43cf

+ 1 - 0
More/trained_models/4sites.S4_R4_pca.gauss.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s841713501--5be0577ed3c1a2e7c918250ad343dcc7

+ 1 - 0
More/trained_models/9datasets.S4_R4.gauss.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s1439971899--58bdc02b3089255ff02d61699f2b6aa4

+ 1 - 0
More/trained_models/9datasets.S4_R4_pca.gauss.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s1654211173--a782645224e59d35f07639d8850544ef