Browse Source

Upload files to 'More'

Lucas Backes 3 weeks ago
parent
commit
9db26a0d29
35 changed files with 690 additions and 0 deletions
  1. 11 0
      More/brainage/__init__.py
  2. BIN
      More/brainage/__pycache__/__init__.cpython-311.pyc
  3. BIN
      More/brainage/__pycache__/__init__.cpython-312.pyc
  4. BIN
      More/brainage/__pycache__/__init__.cpython-39.pyc
  5. BIN
      More/brainage/__pycache__/calculate_features.cpython-311.pyc
  6. BIN
      More/brainage/__pycache__/calculate_features.cpython-312.pyc
  7. BIN
      More/brainage/__pycache__/calculate_features.cpython-39.pyc
  8. BIN
      More/brainage/__pycache__/create_splits.cpython-311.pyc
  9. BIN
      More/brainage/__pycache__/create_splits.cpython-39.pyc
  10. BIN
      More/brainage/__pycache__/define_models.cpython-311.pyc
  11. BIN
      More/brainage/__pycache__/define_models.cpython-39.pyc
  12. BIN
      More/brainage/__pycache__/performance_metric.cpython-311.pyc
  13. BIN
      More/brainage/__pycache__/performance_metric.cpython-39.pyc
  14. BIN
      More/brainage/__pycache__/read_data.cpython-311.pyc
  15. BIN
      More/brainage/__pycache__/read_data.cpython-39.pyc
  16. BIN
      More/brainage/__pycache__/xgboost_adapted.cpython-311.pyc
  17. BIN
      More/brainage/__pycache__/xgboost_adapted.cpython-39.pyc
  18. BIN
      More/brainage/__pycache__/zscore.cpython-311.pyc
  19. BIN
      More/brainage/__pycache__/zscore.cpython-39.pyc
  20. 188 0
      More/brainage/calculate_features.py
  21. 98 0
      More/brainage/create_splits.py
  22. 53 0
      More/brainage/define_models.py
  23. 8 0
      More/brainage/performance_metric.py
  24. 46 0
      More/brainage/read_data.py
  25. 46 0
      More/brainage/xgboost_adapted.py
  26. 49 0
      More/brainage/zscore.py
  27. 7 0
      More/entrypoint.sh
  28. 138 0
      More/predict_age_sing.py
  29. 9 0
      More/pyproject.toml
  30. 32 0
      More/setup.py
  31. 1 0
      More/trained_models/4sites.S0_R4.lasso.models
  32. 1 0
      More/trained_models/4sites.S4_R4.gauss.models
  33. 1 0
      More/trained_models/4sites.S4_R4_pca.gauss.models
  34. 1 0
      More/trained_models/9datasets.S4_R4.gauss.models
  35. 1 0
      More/trained_models/9datasets.S4_R4_pca.gauss.models

+ 11 - 0
More/brainage/__init__.py

@@ -0,0 +1,11 @@
+from .calculate_features import calculate_voxelwise_features, calculate_parcelwise_features
+from .create_splits import stratified_splits
+from .xgboost_adapted import XGBoostAdapted
+from .zscore import ZScoreSubwise, ZScore
+from .create_splits import repeated_stratified_splits
+from .read_data import read_data_cross_site
+from .read_data import read_data
+from .define_models import define_models
+from sklearn.linear_model import LinearRegression
+from .performance_metric import performance_metric
+
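
The __init__.py above defines the package's public surface (note that scikit-learn's LinearRegression is re-exported alongside the local modules). A minimal import sketch, assuming the package has been installed (e.g. with pip install -e More/); the commented file names are purely hypothetical:

from brainage import calculate_voxelwise_features, read_data, define_models, performance_metric

# data_df, X, y = read_data('features.S4_R4', 'demographics.csv')   # hypothetical paths
# model_list, model_para_list = define_models()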

BIN
More/brainage/__pycache__/__init__.cpython-311.pyc


BIN
More/brainage/__pycache__/__init__.cpython-312.pyc


BIN
More/brainage/__pycache__/__init__.cpython-39.pyc


BIN
More/brainage/__pycache__/calculate_features.cpython-311.pyc


BIN
More/brainage/__pycache__/calculate_features.cpython-312.pyc


BIN
More/brainage/__pycache__/calculate_features.cpython-39.pyc


BIN
More/brainage/__pycache__/create_splits.cpython-311.pyc


BIN
More/brainage/__pycache__/create_splits.cpython-39.pyc


BIN
More/brainage/__pycache__/define_models.cpython-311.pyc


BIN
More/brainage/__pycache__/define_models.cpython-39.pyc


BIN
More/brainage/__pycache__/performance_metric.cpython-311.pyc


BIN
More/brainage/__pycache__/performance_metric.cpython-39.pyc


BIN
More/brainage/__pycache__/read_data.cpython-311.pyc


BIN
More/brainage/__pycache__/read_data.cpython-39.pyc


BIN
More/brainage/__pycache__/xgboost_adapted.cpython-311.pyc


BIN
More/brainage/__pycache__/xgboost_adapted.cpython-39.pyc


BIN
More/brainage/__pycache__/zscore.cpython-311.pyc


BIN
More/brainage/__pycache__/zscore.cpython-39.pyc


+ 188 - 0
More/brainage/calculate_features.py

@@ -0,0 +1,188 @@
+import os.path
+import nilearn
+from nilearn import image
+import numpy as np
+import pandas as pd
+import nibabel as nib
+import nibabel.processing as npr
+
+def subsample_img(img, f):
+    """Reduce resample_to_img features of a 3D array by a given factor f."""
+
+    data = img.get_fdata()
+    mask = np.zeros(img.shape)
+    mask[::f, ::f, ::f] = 1
+    data = data * mask
+    return nib.Nifti1Image(data, img.affine, img.header)
+
+def binarize_3d(img, threshold):
+    """binarize 3D spatial image"""
+    return nib.Nifti1Image(
+        np.where(img.get_fdata() > threshold, 1, 0), img.affine, img.header
+    )
+
+def calculate_voxelwise_features(phenotype_file, mask_file, smooth_fwhm, resample_size):
+    """Calculate voxelwise features for the subjects
+
+    Args:
+        phenotype_file (csv or txt): A csv or text file with paths to the subject images (one per row)
+        mask_file (nii): The GM mask file to be used to extract features
+        smooth_fwhm (int): Smooth images by applying a Gaussian filter by given FWHM (mm)
+        resample_size (int): Resample image to given voxel size
+
+    Returns:
+        data_resampled (dataframe): pandas dataframe of features (N subjects by M features)
+    """    
+
+    phenotype = pd.read_csv(phenotype_file, header=None)
+    
+    # don't need this anymore
+    # filename, file_extension = os.path.splitext(phenotype_file)
+    # if file_extension == ".txt":
+    #     phenotype = pd.read_csv(phenotype_file, header=None)
+    # elif file_extension == ".csv":
+    #     phenotype = pd.read_csv(phenotype_file, sep=",", header=None)
+    # else:
+    #     raise ValueError("Wrong file. Please imput either a csv or text file")
+
+    print(phenotype.shape)
+    print(phenotype.head())
+
+#    phenotype = phenotype.iloc[0:15]
+
+    data_resampled = np.array([])  # array to save resampled features from subjects mri
+    count = 0
+    for index, row in phenotype.iterrows():  # iterate over each row
+        sub_file = row.values[0]
+
+        if os.path.exists(sub_file):
+            print(f"\n-----Processing subject number {count}------")
+            sub_img = nib.load(sub_file)  # load subject image
+            mask_img = nib.load(mask_file)  # load mask image
+            print("Subject and mask image loaded")
+            print("sub affine original \n", sub_img.affine, sub_img.shape)
+            print("mask affine original \n", mask_img.affine, mask_img.shape)
+
+            print("Perform smoothing")
+            sub_img = image.smooth_img(
+                sub_img, smooth_fwhm
+            )  # smooth the image with the given FWHM (mm)
+
+            print("Perform resampling")
+            # trying to match Gaser
+            mask_img_rs = npr.resample_to_output(
+                mask_img, [resample_size] * len(mask_img.shape), order=1
+            )  # resample mask
+            print(
+                "mask affine after resampling\n",
+                mask_img_rs.affine,
+                mask_img_rs.shape,
+            )
+
+            sub_img_rs = image.resample_to_img(
+                sub_img, mask_img_rs, interpolation="linear"
+            )  # resample subject
+            print(
+                "sub affine after resampling\n",
+                sub_img_rs.affine,
+                sub_img_rs.shape,
+            )
+
+            binary_mask_img_rs = binarize_3d(mask_img_rs, 0.5)  # binarize the mask
+            mask_rs = binary_mask_img_rs.get_fdata().astype(bool)
+
+            sub_data_rs = sub_img_rs.get_fdata()[
+                mask_rs
+            ]  # extract voxel using the binarized mask
+            sub_data_rs = sub_data_rs.reshape(1, -1)
+
+            if data_resampled.size == 0:
+                data_resampled = sub_data_rs
+            else:
+                data_resampled = np.concatenate((data_resampled, sub_data_rs), axis=0)
+            count = count + 1
+            print(data_resampled.shape)
+
+    print("\n *** Feature extraction done ***")
+
+    # convert to a dataframe and rename the columns
+    data_resampled = pd.DataFrame(data_resampled)
+    data_resampled.rename(columns=lambda X: "f_" + str(X), inplace=True)
+    print('Feature names:', data_resampled.columns)
+
+    print(f"The size of the feature space is {data_resampled.shape}")
+
+    return data_resampled
+
+
+
+def calculate_parcelwise_features(phenotype_file, mask_dir, num_parcels):
+    """Calculate parcelwise features for the subjects
+
+    Args:
+        phenotype_file (csv or text): A csv or text file with paths to the subject images (one per row)
+        mask_dir (nii): The parcellation (atlas) file whose integer labels define the parcels
+        num_parcels (int): Number of parcels in the atlas
+    
+    Returns:
+        data_parcels (dataframe): pandas dataframe of features (N subjects by M parcels)
+    """    
+
+    phenotype = pd.read_csv(phenotype_file, header=None)
+
+    # filename, file_extension = os.path.splitext(phenotype_file)
+
+    # if file_extension == '.txt':
+    #     phenotype = pd.read_csv(phenotype_file, header=None)
+    # elif file_extension == '.csv':
+    #     phenotype = pd.read_csv(phenotype_file, sep=',', header=None)
+    # else:
+    #     raise ValueError("Wrong file. Please imput either a csv or text file")
+
+    print(phenotype.shape)
+    print(phenotype.head())
+#    phenotype = phenotype.iloc[0:15]
+
+    data_parcels = [] #np.array([])  # array to save resampled features from subjects mri
+    count = 0
+
+    for index, row in phenotype.iterrows(): # iterate over each row
+        sub_file = row.values[0]
+
+        if os.path.exists(sub_file):
+            print(f'\nProcessing subject number {count}')
+            sub_img = nib.load(sub_file)  # load subject image
+            mask_img = nib.load(mask_dir)  # load mask image
+            print ('Subject and mask image loaded')
+            print(sub_file, sub_img.affine, mask_img.affine)
+
+            sub_data = sub_img.get_fdata()
+            sub_data[sub_data == 0] = np.nan # replace zeros with NaN
+            sub_data_parcels = []
+
+            if not np.array_equal(sub_img.affine, mask_img.affine):
+                mask_img = nilearn.image.resample_to_img(mask_img, sub_img, interpolation='linear')
+            else:
+                print("Subject and mask have same affine")
+
+            for num in range(1, int(num_parcels) + 1):
+                itemindex = np.where(mask_img.get_fdata() == num)  # get indices from the mask for a parcel
+                sub_mat = sub_data[itemindex]
+
+                if np.all(np.isnan(sub_mat)):
+                    sub_agg = 0
+                else:
+                    sub_agg = np.nanmean(sub_mat) # average the voxels in the parcel to get its mean GM value
+                sub_data_parcels.append(sub_agg)
+
+            data_parcels.append(sub_data_parcels)
+            print(len(data_parcels))
+            count = count + 1
+
+    print('\n *** Feature extraction done ***')
+    data_parcels = pd.DataFrame(data_parcels)
+    data_parcels.rename(columns=lambda X :'f_' + str(X), inplace=True)
+    print(data_parcels.columns)
+
+    print('final dataframe shape', data_parcels.shape)
+    return data_parcels
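
A minimal usage sketch for the two feature extractors above, assuming the phenotype CSV lists one image path per row; all file names here are illustrative only:

from brainage import calculate_voxelwise_features, calculate_parcelwise_features

# Voxelwise features: smooth by 4 mm FWHM, resample to 4 mm voxels, extract GM voxels within the mask
features_vox = calculate_voxelwise_features(
    'subject_paths.csv', 'brainmask_12.8.nii', smooth_fwhm=4, resample_size=4)

# Parcelwise features: mean GM value per atlas label (e.g. a hypothetical 400-parcel atlas)
features_parc = calculate_parcelwise_features(
    'subject_paths.csv', 'atlas_400parcels.nii', num_parcels=400)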

+ 98 - 0
More/brainage/create_splits.py

@@ -0,0 +1,98 @@
+#!/home/smore/.venvs/py3smore/bin/python3
+import math
+import pandas as pd
+from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold
+
+
+# def create_splits(data_df, repeats):
+#     num_bins = math.ceil(len(data_df)/repeats) # calculate number of bins to be created
+#     print('num_bins', num_bins, len(data_df)/repeats)
+#
+#     qc = pd.cut(data_df.index, num_bins)
+#     df = pd.DataFrame({'bin': qc.codes})
+#
+#     max_num = max(df['bin'].value_counts())
+#     print(df['bin'].value_counts())
+#     print(max_num, 'max_num')
+#
+#     test_idx = {}
+#     for rpt_num in range(0, repeats):
+#         key = 'repeat_' + str(rpt_num)
+#         test_idx[key] = []
+#
+#     if repeats == max_num:
+#         for num in range(0, max_num):
+#             for bin_idx in df['bin'].unique():
+#                 test = df[df['bin'] == bin_idx]
+#                 if num < len(test):
+#                     key = 'repeat_' + str(num)
+#                     test_idx[key].append(test.index[num])
+#     return test_idx
+
+
+def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state):
+    """
+    :param bins_on: variable used to create bins
+    :param num_bins: num of bins/classes to create
+    :param data: data to create cv splits on
+    :param num_splits: number of cv splits to create
+    :param shuffle: shuffle the data or not
+    :param random_state: random seed to use if shuffle=True
+    :return: a dictionary with index
+    """
+    qc = pd.cut(bins_on.tolist(), num_bins)  # divides data in bins
+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, qc.codes):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
+
+def stratified_splits_class(bins_on, data, num_splits, shuffle, random_state):
+    """
+    :param bins_on: variable used to create bins
+    :param data: data to create cv splits on
+    :param num_splits: number of cv splits to create
+    :param shuffle: shuffle the data or not
+    :param random_state: random seed to use if shuffle=True
+    :return: a dictionary with index
+    """
+    cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, bins_on):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
+
+# def stratified_splits(bins_on, num_bins, data, num_splits, shuffle, random_state): # useful for run_cross_validation()
+#     """
+#     :param bins_on: variable used to create bins
+#     :param num_bins: num of bins/classes to create
+#     :param data: data to create cv splits on
+#     :param num_splits: number of cv splits to create
+#     :param shuffle: shuffle the data or not
+#     :param random_state: random seed to use if shuffle=True
+#     :return: cv iterator
+#     """
+#     qc = pd.cut(bins_on.tolist(), num_bins)
+#     cv = StratifiedKFold(n_splits=num_splits, shuffle=shuffle, random_state=random_state).split(data, qc.codes)
+#     return cv
+
+
+def repeated_stratified_splits(bins_on, num_bins, data, num_splits, num_repeats, random_state):
+    qc = pd.cut(bins_on.tolist(), num_bins)
+    cv = RepeatedStratifiedKFold(n_splits=num_splits, n_repeats=num_repeats, random_state=random_state)
+    test_idx = {}
+    rpt_num = 0
+    for train_index, test_index in cv.split(data, qc.codes):
+        key = 'repeat_' + str(rpt_num)
+        test_idx[key] = test_index
+        rpt_num = rpt_num + 1
+    return test_idx
+
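
A short sketch of calling the split helpers above: bins are formed on age, and each function returns a dictionary mapping 'repeat_<i>' to the test indices of that fold (the data here is synthetic and purely illustrative):

import pandas as pd
from brainage import stratified_splits, repeated_stratified_splits

data = pd.DataFrame({'age': range(18, 78)})

# 3 age bins, 5 stratified folds -> keys 'repeat_0' ... 'repeat_4'
splits = stratified_splits(data['age'], num_bins=3, data=data,
                           num_splits=5, shuffle=True, random_state=0)

# 5 folds repeated twice -> keys 'repeat_0' ... 'repeat_9'
rep_splits = repeated_stratified_splits(data['age'], num_bins=3, data=data,
                                        num_splits=5, num_repeats=2, random_state=0)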

+ 53 - 0
More/brainage/define_models.py

@@ -0,0 +1,53 @@
+import xgboost as xgb
+from skrvm import RVR
+from glmnet import ElasticNet
+import sklearn.gaussian_process as gp
+from sklearn.kernel_ridge import KernelRidge
+from sklearn.decomposition import PCA
+from brainage import XGBoostAdapted
+from sklearn.feature_selection import VarianceThreshold
+    
+def define_models(rand_seed=None, var_threshold=0.0):
+    # Define all models and model parameters.
+    # rand_seed seeds the stochastic estimators below; var_threshold is the cutoff
+    # used by the VarianceThreshold step referenced in every parameter dictionary.
+    rvr_linear = RVR()
+    rvr_poly = RVR()
+    kernel_ridge = KernelRidge()
+    lasso = ElasticNet(alpha=1, standardize=False)
+    elasticnet = ElasticNet(alpha=0.5, standardize=False)
+    ridge = ElasticNet(alpha=0, standardize=False)
+    xgb = XGBoostAdapted(early_stopping_rounds=10, eval_metric='mae', eval_set_percent=0.2)
+    pca = PCA(n_components=None)  # max as many components as sample size
+
+
+    model_list = [ridge, 'rf', rvr_linear, kernel_ridge, 'gauss', lasso, elasticnet, rvr_poly, xgb]
+    model_para_list = [
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rf__n_estimators': 500, 'rf__criterion': 'mse',
+                    'rf__max_features': 0.33, 'rf__min_samples_leaf': 5,
+                    'rf__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'linear',
+                    'rvr__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold,
+                    'kernelridge__alpha': [0.0, 0.001, 0.01, 0.1, 0.5, 1.0, 10.0, 100.0, 1000.0],
+                    'kernelridge__kernel': 'polynomial', 'kernelridge__degree': [1, 2], 'cv': 5},
+
+                    {'variancethreshold__threshold': var_threshold,
+                    'gauss__kernel': gp.kernels.RBF(10.0, (1e-7, 10e7)), 'gauss__n_restarts_optimizer': 100,
+                    'gauss__normalize_y': True, 'gauss__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'elasticnet__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'rvr__kernel': 'poly', 'rvr__degree': 1,
+                    'rvr__random_state': rand_seed},
+
+                    {'variancethreshold__threshold': var_threshold, 'xgboostadapted__n_jobs': 1,
+                    'xgboostadapted__max_depth': [1, 2, 3, 6, 8, 10, 12], 'xgboostadapted__n_estimators': 100,
+                    'xgboostadapted__reg_alpha': [0.001, 0.01, 0.05, 0.1, 0.2],
+                    'xgboostadapted__random_seed': rand_seed, 'cv': 5}]  # 'search_params':{'n_jobs': 5}]
+                    
+    return model_list, model_para_list
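
The dictionaries above use scikit-learn Pipeline-style step__param keys (e.g. 'variancethreshold__threshold', 'gauss__kernel'), and the list-valued entries look like hyperparameter grids for a downstream search. A sketch of that naming convention with plain scikit-learn, using illustrative values only:

from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold
from sklearn.kernel_ridge import KernelRidge

pipe = Pipeline([('variancethreshold', VarianceThreshold()),
                 ('kernelridge', KernelRidge())])
# step__param keys address the parameters of individual pipeline steps
pipe.set_params(variancethreshold__threshold=1e-5,
                kernelridge__kernel='polynomial', kernelridge__degree=2)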

+ 8 - 0
More/brainage/performance_metric.py

@@ -0,0 +1,8 @@
+from sklearn.metrics import mean_absolute_error, mean_squared_error
+import numpy as np
+
+def performance_metric(y_true, y_pred):
+    mae = round(mean_absolute_error(y_true, y_pred), 3)
+    mse = round(mean_squared_error(y_true, y_pred), 3)
+    corr = round(np.corrcoef(y_pred, y_true)[1, 0], 3)
+    return mae, mse, corr
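
Example call with made-up numbers; the helper returns mean absolute error, mean squared error, and the Pearson correlation between true and predicted ages, each rounded to three decimals:

from brainage import performance_metric

mae, mse, corr = performance_metric([70, 55, 32, 41], [68.2, 57.1, 30.5, 44.0])
print(mae, mse, corr)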

+ 46 - 0
More/brainage/read_data.py

@@ -0,0 +1,46 @@
+import pickle
+import pandas as pd
+
+def read_data_cross_site(data_file, train_status, confounds):
+    
+    data_df = pickle.load(open(data_file, 'rb'))
+    X = [col for col in data_df if col.startswith('f_')]
+    y = 'age'
+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
+
+    if confounds is not None:  # convert sites in numbers to perform confound removal
+        if train_status == 'train':
+            site_name = data_df['site'].unique()
+            if type(site_name[0]) == str:
+                site_dict = {k: idx for idx, k in enumerate(site_name)}
+                data_df['site'] = data_df['site'].replace(site_dict)
+
+        elif train_status == 'test':  # add site to the features & encode site as a number so a model trained with confound removal can predict
+            X.append(confounds)
+            site_name = data_df['site'].unique()[0]
+            if type(site_name) == str:
+                data_df['site'] = 10  # numeric placeholder site code for the test site
+    return data_df, X, y
+    
+    
+    
+def read_data(features_file, demographics_file):
+    data_df = pickle.load(open(features_file, 'rb')) # read the data
+    demo = pd.read_csv(demographics_file)     # read demographics file
+    data_df = pd.concat([demo[['site', 'subject', 'age', 'gender']], data_df], axis=1) # merge them
+
+    print('Data columns:', data_df.columns)
+    print('Data Index:', data_df.index)
+
+    X = [col for col in data_df if col.startswith('f_')]
+    y = 'age'
+    data_df['age'] = data_df['age'].round().astype(int)  # round off age and convert to integer
+    data_df = data_df[data_df['age'].between(18, 90)].reset_index(drop=True)
+    data_df.sort_values(by='age', inplace=True, ignore_index=True)  # sort by age
+    duplicated_subs_1 = data_df[data_df.duplicated(['subject'], keep='first')] # check for duplicates (multiple sessions for one subject)
+    data_df = data_df.drop(duplicated_subs_1.index).reset_index(drop=True)  # remove duplicated subjects
+    return data_df, X, y
+
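
A usage sketch under the assumptions this module encodes: the features file is a pickled DataFrame whose feature columns start with 'f_', and the demographics CSV provides 'site', 'subject', 'age' and 'gender' columns; for the cross-site variant the pickled DataFrame must already contain 'subject', 'age' and 'site'. File names are hypothetical:

from brainage import read_data, read_data_cross_site

data_df, X, y = read_data('features.S4_R4', 'demographics.csv')
print(len(X), 'feature columns; target column:', y)   # y is the string 'age'

test_df, X_test, y = read_data_cross_site('test_site.features', 'test', confounds='site')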

+ 46 - 0
More/brainage/xgboost_adapted.py

@@ -0,0 +1,46 @@
+from xgboost import XGBRegressor
+from sklearn.base import BaseEstimator
+from sklearn.model_selection import train_test_split
+import numpy as np
+
+class XGBoostAdapted(BaseEstimator):
+
+    def __init__(self, early_stopping_rounds=10, eval_metric=None, eval_set_percent=0.2, random_seed=None, n_jobs=1, max_depth=6, n_estimators=50, nthread=1, reg_alpha=0):
+        self.early_stopping_rounds = early_stopping_rounds
+        self.eval_metric = eval_metric
+        self.eval_set_percent = eval_set_percent
+        self.random_seed = random_seed
+        self.n_jobs = n_jobs
+        self.max_depth = max_depth
+        self.n_estimators = n_estimators
+        self.nthread = nthread
+        self.reg_alpha = reg_alpha
+
+            
+    def fit(self, X, y):
+        self._xgbregressor = XGBRegressor(n_jobs=self.n_jobs, max_depth=self.max_depth, n_estimators=self.n_estimators, nthread=self.nthread, reg_alpha=self.reg_alpha)
+
+        X_train, X_test, y_train, y_test = train_test_split(X.values, y.values, test_size=self.eval_set_percent, random_state=self.random_seed)
+
+        eval_set = [(X_test, y_test)]
+
+        self._xgbregressor.fit(X_train, y_train, early_stopping_rounds=self.early_stopping_rounds, eval_metric=self.eval_metric, eval_set=eval_set)
+        
+        return self
+
+    def score(self, X, y, sample_weight=None):
+        return self._xgbregressor.score(X.values, y.values, sample_weight)
+
+    def predict(self, X):
+        return self._xgbregressor.predict(X.values)
+
+
+
+
+        
+        
+
+
+
+
+
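
A small sketch of fitting the wrapper above; it expects pandas inputs (it calls .values internally) and relies on xgboost releases whose XGBRegressor.fit() still accepts early_stopping_rounds and eval_metric (the 1.x API; in xgboost 2.x these moved to the constructor). The data below is random and purely illustrative:

import numpy as np
import pandas as pd
from brainage import XGBoostAdapted

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.random((100, 5)), columns=[f'f_{i}' for i in range(5)])
y = pd.Series(20 + 50 * rng.random(100), name='age')

model = XGBoostAdapted(eval_metric='mae', eval_set_percent=0.2, random_seed=0)
model.fit(X, y)            # holds out 20% internally as the early-stopping eval set
predictions = model.predict(X)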

+ 49 - 0
More/brainage/zscore.py

@@ -0,0 +1,49 @@
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.utils import check_array
+from scipy.stats import zscore
+
+
+class ZScore(BaseEstimator, TransformerMixin):
+
+    def __init__(self, axis=0):
+        self.axis = axis
+
+    def fit(self, X, y=None):
+        X = check_array(X)
+        self.mean_ = np.mean(X, axis=self.axis)
+        self.std_ = np.std(X, axis=self.axis)
+        return self
+
+    def transform(self, X):
+        X = check_array(X)
+        mean = (
+            self.mean_.reshape(-1, 1)
+            if self.axis
+            else self.mean_
+        )
+
+        std = (
+            self.std_.reshape(-1, 1)
+            if self.axis
+            else self.std_
+        )
+        # print(f"{X.shape = }")
+        # print(f"{mean.shape = }")
+        # print(f"{std.shape = }")
+
+        return (X - mean) / std
+
+
+class ZScoreSubwise(BaseEstimator, TransformerMixin):
+
+    def __init__(self, axis=0):
+        self.axis = axis
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        X = check_array(X)
+        return zscore(X, axis=self.axis)
+
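
A short sketch contrasting the two transformers above: ZScore standardizes with the mean and standard deviation learned in fit() (feature-wise when axis=0), while ZScoreSubwise z-scores each sample on the fly via scipy. Random data for illustration only:

import numpy as np
from brainage import ZScore, ZScoreSubwise

rng = np.random.default_rng(0)
X_train, X_test = rng.random((20, 4)), rng.random((5, 4))

scaler = ZScore(axis=0).fit(X_train)
X_test_z = scaler.transform(X_test)                        # uses training mean/std per feature

X_test_rowwise = ZScoreSubwise(axis=1).transform(X_test)   # z-score within each row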

+ 7 - 0
More/entrypoint.sh

@@ -0,0 +1,7 @@
+#!/bin/bash
+
+cd /opt/scripts/
+echo "PYTHONPATH:$PYTHONPATH"
+ls /opt/src/brainage/
+
+python predict_age_sing.py --features_path $1 --data_dir $2 --subject_filepaths $3 --output_path $4 --output_prefix $5 --mask_file /opt/masks/brainmask_12.8.nii --smooth_fwhm $6  --resample_size $7 --model_file $8

+ 138 - 0
More/predict_age_sing.py

@@ -0,0 +1,138 @@
+#from read_data_mask_resampled import *
+import sys
+print("sys.path:", sys.path)
+
+from brainage.calculate_features import calculate_voxelwise_features
+from pathlib import Path
+import pandas as pd
+import argparse
+import pickle
+import os
+import re
+
+
+def model_pred(test_df, model_file, feature_space_str):
+    """This functions predicts age
+    Args:
+        test_df (dataframe): test data
+        model_file (pickle file): trained model file
+        feature_space_str (string): feature space name
+
+    Returns:
+        dataframe: predictions from the model
+    """    
+
+    model = pickle.load(open(model_file, 'rb')) # load model
+    pred = pd.DataFrame()
+    for key, model_value in model.items():
+        X = test_df.columns.tolist()
+        pre_X, pre_X2 = model_value.preprocess(test_df[X], test_df[X])  # preprocessed data
+        y_pred = model_value.predict(test_df).ravel()
+        print(y_pred.shape)
+        pred[feature_space_str + '+' + key] = y_pred
+    return pred
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--features_path", type=str, help="path to features dir")  # eg '../data/ADNI'
+    parser.add_argument("--data_dir", type=str, help="path to data dir")  #
+    parser.add_argument("--subject_filepaths", type=str, help="path to csv or txt file with subject filepaths") # eg: '../data/ADNI/ADNI.paths_cat12.8.csv'
+    parser.add_argument("--output_path", type=str, help="path to output_dir")  # eg'../results/ADNI'
+    parser.add_argument("--output_prefix", type=str, help="prefix added to features filename ans results (predictions) file name") # eg: 'ADNI'
+    parser.add_argument("--mask_file", type=str, help="path to GM mask nii file",
+                        default='../masks/brainmask_12.8.nii')
+    parser.add_argument("--smooth_fwhm", type=int, help="smoothing FWHM", default=4)
+    parser.add_argument("--resample_size", type=int, help="resampling kernel size", default=4)
+    parser.add_argument("--model_file", type=str, help="Trained model to be used to predict",
+                        default='../trained_models/4sites.S4_R4_pca.gauss.models')
+    # For testing
+    # python3 predict_age.py --features_path ../data/ADNI --subject_filepaths ../data/ADNI/ADNI.paths_cat12.8.csv --output_path ../results/ADNI --output_prefix ADNI --mask_file ../masks/brainmask_12.8.nii  --smooth_fwhm 4 --resample_size 4 --model_file ../trained_models/4sites.S4_R4_pca.gauss.models
+
+    args = parser.parse_args()
+    features_path = args.features_path
+    data_dir = args.data_dir
+    subject_filepaths = args.subject_filepaths
+    output_path = args.output_path
+    output_prefix = args.output_prefix
+    smooth_fwhm = args.smooth_fwhm
+    resample_size = args.resample_size
+    mask_file = args.mask_file
+    model_file = args.model_file
+
+    print('\nBrain-age trained model used: ', model_file)
+    print('Data directory (test data): ', data_dir)
+    print('Subjects filepaths (test data): ', subject_filepaths)
+    print('Directory to features path: ',  features_path)
+    print('Results directory: ', output_path)
+    print('Results filename prefix: ', output_prefix)
+    print('GM mask used: ', mask_file)
+    
+    # create full filename for the nii files of the subjects and save as csv in features_path
+    subject_filepaths_nii = pd.read_csv(subject_filepaths, header=None)
+    subject_filepaths_nii = data_dir + '/' + subject_filepaths_nii
+    print(subject_filepaths_nii)
+    subject_full_filepaths = os.path.join(features_path, 'subject_full_filepaths.csv')
+    print(subject_full_filepaths)
+    subject_filepaths_nii.to_csv(subject_full_filepaths, header=False, index=False)
+    
+    
+    # get feature space name from the model file entered and
+    # create feature space name using the input values (smoothing, resampling)
+    # match them: they should be same
+
+    # get feature space name from the model file entered in argument
+    pipeline_name1 = model_file.split('/')[-1]
+    feature_space = pipeline_name1.split('.')[1]
+    model_name = pipeline_name1.split('.')[2]
+    pipeline_name = feature_space + '.' + model_name
+    
+    # create feature space name using the input values (smoothing, resampling)
+    pca_string = re.findall(r"pca", feature_space)
+    if len(pca_string) == 1:
+        feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size) + '_pca'
+    else:
+        feature_space_str = 'S' + str(smooth_fwhm) + '_R' + str(resample_size)
+
+    # match them: they should be same
+    assert(feature_space_str == feature_space), f"Mismatch in feature parameters entered ({feature_space_str}) & features used for model training ({feature_space})"
+
+    print('Feature space: ', feature_space)
+    print('Model name: ', model_name)
+
+    # Create directories; compute features if they don't exist
+    Path(output_path).mkdir(exist_ok=True, parents=True)
+    Path(features_path).mkdir(exist_ok=True, parents=True)
+    features_filename = str(output_prefix) + '.S' + str(smooth_fwhm) + '_R' + str(resample_size)
+    features_fullfile = os.path.join(features_path, features_filename)
+    print('\nfilename for features created: ', features_fullfile)
+
+    if os.path.isfile(features_fullfile): # check if features file exists
+        print('\n----File exists')
+        data_df = pickle.load(open(features_fullfile, 'rb'))
+        print('Features loaded')
+    else:
+        print('\n-----Extracting features')
+        # create features
+        data_df = calculate_voxelwise_features(subject_full_filepaths, mask_file, smooth_fwhm=smooth_fwhm, resample_size=resample_size)
+        # save features
+        pickle.dump(data_df, open(features_fullfile, "wb"), protocol=4)
+        data_df.to_csv(features_fullfile + '.csv', index=False)
+        print('Feature extraction done and saved')
+
+    # get predictions and save
+    try:
+        predictions_df = model_pred(data_df, model_file, feature_space_str)
+        # save predictions
+        predictions_filename = str(output_prefix) + '.' + pipeline_name + '.prediction.csv'
+        predictions_fullfile = os.path.join(output_path, predictions_filename)
+        print('\nfilename for predictions created: ', predictions_fullfile)
+        predictions_df.to_csv(predictions_fullfile, index=False)
+        print(predictions_df)
+
+    except FileNotFoundError:
+        print(f'{model_file} is not present')
+
+
+
+

+ 9 - 0
More/pyproject.toml

@@ -0,0 +1,9 @@
+[build-system]
+requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4"]
+build-backend = "setuptools.build_meta"
+
+[tool.pytest.ini_options]
+addopts = "--cov=brainage"
+testpaths = [
+    "tests",
+]

+ 32 - 0
More/setup.py

@@ -0,0 +1,32 @@
+from setuptools import find_packages, setup
+
+# requirements = []
+# with open("requirements.txt", "r") as f:
+#    for line in f:
+#        requirements.append(line)
+
+setup(
+    name="brainage",
+    version="0.1.0",
+    description="Brainage prediction project",
+    url="https://github.com/juaml/brainage_estimation",
+    author="Applied Machine Learning FZJ",
+    packages=find_packages(),
+    # install_requires=requirements,
+    classifiers=[
+        "Development Status :: 1 - Planning",
+        "Intended Audience :: Science/Research",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: POSIX :: Linux",
+        "Programming Language :: Python :: 2",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 3",
+        "Programming Language :: Python :: 3.4",
+        "Programming Language :: Python :: 3.5",
+    ],
+    python_requires=">=3.6",
+    include_package_data=True,
+    package_data={"": ["data/*"]},
+)
+
+

+ 1 - 0
More/trained_models/4sites.S0_R4.lasso.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s21234411--36fff9c99dad205bc292616f9038f506

+ 1 - 0
More/trained_models/4sites.S4_R4.gauss.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s771535810--2fb9db5aabd52fabc2d01da1a89b43cf

+ 1 - 0
More/trained_models/4sites.S4_R4_pca.gauss.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s841713501--5be0577ed3c1a2e7c918250ad343dcc7

+ 1 - 0
More/trained_models/9datasets.S4_R4.gauss.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s1439971899--58bdc02b3089255ff02d61699f2b6aa4

+ 1 - 0
More/trained_models/9datasets.S4_R4_pca.gauss.models

@@ -0,0 +1 @@
+/annex/objects/MD5-s1654211173--a782645224e59d35f07639d8850544ef