In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import glob
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score
%matplotlib inline
# Render figures at 100 dpi; this controls how large they appear in the notebook.
plt.rcParams.update({'figure.dpi': 100})
# Echo the value of every top-level expression of a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## Data Loading and Subsetting

In [2]:
# Read the feature table and discard rows whose distance is the literal string "not_specified".
features_csv = "C:/Users/franz/Documents/Bachelor Studium/BachelorArbeit/JupyterLabBA/files/DLCfiles_final_results/res_file.csv"
df = pd.read_csv(features_csv)
df = df.loc[df["distance"] != "not_specified"]


def _rows_of(group_label):
    """Return an independent copy of the rows of `df` whose "group" column equals `group_label`."""
    return df.loc[df["group"] == group_label].copy()


# One frame per experimental group, named exactly like its "group" label.
MOPstroke_BL = _rows_of("MOPstroke_BL")
MOPstroke_P3 = _rows_of("MOPstroke_P3")
MOPstroke_P28 = _rows_of("MOPstroke_P28")

MOSstroke_BL = _rows_of("MOSstroke_BL")
MOSstroke_P3 = _rows_of("MOSstroke_P3")
MOSstroke_P28 = _rows_of("MOSstroke_P28")

WMstroke_BL = _rows_of("WMstroke_BL")
WMstroke_P3 = _rows_of("WMstroke_P3")
WMstroke_P28 = _rows_of("WMstroke_P28")

MOPMOSsham_BL = _rows_of("MOPMOSsham_BL")
MOPMOSsham_P3 = _rows_of("MOPMOSsham_P3")
MOPMOSsham_P28 = _rows_of("MOPMOSsham_P28")

WMsham_BL = _rows_of("WMsham_BL")
WMsham_P3 = _rows_of("WMsham_P3")
WMsham_P28 = _rows_of("WMsham_P28")

In [3]:
# feature_names is assembled from its naming scheme instead of being spelled out name by name.
# The resulting list (content and order) is the same as the original hand-written literal.

# Median gait parameters, reported once for the front paw (fp) and once for the hind paw (hp).
_GAIT_PARAMETERS = ("cycle_dur", "stance_dur", "swing_dur", "sw_st_ratio", "stride_len")

# Signals summarised by the four basic statistics below.
_BASIC_SIGNALS = ("paw_dist", "fp_vel", "hp_vel", "tilt_up", "tilt_low", "snout_h", "midb_h")
_BASIC_STATS = ("min", "max", "mean", "std")

# Per-joint measures and the statistics stored for each of them (order matters).
_JOINTS = ("ank", "kn", "hip", "wr", "el", "sh")
_JOINT_MEASURES = (
    ("h", ("min", "max", "mean", "median", "std")),
    ("ang", ("min", "max", "mean", "median", "ROM", "std")),
    ("angv", ("min", "max", "mean", "std")),
    ("pos", ("min", "max", "mean", "std")),
    ("anginit", ("min", "max", "mean", "median", "std")),
    ("angpsw", ("min", "max", "mean", "median", "std")),
)

feature_names = (
    ["time", "distance", "average_speed"]
    + ["{}_{}_median".format(paw, parameter)
       for paw in ("fp", "hp")
       for parameter in _GAIT_PARAMETERS]
    + ["{}_{}".format(signal, stat)
       for signal in _BASIC_SIGNALS
       for stat in _BASIC_STATS]
    + ["no_paw_contact", "single_paw_contact", "double_paw_contact"]
    + ["{}_{}_{}".format(joint, measure, stat)
       for joint in _JOINTS
       for measure, stats in _JOINT_MEASURES
       for stat in stats]
    + ["hdrop_num_abs", "hdrop_num_rel"]
    + ["under_beam"]
)

In [4]:
# Pick the two or three group frames the classifier should tell apart.
# These are plain references, not copies: df1/df2 are the very same objects as the chosen subsets.
# df3 stays an empty DataFrame when only two groups are compared.
df1, df2, df3 = MOPstroke_P3, MOPMOSsham_P3, pd.DataFrame()

In [5]:
# Summary statistics for each selected group.
# Both tables are rendered only because ast_node_interactivity is set to "all" in the first cell;
# with the default setting just the last expression would be shown.
df1.describe()
df2.describe()
# Uncomment when a third group is selected (describe() fails on the empty placeholder frame).
#df3.describe()

Unnamed: 0,time,fp_cycle_dur_median,fp_stance_dur_median,fp_swing_dur_median,fp_sw_st_ratio_median,fp_stride_len_median,hp_cycle_dur_median,hp_stance_dur_median,hp_swing_dur_median,hp_sw_st_ratio_median,...,sh_anginit_mean,sh_anginit_median,sh_anginit_std,sh_angpsw_min,sh_angpsw_max,sh_angpsw_mean,sh_angpsw_median,sh_angpsw_std,hdrop_num_abs,under_beam
count,101.0,101.0,101.0,101.0,101.0,100.0,101.0,101.0,101.0,101.0,...,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0,101.0
mean,8.56946,0.412788,0.069316,0.330655,4.897041,0.160896,0.482551,0.063783,0.418767,9.509027,...,220.932931,222.140768,17.251572,189.510422,246.704918,219.029805,220.039472,16.216653,1.811881,0.051994
std,5.172731,0.254325,0.015501,0.247497,3.485659,0.163586,0.452034,0.0132,0.446884,11.7047,...,8.888797,9.552478,4.921159,10.224294,9.745654,8.049792,8.641654,3.751567,2.504847,0.081752
min,3.765273,0.193091,0.045137,0.128727,1.666667,-0.019821,0.221667,0.038031,0.152614,3.267857,...,195.964708,196.871671,8.015501,162.994557,222.802952,201.183966,196.708327,7.801642,0.0,0.0
25%,5.627632,0.271453,0.064231,0.196953,3.0,0.044324,0.294882,0.054167,0.237876,5.563866,...,216.375162,215.788337,13.338026,182.89566,241.514277,213.950086,216.237298,13.178168,0.0,0.0
50%,7.350081,0.333333,0.066667,0.25,4.0,0.131568,0.359567,0.061905,0.299535,6.957738,...,221.016359,223.754013,17.035766,186.846959,246.307548,219.075362,220.463486,16.641529,1.0,0.017544
75%,9.033333,0.433333,0.069105,0.357765,5.5,0.220495,0.484211,0.072094,0.421053,9.146875,...,226.354558,228.801225,20.925703,193.56439,252.753739,224.454297,225.649237,19.646506,3.0,0.055172
max,42.509025,1.6315,0.133333,1.485228,27.25,1.083275,3.9066,0.101502,3.807584,103.98381,...,239.566316,240.441019,33.949624,220.114148,268.970073,240.642112,241.787432,25.013106,13.0,0.410188


Unnamed: 0,time,fp_cycle_dur_median,fp_stance_dur_median,fp_swing_dur_median,fp_sw_st_ratio_median,fp_stride_len_median,hp_cycle_dur_median,hp_stance_dur_median,hp_swing_dur_median,hp_sw_st_ratio_median,...,sh_anginit_mean,sh_anginit_median,sh_anginit_std,sh_angpsw_min,sh_angpsw_max,sh_angpsw_mean,sh_angpsw_median,sh_angpsw_std,hdrop_num_abs,under_beam
count,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,...,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0,83.0
mean,6.222911,0.309528,0.072473,0.225063,3.370152,0.19317,0.316402,0.058356,0.258046,5.898642,...,223.165525,223.815682,14.491601,195.040818,245.1783,221.084253,221.097157,14.039303,0.710843,0.0139
std,2.099432,0.090808,0.019437,0.093464,1.789558,0.139702,0.091626,0.011285,0.089563,2.288775,...,11.880388,12.651813,4.031649,14.365911,11.65814,11.930484,12.811183,3.790966,2.471941,0.045644
min,3.375587,0.189285,0.031935,0.09298,1.0,-0.007645,0.189263,0.040351,0.135775,2.965789,...,197.137017,195.638895,7.228378,168.859958,219.366591,198.863748,199.416563,8.69991,0.0,0.0
25%,4.866214,0.249921,0.064049,0.163635,2.0,0.083155,0.250915,0.051652,0.197152,4.304436,...,213.990719,214.149501,11.676275,183.546411,237.482189,212.143198,212.835242,10.860017,0.0,0.0
50%,5.533333,0.287411,0.066667,0.2,3.0,0.166551,0.288865,0.056164,0.233616,5.171875,...,222.35507,221.84371,13.835482,189.553503,244.588448,218.782297,218.980809,12.956647,0.0,0.0
75%,6.83359,0.356065,0.081653,0.263161,4.5,0.295055,0.372115,0.063228,0.311286,7.202478,...,232.538214,232.795688,16.909589,207.00685,255.574905,231.022127,232.224749,16.832864,0.0,0.0
max,11.326717,0.550972,0.157494,0.5,10.0,0.645501,0.583193,0.099884,0.483309,13.336508,...,246.930219,247.688365,24.023483,233.153303,269.994174,248.686375,249.724347,23.33456,15.0,0.284091


## Data pre-processing

In [6]:
# Total number of missing cells in each selected frame (displayed as cell output).
df1.isnull().sum().sum()
df2.isnull().sum().sum()
df3.isnull().sum().sum()

# Drop rows with missing values.
# The names are rebound instead of using dropna(inplace=True): df1/df2 are references to the
# group subsets, so an in-place drop would silently change those subsets as well and make
# earlier cells show different numbers when re-run.
df1 = df1.dropna()
df2 = df2.dropna()
df3 = df3.dropna()

# Stack the selected groups into one frame. Empty frames (the df3 placeholder of a two-group
# comparison) are left out: pandas deprecates concatenating empty entries because they will
# start to influence the result dtypes in a future version.
selected_frames = [frame for frame in (df1, df2, df3) if not frame.empty]
df = pd.concat(selected_frames)
df["group"].value_counts()

# Remove the non-feature bookkeeping columns.
df = df.drop(['file_name', 'reason_end'], axis=1)

1

1

0.0

MOPstroke_P3     100
MOPMOSsham_P3     82
Name: group, dtype: int64

In [7]:
# Response: the group label of every row. Features: all remaining columns, coerced to numeric
# dtypes (pd.to_numeric raises if a column holds a value that cannot be parsed).
y = df["group"]
X = df.drop(columns="group").apply(pd.to_numeric)

# Encode the labels as integers. The encoder is fitted on the complete list of group names
# rather than on the labels present in y, so inverse_transform works for any selection.
known_groups = [
    "MOPstroke_BL", "MOPstroke_P3", "MOPstroke_P28",
    "MOSstroke_BL", "MOSstroke_P3", "MOSstroke_P28",
    "WMstroke_BL", "WMstroke_P3", "WMstroke_P28",
    "MOPMOSsham_BL", "MOPMOSsham_P3", "MOPMOSsham_P28",
    "WMsham_BL", "WMsham_P3", "WMsham_P28",
]
le = LabelEncoder()
le.fit(known_groups)
y = le.transform(y)


LabelEncoder()

## Random Forest Classifier

In [8]:
# Train one random forest on each of 100 random 80/20 splits and keep the model (together with
# its already standardized split) that reaches the highest test accuracy.
#
# NOTE(review): choosing the split by its test accuracy makes every metric reported for
# best_model optimistic (the test set takes part in model selection). Consider reporting the
# mean/std over all splits or using cross-validation instead.

# Start below any possible accuracy so that the first split always initialises best_model and
# data_split, even if its accuracy is 0.
acc_score = float("-inf")
for i in range(100):
    if (i+1) % 10 == 0:
        print("iterations: {}/100".format(i+1))
    # Split data in train and test set; seeding with the iteration index gives 100 different
    # splits that are identical on every re-run of the notebook.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i)
    # Scaling data: statistics are learned on the training part only and re-used for the test part.
    sc = StandardScaler()
    X_train = sc.fit_transform(X_train)
    X_test = sc.transform(X_test)
    # Random Forest Classifier; seeded for reproducibility, n_jobs=-1 builds the 2000 trees on
    # all cores (does not change the fitted model for a fixed random_state).
    rfc = RandomForestClassifier(n_estimators=2000, bootstrap=True, random_state=i, n_jobs=-1).fit(X_train, y_train)
    pred_rfc = rfc.predict(X_test)
    split_accuracy = accuracy_score(y_test, pred_rfc)
    if split_accuracy > acc_score:
        best_model = rfc
        acc_score = split_accuracy
        # The stored arrays are the standardized ones; they must not be scaled again later.
        data_split = [X_train, X_test, y_train, y_test]

iterations: 10/100
iterations: 20/100
iterations: 30/100
iterations: 40/100
iterations: 50/100
iterations: 60/100
iterations: 70/100
iterations: 80/100
iterations: 90/100


KeyboardInterrupt: 

In [None]:
# Evaluate the best model on the split it was selected on.
rfc = best_model
# data_split holds the arrays exactly as the model saw them during training, i.e. already
# standardized inside the training loop. They are used as they are: fitting another
# StandardScaler here would scale the data a second time.
X_train, X_test, y_train, y_test = data_split
pred_rfc = rfc.predict(X_test)

# Per-class precision/recall/F1, followed by overall accuracy, balanced accuracy and weighted F1.
print(classification_report(y_test, pred_rfc))
print(accuracy_score(y_test, pred_rfc))
print(balanced_accuracy_score(y_test, pred_rfc))
print(f1_score(y_test, pred_rfc, average='weighted'))

In [None]:
# Select the features whose importance passes SelectFromModel's default threshold for the
# already fitted forest (prefit=True: the selector does not refit the model).
selector = SelectFromModel(rfc, prefit=True)
# Label the importances with the columns the model was actually trained on. X keeps its column
# names (only the split arrays were converted by the scaler), so names, order and count are
# guaranteed to line up with rfc.feature_importances_; a separately maintained name list could
# drift out of sync and mislabel the scores.
trained_columns = list(X.columns)
f_imp_df = pd.DataFrame({"features": trained_columns, "importance_scores": rfc.feature_importances_, "support": selector.get_support()})
f_imp_df_sorted = f_imp_df.sort_values('importance_scores', ascending=False)
# Number of selected features (rows with support == True).
num_features = f_imp_df_sorted["support"].sum()
# The file name embeds the string form of the array of compared group names.
f_imp_df_sorted.to_csv("C:/Users/franz/Documents/Bachelor Studium/BachelorArbeit/JupyterLabBA/files/RandomForest/feature_importances{}.csv".format(le.inverse_transform(rfc.classes_)), index = False)

## Plot Function

In [None]:
def plot_feature_importances(scores, labels):
    """Plot, save and show a bar chart of the importance scores of the selected features.

    Parameters
    ----------
    scores : pandas.DataFrame
        Needs the columns "features" (names), "importance_scores" (bar heights) and
        "support" (boolean mask); only rows with support == True are plotted, in the
        order in which they appear in the frame.
    labels : sequence
        Names of the compared groups. Any number of labels is accepted; they are
        joined with " vs. " in the figure title.

    Notes
    -----
    Reads the notebook globals y_test and pred_rfc (accuracy shown in the title) as well as
    le and rfc (group names embedded in the name of the saved PDF).
    """
    selected = scores[scores["support"]]
    # Build the title from however many groups were compared; for two or three labels this
    # yields the same text as separate two-/three-label format strings would.
    title = "{}: Feature importance scores of {} selected features (acc. {:.3f})".format(
        " vs. ".join(str(label) for label in labels),
        len(selected),
        accuracy_score(y_test, pred_rfc))
    fig = plt.figure(figsize=(10,5))
    fig.suptitle(title)
    # Feature names are long: rotate the x tick labels so they do not overlap.
    plt.tick_params('x', labelrotation=90)
    plt.bar(selected["features"], selected["importance_scores"])
    plt.ylabel("feature importance score")
    plt.tight_layout()
    # Save before show(): showing the figure first could leave nothing to save.
    plt.savefig("C:/Users/franz/Documents/Bachelor Studium/BachelorArbeit/JupyterLabBA/files/RandomForest/plot_feature_importance_{}.pdf".format(le.inverse_transform(rfc.classes_)))
    plt.show()

In [None]:
# Row-normalized confusion matrix (each true class sums to 1); this is the figure that is saved.
plot_confusion_matrix(rfc, X_test, y_test, display_labels=le.inverse_transform(rfc.classes_), cmap=plt.cm.Blues, normalize='true')
plt.savefig("C:/Users/franz/Documents/Bachelor Studium/BachelorArbeit/JupyterLabBA/files/RandomForest/confusion_matrix_{}.png".format(le.inverse_transform(rfc.classes_)))
# Confusion matrix with raw counts. normalize must be None for this: the string 'false' is not
# an accepted value ('true', 'pred', 'all' or None) and raises a ValueError.
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; when upgrading, switch to
# ConfusionMatrixDisplay.from_estimator, which takes the same arguments.
plot_confusion_matrix(rfc, X_test, y_test, display_labels=le.inverse_transform(rfc.classes_), cmap=plt.cm.Blues, normalize=None)
# Bar chart of the selected features' importance scores for the compared groups.
plot_feature_importances(scores=f_imp_df_sorted, labels=le.inverse_transform(rfc.classes_))