123456789101112131415161718192021222324252627282930313233343536373839404142 |
"""Analyze an iris-style CSV data set from the command line.

The script takes three positional arguments:

    data            input CSV file to process
    output_figure   path the seaborn pair plot is saved to
    output_report   path the classification report (CSV) is written to

It renames the columns of the input table, saves a pair plot coloured by
the ``class`` column, trains a K-nearest-neighbours classifier on 80 % of
the rows, and writes the classification report for the remaining 20 %.
"""

import argparse

import pandas as pd
import seaborn as sns
from sklearn import model_selection
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

parser = argparse.ArgumentParser(description="Analyze iris data")
parser.add_argument("data", help="Input data (CSV) to process")
parser.add_argument("output_figure", help="Output figure path")
parser.add_argument("output_report", help="Output report path")
args = parser.parse_args()

# Prepare the data as a pandas dataframe.
# NOTE(review): read_csv infers a header row by default and the column names
# are overwritten right after; if the input file has no header line, its
# first sample is consumed as column names -- confirm the input format.
df = pd.read_csv(args.data)
attributes = ["sepal_length", "sepal_width", "petal_length", "petal_width", "class"]
df.columns = attributes

# Create a pairplot to show pairwise relationships, coloured by class.
plot = sns.pairplot(df, hue="class", palette="muted")
plot.savefig(args.output_figure)

# Perform a K-nearest-neighbours classification with scikit-learn.
# Step 1: split the data into test and training sets (20:80).
array = df.values
X = array[:, 0:4]  # the four measurement columns
Y = array[:, 4]  # the class label column
test_size = 0.20
seed = 7  # fixed random_state so the split is reproducible between runs
X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# Step 2: fit the model (default hyperparameters) and predict the test set.
knn = KNeighborsClassifier()
knn.fit(X_train, Y_train)
predictions = knn.predict(X_test)

# Step 3: save the classification report.
# to_csv() returns None when it is given a path, so its result is not bound
# to a name; the transposed frame has one row per report entry.
report = classification_report(Y_test, predictions, output_dict=True)
pd.DataFrame(report).transpose().to_csv(args.output_report)
|