import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report


# Seed value; passed as random_state to the train/test split so it is reproducible
ID = 1586140

# Load the Wisconsin breast cancer data set
df = pd.read_csv('https://raw.githubusercontent.com/cbondnz/data_analytics_files_2023/main/breast_cancer_bd.csv')

# Drop every row that holds a '?' (missing value marker) in any column.
# .copy() yields an independent frame, so the column assignments below act on
# real data rather than a slice of the original (avoids SettingWithCopyWarning).
df = df[(df != '?').all(axis=1)].copy()

# A column that contained '?' was parsed as text and is still text after the
# filter. Convert such columns to numbers explicitly so that plotting and
# distance computations treat them as numeric features instead of relying on
# implicit coercion (or silently skipping them). Raises ValueError if a column
# holds text that is not a number.
for col in df.select_dtypes(exclude='number').columns:
    df[col] = pd.to_numeric(df[col])

# Convert class values 2 -> 0, and 4 -> 1
df['class'] = df['class'].replace({2: 0, 4: 1})


# Pairwise scatter plots of every column after the first one, coloured by class.
# Small, semi-transparent panels keep the grid readable.
pairplot_options = {
    'hue': 'class',
    'palette': 'Set2',
    'kind': 'scatter',
    'plot_kws': {'alpha': 0.3},
    'height': 1.2,
    'aspect': 1.2,
}
sns.pairplot(data=df.iloc[:, 1:], **pairplot_options)

<seaborn.axisgrid.PairGrid at 0x7af6e8d5d4b0>


# Features: every column after the ID column, except the last one.
# Target: the last column.
X, y = df.iloc[:, 1:-1], df.iloc[:, -1]

# Hold out 20% of the rows for testing (80% for training); the split is
# stratified on y and seeded with ID.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=ID,
    stratify=y,
)

# Report the shapes of the training pair, then the test pair
for features_part, target_part in ((X_train, y_train), (X_test, y_test)):
    print(features_part.shape, target_part.shape)

(546, 9) (546,)
(137, 9) (137,)


# Initialize a k-nearest-neighbours classifier with default settings and fit
# it on the training set (fit returns the fitted estimator itself)
knn = KNeighborsClassifier().fit(X_train, y_train)

# Predict the class of every row in the test set
y_pred = knn.predict(X_test)

# Side-by-side table of the true test labels and the predicted labels
compare = pd.DataFrame(
    {
        'Actual': y_test,
        'Predicted': y_pred,
    }
)
compare


# Compute both evaluation summaries for the test-set predictions
matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

# Print each heading followed by its result on the next line
print("\nConfusion Matrix:", matrix, sep="\n")
print("\nClassification Report:", report, sep="\n")

Confusion Matrix:
[[88  1]
 [ 2 46]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98        89
           1       0.98      0.96      0.97        48

    accuracy                           0.98       137
   macro avg       0.98      0.97      0.98       137
weighted avg       0.98      0.98      0.98       137


# Candidate neighbour counts: 1 to 100 inclusive
ks = list(range(1, 101))


def _knn_test_accuracy(k):
    """Return the test-set accuracy of a kNN classifier fitted with k neighbours."""
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    return accuracy_score(y_test, model.predict(X_test))


# One accuracy value per entry of ks, in the same order
acc = [_knn_test_accuracy(k) for k in ks]

# The k that produced the (first) highest accuracy, looked up by position in ks
best_position = acc.index(max(acc))
k_val = ks[best_position]
print(acc)
print(f"The highest accuracy is {max(acc)}")

[0.9708029197080292, 0.9635036496350365, 0.9854014598540146, 0.9708029197080292, 0.9781021897810219, 0.9708029197080292, 0.9781021897810219, 0.9635036496350365, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9708029197080292, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365, 0.9635036496350365]
The highest accuracy is 0.9854014598540146


# Plot accuracy against k on a dedicated figure. The pyplot-level plt.plot
# draws on whichever axes is current, so it could land on a figure left open
# by earlier plotting code; creating the figure here makes the target explicit.
fig, ax = plt.subplots()
ax.plot(ks, acc)
# Turn the grid on explicitly (the argument-less form toggles its visibility)
ax.grid(True)
ax.set_xlabel('k')
ax.set_ylabel('accs')
ax.set_title('K vs Accuracy')
plt.show()

	Actual	Predicted
334	1	1
608	1	1
488	1	1
674	0	0
468	0	0
...	...	...
48	0	0
453	1	1
195	0	0
564	0	0
133	0	0

Introduction¶

Result¶

Methodology¶

Load data¶

Wisconsin Breast Cancer Database¶

Generate pairplot¶

Split data - 80% train and 20% test¶

Train kNN classifier and predict¶

Generate the Confusion Matrix and Classification Report¶

Train kNN models using k-values ranging from 1 to 100¶

Generate Plot for K vs Accuracy¶