Python機器學習筆記(五):使用Scikit-Learn進行K-Nearest Neighbors(KNN)演算法

引入模組與資料集

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Read the CSV that the rest of the notebook works on and preview it.
df = pd.read_csv(filepath_or_buffer='200811-201811.csv')
df.head()
# How the Danger label is defined:
# "unhealthy for sensitive groups" means a PM2.5 reading of 35.5 or above.

載入標準化比例尺(StandardScaler)套件

from sklearn.preprocessing import StandardScaler

# Remove 'Danger' from the features; it is the target we want to predict.
features = df.drop('Danger', axis=1)

# Standardise every feature column (KNN is distance based, so columns with
# large numeric ranges would otherwise dominate the neighbour search).
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

# Take the column labels from the feature frame itself. The previous
# df.columns[:-1] was only correct when 'Danger' happened to be the last
# column of the CSV; otherwise the scaled columns were silently mislabelled.
df_feat = pd.DataFrame(scaled_features, columns=features.columns)
df_feat.head()

將資料分成訓練組及測試組

from sklearn.model_selection import train_test_split

# Features are the standardised columns; the target is the 'Danger' label.
X = df_feat
y = df['Danger']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)

使用KNN演算法

from sklearn.neighbors import KNeighborsClassifier

# Start the experiment with k = 1. fit() returns the estimator itself, so the
# construction and training can be chained.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
pred = knn.predict(X_test)

測試KNN演算法的好壞

# Evaluate the predictions with a confusion matrix and a per-class report.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

matrix = confusion_matrix(y_test, pred)
print(matrix)
report = classification_report(y_test, pred)
print(report)

選擇 K 值

建立流水線與候選k值範圍

from sklearn.pipeline import Pipeline, FeatureUnion

# Chain the scaler and the classifier so both steps are refit together on
# every fold of the search.
pipe = Pipeline([('scaler', scaler), ('knn', knn)])

# Candidate neighbour counts. The 'knn__' prefix routes the parameter to the
# pipeline step named 'knn'.
search_space = [{'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

進行最佳K值搜尋

from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over the candidate k values.
# The snippet previously fitted on `X_std`, which is never defined here.
# The pipeline already contains the scaler, so it is given the unscaled
# feature columns and standardises them itself inside each fold.
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(
    df.drop('Danger', axis=1), df['Danger']
)

# Best k value found by the search.
clf.best_estimator_.get_params()['knn__n_neighbors']

或者

利用 For迴圈,選擇k值

# Misclassification rate on the test split for each candidate k.
error_rate = []

# range(1, 60) tries k = 1 .. 59 (the upper bound is exclusive).
for i in range(1, 60):
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Fraction of test rows whose prediction differs from the true label.
    error_rate.append(np.mean(pred_i != y_test))


# Plot the error rate against every k that was tried (range(1, 60), i.e.
# k = 1 .. 59). Author's observation: past k=7 the error rate climbs again.
plt.figure(figsize=(10, 6))
line_style = dict(
    color='blue',
    linestyle='dashed',
    marker='o',
    markerfacecolor='red',
    markersize=10,
)
plt.plot(range(1, 60), error_rate, **line_style)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

與 K值=1時比較

# Baseline for the comparison: retrain with k = 1 and report its metrics.
knn = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)
pred = knn.predict(X_test)

print('WITH K=1')
# Each section is preceded by a blank-line separator, as before.
for build_section in (confusion_matrix, classification_report):
    print('\n')
    print(build_section(y_test, pred))

選擇 K值=7時來進行比較

# Retrain with k = 7 (where the error-rate curve bottomed out) and report the
# same metrics so they can be compared with the k = 1 run.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

print('WITH K=7')
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))

顯示所有特徵

df.head(1)

資料集中有 SO2、CO、O3、PM25、Nox、NO、NO2、THC、NMHC、CH4、WindSpeed、TEMP、Humidity 這 13 項特徵,
其中,Danger 是我們要進行預測的標籤。

給予一筆數值,進行預測

# 0: Safe   - harmless for the general public
# 1: Danger - harmful for sensitive groups
classes = {0: 'Safe', 1: 'Danger'}

# Build one new observation, given in the original (unscaled) units and in
# the same column order as the feature columns of df.
x_new = [[4, 0.3, 25, 15, 22, 2.2, 20, 2.3, 0.3, 2.3, 2, 20, 60]]

# The classifier was trained on standardised features, so the new sample has
# to go through the same fitted scaler first; feeding raw values would measure
# distances on a different scale from the training data.
x_new_frame = pd.DataFrame(x_new, columns=df.drop('Danger', axis=1).columns)
x_new_scaled = pd.DataFrame(scaler.transform(x_new_frame), columns=X_train.columns)

y_predict = knn.predict(x_new_scaled)
print(classes[y_predict[0]])

評估KNN的準確率

# Mean accuracy of the current classifier on the held-out test split.
knn.score(X=X_test, y=y_test)

Written by

Machine Learning / Deep Learning / Python / Flutter cakeresume.com/yanwei-liu

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store