# Python機器學習筆記(五)：使用Scikit-Learn進行K-Nearest Neighbors (KNN) 演算法

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Jupyter/IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline

# Load the dataset used throughout this note and preview the first rows.
df = pd.read_csv('200811-201811.csv')
df.head()

# How the Danger label is defined:
# "unhealthy for sensitive groups" means a PM2.5 value of 35.5 or above.
```

```python
from sklearn.preprocessing import StandardScaler

# Remove 'Danger' from the features; it is the target we want to predict.
features = df.drop('Danger', axis=1)

# Standardize the features. KNN is distance-based, so columns measured on
# larger numeric scales would otherwise dominate the neighbor search.
scaler = StandardScaler()
scaler.fit(features)
scaled_features = scaler.transform(features)

# Take the column labels from the frame that was actually scaled, so the
# labels stay correct even if 'Danger' is not the last column of df.
df_feat = pd.DataFrame(scaled_features, columns=features.columns)
df_feat.head()
```

```python
from sklearn.model_selection import train_test_split

# X: standardized features, y: the Danger label.
X = df_feat
y = df['Danger']

# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=101
)
```

```python
from sklearn.neighbors import KNeighborsClassifier

# Start testing from k = 1.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)

# Predicted labels for the held-out test rows.
pred = knn.predict(X_test)
```

```python
# Evaluate the predictions with a confusion matrix and a per-class report.
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))
```

# 選擇 K 值

```python
from sklearn.pipeline import Pipeline, FeatureUnion

# Chain the scaler and the KNN classifier into one estimator so the grid
# search can tune the classifier through the 'knn__' parameter prefix.
pipe = Pipeline([('scaler', scaler), ('knn', knn)])

# Candidate numbers of neighbors to try: k = 1 .. 10.
search_space = [{'knn__n_neighbors': list(range(1, 11))}]
```

```python
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated grid search over the candidate k values.
# X is the standardized feature table (df_feat) and y the Danger label,
# both defined in the train/test split step.
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(X, y)

# Best k found by the search.
clf.best_estimator_.get_params()['knn__n_neighbors']
```

```python
# One shared range keeps the loop and the x-axis of the plot in sync.
k_values = range(1, 60)

# Test-set error rate (fraction of misclassified rows) for each k.
error_rate = []
for i in k_values:
    knn = KNeighborsClassifier(n_neighbors=i)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    error_rate.append(np.mean(pred_i != y_test))

# Plot the error rate for k = 1..59. The author observed that the error
# rate starts climbing after k = 7.
plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o',
         markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
```

```python
# Baseline for comparison: refit with k = 1 and report test-set metrics.
knn = KNeighborsClassifier(n_neighbors=1)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

print('WITH K=1')
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))
```

```python
# Refit with k = 7, the value picked from the error-rate plot, and report
# the same test-set metrics for comparison with k = 1.
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
pred = knn.predict(X_test)

print('WITH K=7')
print('\n')
print(confusion_matrix(y_test, pred))
print('\n')
print(classification_report(y_test, pred))
```

```python
df.head(1)
```

資料集有 SO2、CO、O3、PM25、Nox、NO、NO2、THC、NMHC、CH4、WindSpeed、TEMP、Humidity 這幾項特徵欄位；其中，Danger 是我們要進行預測的標籤。

```python
# 0: Safe   - harmless to the general public
# 1: Danger - harmful to sensitive groups
classes = {0: 'Safe', 1: 'Danger'}

# Build one new observation in raw measurement units. The values must be in
# the same column order as the features the scaler was fitted on
# (presumably SO2, CO, O3, PM25, Nox, NO, NO2, THC, NMHC, CH4, WindSpeed,
# TEMP, Humidity -- verify against df).
x_new = [[4, 0.3, 25, 15, 22, 2.2, 20, 2.3, 0.3, 2.3, 2, 20, 60]]

# knn was trained on standardized features, so the new row has to go through
# the same fitted scaler before prediction; raw values would be compared
# against scaled training points and give meaningless distances.
x_new_scaled = scaler.transform(x_new)

y_predict = knn.predict(x_new_scaled)
print(classes[y_predict[0]])
```

```python
# Mean accuracy on the held-out test split for the most recently fitted
# knn (the k = 7 model if the cells are run in order).
knn.score(X_test, y_test)
```

Written by