Scikit-Learn使用筆記

安裝

pip install scikit-learn

或者

直接安裝Anaconda

選擇演算法

Image for post
Image for post

引入模組

import pandas as pd 
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

載入數據

# Load the built-in iris dataset and pull out the feature matrix and labels
iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target

或者

# Read data with pandas. The two lines are alternatives: the second assignment
# overwrites df from the first, so keep only the one matching your file type.
df = pd.read_csv('./csv檔案位置')
df = pd.read_excel('./xlsx檔案位置')

或者

# Build a synthetic classification dataset
from sklearn.datasets import make_classification
import pandas as pd
features, output = make_classification(
    n_samples=100,    # 100 samples
    n_features=10,    # 10 features in total
    n_informative=5,  # 5 features that actually carry signal for the output
    n_redundant=5,    # 5 redundant features (linear combinations of the informative ones)
    n_classes=3,      # 3 target classes
)

# Build a synthetic regression dataset
import pandas as pd
from sklearn.datasets import make_regression
# coef=True makes make_regression also return the true coefficients
features, output, coef = make_regression(
    n_samples=100,
    n_features=3,
    n_informative=2,
    n_targets=1,
    noise=0.0,
    coef=True,
)

將資料分為訓練集和測試集

# Split the iris data into training and test sets.
# test_size=0.3 holds out 30% of the samples as the test set.
X_train, X_test, y_train, y_test = train_test_split(
iris_X, iris_y, test_size=0.3)

建立模型、訓練、預測

# Create the KNN classifier and train it in one step (fit returns the estimator)
knn = KNeighborsClassifier().fit(X_train, y_train)
# Show the predicted labels next to the true labels for comparison
print(knn.predict(X_test))
print(y_test)

引入模組


from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

載入數據

# Built-in regression dataset.
# load_boston() was removed in scikit-learn 1.2, so the bundled diabetes
# dataset is used instead (fetch_california_housing() is another option,
# but it downloads the data on first use).
loaded_data = datasets.load_diabetes()
data_X = loaded_data.data
data_y = loaded_data.target

建立模型、訓練、預測

# Fit an ordinary least-squares model on the full dataset
model = LinearRegression().fit(data_X, data_y)
# Print predictions for the first four samples, then their actual targets
print(model.predict(data_X[:4]))
print(data_y[:4])

可視化

# Generate a noisy single-feature regression dataset
X, y = datasets.make_regression(
    n_samples=100,
    n_features=1,
    n_targets=1,
    noise=10,
)
# Scatter-plot the feature against the target
plt.scatter(X, y)
plt.show()

參數和模型分數

# Print the estimator's parameters
print(model.get_params())
# Print the R^2 score of the model on the data it was fitted on
print(model.score(data_X, data_y))

平均絕對誤差(MAE)、均方誤差(MSE)、均方根誤差(RMSE)

import numpy as np
from sklearn import metrics

# NOTE(review): y_pred is not defined in this snippet; it is assumed to hold the
# model's predictions for X_test (e.g. y_pred = model.predict(X_test)) -- confirm.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# RMSE is the square root of the MSE
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

分類報告與混淆矩陣

# Print the confusion matrix and the per-class classification report.
# NOTE(review): pred is not defined in this snippet; presumably it holds the
# classifier's predictions for X_test -- confirm against the caller.
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))
Confusion Matrix 預測 (No) 預測(Yes)
實際(No) TN=549 FP=11
實際(Yes) FN=20 TP=506
#混淆矩陣說明
TN=True Negative
FP=False Positive
FN=False Negative
TP=True Positive
precision = TP / (TP + FP)
recall = TP / (TP + FN)
accuracy = (TP + TN) / (TN + FP + FN + TP)
F-beta score可視為precision和recall的加權調和平均,數值介於0-1,最好是1。
support代表在測試組y實際值的發生次數。

標準化

# Standardizing the features can improve model performance
from sklearn import preprocessing  # data standardization module
import numpy as np
from sklearn.model_selection import train_test_split
# sklearn.datasets.samples_generator no longer exists; import from sklearn.datasets
from sklearn.datasets import make_classification
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# The original snippet never defined X and y. Generate a two-feature
# classification dataset with a large feature scale so that the effect of
# standardization is visible.
X, y = make_classification(
    n_samples=300,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=22,
    n_clusters_per_class=1,
    scale=100,
)

# Before standardization
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = SVC()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# After standardization (zero mean, unit variance per feature)
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = SVC()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

基礎驗證

from sklearn.datasets import load_iris # iris數據集
from sklearn.model_selection import train_test_split # 分割數據集
from sklearn.neighbors import KNeighborsClassifier # kNN演算法

# Load the iris data: feature matrix and class labels
iris = load_iris()
X, y = iris.data, iris.target

# Split into training and test sets (random_state fixes the shuffle)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

# Build the KNN model and train it on the training split
knn = KNeighborsClassifier().fit(X_train, y_train)

# Accuracy on the held-out test split
print(knn.score(X_test, y_test))
# 0.973684210526

交叉驗證

from sklearn.model_selection import cross_val_score # K折交叉驗證模組

# Run 5-fold cross-validation on the KNN model, scored by accuracy
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')

# Accuracy of each of the 5 folds
print(scores)
# [ 0.96666667 1. 0.93333333 0.96666667 1. ]

# Mean accuracy over the 5 folds
print(scores.mean())
# 0.973333333333

以 Accuracy判斷

# Accuracy is the usual metric for judging a classification model.
# With KNN, a very large K averages over too many neighbours and the model
# underfits, which is why accuracy drops at the high end of the range
# (a very small K is the setting that tends to overfit).
import matplotlib.pyplot as plt  # plotting module

# Candidate values of K to evaluate
k_range = range(1, 31)

k_scores = []

# For each K, run 10-fold cross-validation and record the mean accuracy
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())

# Plot mean accuracy against K
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()

以 Mean squared error 判斷

# Mean squared error (MSE) is the usual metric for judging a regression model.
# NOTE(review): this example still scores a KNeighborsClassifier, so the MSE is
# computed on class labels and is only illustrative; for a real regression task
# use a regressor (e.g. KNeighborsRegressor) -- confirm intent.
import matplotlib.pyplot as plt
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # The scorer returns negated MSE (higher is better), so flip the sign
    loss = -cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')
    k_scores.append(loss.mean())

plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated MSE')
plt.show()

Learning curve 檢視過擬合

# sklearn.learning_curve was removed; learning_curve lives in sklearn.model_selection
from sklearn.model_selection import learning_curve  # learning-curve module
from sklearn.datasets import load_digits  # digits dataset
from sklearn.svm import SVC  # support vector classifier
import matplotlib.pyplot as plt  # plotting module
import numpy as np

digits = load_digits()
X = digits.data
y = digits.target

# Evaluate the model at 10%, 25%, 50%, 75% and 100% of the training data,
# using 10-fold cross-validation at each size
train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.001), X, y, cv=10, scoring='neg_mean_squared_error',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])

# Average the per-fold scores for each training size; the scorer returns
# negated MSE, so flip the sign to get a loss
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# Plot training loss against cross-validation loss
plt.plot(train_sizes, train_loss_mean, 'o-', color="r",
         label="Training")
plt.plot(train_sizes, test_loss_mean, 'o-', color="g",
         label="Cross-validation")
plt.xlabel("Training examples")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()

Validation curve 檢視過擬合

# validation_curve shows whether changing a model parameter leads to overfitting.
# sklearn.learning_curve was removed; validation_curve lives in sklearn.model_selection
from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# Digits dataset
digits = load_digits()
X = digits.data
y = digits.target

# Candidate gamma values to test (5 points, log-spaced from 1e-6 to 10**-2.3)
param_range = np.logspace(-6, -2.3, 5)

# Use validation_curve to see how gamma affects the model.
# The scorer name must be 'neg_mean_squared_error'; 'mean_squared_error'
# is not a valid scoring string.
train_loss, test_loss = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range, cv=10,
    scoring='neg_mean_squared_error')

# Average the per-fold scores for each gamma; negate to turn them into a loss
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# Plot training loss against cross-validation loss
plt.plot(param_range, train_loss_mean, 'o-', color="r",
         label="Training")
plt.plot(param_range, test_loss_mean, 'o-', color="g",
         label="Cross-validation")

plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()

儲存訓練完的模型

# Save the trained model with joblib
import os

# sklearn.externals.joblib was removed in scikit-learn 0.23; use the joblib package
import joblib
from sklearn import svm
from sklearn import datasets

clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)

# Create the target folder first; dumping into a missing folder raises an error
os.makedirs('save', exist_ok=True)
joblib.dump(clf, 'save/clf.pkl')

載入訓練好的模型

# sklearn.externals.joblib was removed in scikit-learn 0.23; use the joblib package
import joblib

# Load the saved model.
# NOTE: joblib.load unpickles the file, so only load files you trust.
clf3 = joblib.load('save/clf.pkl')

# Check the reloaded model by predicting the first sample
clf3.predict(X[0:1])

參考資料

Written by

Machine Learning / Deep Learning / Python / Flutter cakeresume.com/yanwei-liu

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store