Scikit-Learn使用筆記
14 min read · Mar 30, 2019
2021/06/20更新溫州大學機器學習課程教材
https://github.com/fengdu78/WZU-machine-learning-course/blob/main/ppt/Scikit-learn.pdf
安裝
pip install scikit-learn
或者
直接安裝Anaconda
選擇演算法
引入模組
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
載入數據
# Built-in dataset: the classic iris flower dataset
iris = datasets.load_iris()
iris_X = iris.data    # feature matrix
iris_y = iris.target  # class labels
或者
# Read data with pandas. Note: the two calls are alternatives shown
# together — the second assignment overwrites df from the first.
df = pd.read_csv('./csv檔案位置')
df = pd.read_excel('./xlsx檔案位置')
或者
# Generate a synthetic classification dataset.
# (The original extraction fused several lines and never closed the
# make_classification call; reconstructed here.)
from sklearn.datasets import make_classification
import pandas as pd

features, output = make_classification(
    n_samples=100,      # 100 samples
    n_features=10,      # 10 features in total
    n_informative=5,    # 5 features that actually predict the output
    n_redundant=5,      # 5 redundant (linear combinations / noise) features
    n_classes=3,        # 3 target classes
)

# Generate a synthetic regression dataset
import pandas as pd
from sklearn.datasets import make_regression

features, output, coef = make_regression(
    n_samples=100,
    n_features=3,
    n_informative=2,
    n_targets=1,
    noise=0.0,
    coef=True,          # also return the underlying true coefficients
)
將資料分為訓練集和測試集
# Split the data into a training set and a test set
X_train, X_test, y_train, y_test = train_test_split(
    iris_X, iris_y, test_size=0.3)  # test_size=0.3: test set is 30% of the data
建立模型、訓練、預測
# Build the model, train it, and predict
knn = KNeighborsClassifier()   # k-nearest-neighbours classifier
knn.fit(X_train, y_train)      # train on the training split
print(knn.predict(X_test))     # predicted labels for the test split
print(y_test)                  # ground-truth labels, for comparison
引入模組
from sklearn import datasets
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
載入數據
# Built-in dataset: Boston housing prices
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions use a different dataset such as
# fetch_california_housing — confirm against the installed version.
loaded_data = datasets.load_boston()
data_X = loaded_data.data    # feature matrix
data_y = loaded_data.target  # regression targets (house prices)
建立模型、訓練、預測
# Build the model and train it
model = LinearRegression()
model.fit(data_X, data_y)  # below: show predictions vs. actual values
# Predictions for the first four samples, followed by the true targets
print(model.predict(data_X[:4, :]))
print(data_y[:4])
可視化
# Create synthetic regression data: 100 samples, 1 feature, 1 target, noise=10
X, y = datasets.make_regression(n_samples=100, n_features=1, n_targets=1, noise=10)
# Plot the data as a scatter chart
plt.scatter(X, y)
plt.show()
參數和模型分數
# Retrieve the model's hyper-parameters
print(model.get_params())
# R^2 (coefficient of determination) score on the training data
print(model.score(data_X, data_y))
平均絕對誤差(MAE)、均方誤差(MSE)、均方根誤差(RMSE)
# Regression error metrics: MAE, MSE, RMSE.
# (The original extraction fused the import onto the first print line,
# which is invalid Python; split here.)
from sklearn import metrics

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
# RMSE = sqrt(MSE)
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
分類報告與混淆矩陣
# Confusion matrix and per-class precision/recall/F1 report
from sklearn.metrics import classification_report,confusion_matrix
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

Confusion Matrix 預測(No) 預測(Yes)
實際(No) TN=549 FP=11
實際(Yes) FN=20 TP=506

#混淆矩陣說明
TN=True Negative
FP=False Positive
FN=False Negative
TP=True Positive

precision = TP / (TP + FP)
recall = TP / (TP + FN)
accuracy = (TP + TN) / (TN + FP + FN + TP)
F-beta score可視為precision和recall加權平均的值,數值介於0-1,最好是1。
support代表在測試組y實際值的發生次數。
標準化
# Standardisation (feature scaling) usually improves model performance.
from sklearn import preprocessing  # scaling utilities
import numpy as np
from sklearn.model_selection import train_test_split
# NOTE: sklearn.datasets.samples_generator was removed in scikit-learn 0.24;
# import generators directly from sklearn.datasets instead.
from sklearn.datasets import make_classification
from sklearn.svm import SVC
import matplotlib.pyplot as plt

# NOTE(review): X and y are assumed to be created beforehand (presumably via
# make_classification) — the original snippet does not show their creation.

# Before standardisation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = SVC()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# After standardisation (zero mean, unit variance per feature)
X = preprocessing.scale(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
clf = SVC()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))
基礎驗證
from sklearn.datasets import load_iris                # iris dataset
from sklearn.model_selection import train_test_split  # dataset splitting
from sklearn.neighbors import KNeighborsClassifier    # kNN algorithm
# Load the data
iris = load_iris()
X = iris.data
y = iris.target
# Split into training and test sets (fixed random_state for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)
# Build the model
knn = KNeighborsClassifier()
# Train the model
knn.fit(X_train, y_train)
# Accuracy on the held-out test set
print(knn.score(X_test, y_test))
# 0.973684210526
交叉驗證
from sklearn.model_selection import cross_val_score  # k-fold cross-validation
# Run 5-fold cross-validation scored by accuracy
scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
# Accuracy of each of the 5 folds
print(scores)
# [ 0.96666667 1. 0.93333333 0.96666667 1. ]
# Mean accuracy over the 5 folds
print(scores.mean())
# 0.973333333333
以 Accuracy判斷
# Accuracy is the usual metric for judging classification models.
# The original comment says accuracy drops when k is too large because of
# over-fitting; NOTE(review): a large k in kNN typically under-fits — confirm.
# (The original extraction fused this comment onto the matplotlib import and
# stripped the loop body's indentation; both repaired here.)
import matplotlib.pyplot as plt  # visualisation

# Candidate values for k
k_range = range(1, 31)
k_scores = []
# For each k, record the mean 10-fold cross-validated accuracy
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=10, scoring='accuracy')
    k_scores.append(scores.mean())
# Plot accuracy as a function of k
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.show()
以 Mean squared error 判斷
# Mean squared error is the usual metric for judging regression models.
# (The original extraction stripped the loop body's indentation; repaired.)
import matplotlib.pyplot as plt

k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    # scoring='neg_mean_squared_error' returns negated MSE; negate it back
    loss = -cross_val_score(knn, X, y, cv=10, scoring='neg_mean_squared_error')
    k_scores.append(loss.mean())
plt.plot(k_range, k_scores)
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated MSE')
plt.show()
Learning curve 檢視過擬合
# NOTE: the sklearn.learning_curve module was removed in scikit-learn 0.20;
# learning_curve now lives in sklearn.model_selection.
from sklearn.model_selection import learning_curve  # learning-curve utility
from sklearn.datasets import load_digits            # digits dataset
from sklearn.svm import SVC                         # support-vector classifier
import matplotlib.pyplot as plt                     # visualisation
import numpy as np

digits = load_digits()
X = digits.data
y = digits.target

# Record train/validation loss at 10%, 25%, 50%, 75% and 100% of the data
train_sizes, train_loss, test_loss = learning_curve(
    SVC(gamma=0.001), X, y, cv=10, scoring='neg_mean_squared_error',
    train_sizes=[0.1, 0.25, 0.5, 0.75, 1])

# Mean MSE per training size (negate the neg_mean_squared_error scores)
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# Plot training loss against cross-validation loss
plt.plot(train_sizes, train_loss_mean, 'o-', color="r",
         label="Training")
plt.plot(train_sizes, test_loss_mean, 'o-', color="g",
         label="Cross-validation")
plt.xlabel("Training examples")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()
Validation curve 檢視過擬合
validation_curve 可看出改變模型參數的時候有沒有過擬合(overfitting)的問題

from sklearn.model_selection import validation_curve
from sklearn.datasets import load_digits
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import numpy as np

# digits dataset
digits = load_digits()
X = digits.data
y = digits.target

# Candidate gamma values, log-spaced between 1e-6 and 10**-2.3
param_range = np.logspace(-6, -2.3, 5)

# Use validation_curve to see how the parameter affects the model.
# NOTE: scoring='mean_squared_error' was removed in scikit-learn 0.20;
# 'neg_mean_squared_error' is the supported name, and it also makes the
# sign flips below yield positive losses.
train_loss, test_loss = validation_curve(
    SVC(), X, y, param_name='gamma', param_range=param_range,
    cv=10, scoring='neg_mean_squared_error')

# Mean MSE per gamma value
train_loss_mean = -np.mean(train_loss, axis=1)
test_loss_mean = -np.mean(test_loss, axis=1)

# Plot training loss against cross-validation loss
plt.plot(param_range, train_loss_mean, 'o-', color="r",
         label="Training")
plt.plot(param_range, test_loss_mean, 'o-', color="g",
         label="Cross-validation")
plt.xlabel("gamma")
plt.ylabel("Loss")
plt.legend(loc="best")
plt.show()
儲存訓練完的模型
使用 joblib 保存模型

from sklearn import svm
from sklearn import datasets

# Train a small SVM on the iris data so there is a model to save
clf = svm.SVC()
iris = datasets.load_iris()
X, y = iris.data, iris.target
clf.fit(X, y)

# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23;
# install and import the standalone joblib package instead.
import joblib

# Save the model ('save' folder must already exist, otherwise this errors)
joblib.dump(clf, 'save/clf.pkl')
載入訓練好的模型
# NOTE: sklearn.externals.joblib was removed in scikit-learn 0.23;
# use the standalone joblib package instead.
import joblib

# Read the saved model back
clf3 = joblib.load('save/clf.pkl')
# Sanity-check the loaded model on the first sample
clf3.predict(X[0:1])
參考資料