Python Machine Learning Notes (7): Comparing the Accuracy of Various Algorithms with Scikit-Learn
21 min read · Mar 28, 2019
The following algorithms are compared:
Linear Regression, Neural Network (MLP), Lasso, ElasticNet, Random Forest, Extra Trees, Decision Tree, Boosted Decision Tree (AdaBoost), XGBoost
Import the modules
import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
pd.set_option("display.max_rows", 1000) #設定最大能顯示1000rows
pd.set_option("display.max_columns", 1000) #設定最大能顯示1000columns
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# 指定默認字形:解決plot不能顯示中文問題
mpl.rcParams['axes.unicode_minus'] = False
Load the data
df = pd.read_excel('KH-1982-2018.xlsx')
df.head()
Show the column labels
df.columns
Convert the data types
df["SO2"] = pd.to_numeric(df.SO2, errors='coerce')
df["CO"] = pd.to_numeric(df.CO, errors='coerce')
df["CO2"] = pd.to_numeric(df.CO2, errors='coerce')
df["O3"] = pd.to_numeric(df.O3, errors='coerce')
df["PM25"] = pd.to_numeric(df.PM25, errors='coerce')
df["Nox"] = pd.to_numeric(df.Nox, errors='coerce')
df["NO"] = pd.to_numeric(df.NO, errors='coerce')
df["NO2"] = pd.to_numeric(df.NO2, errors='coerce')
df["THC"] = pd.to_numeric(df.THC, errors='coerce')
df["NMHC"] = pd.to_numeric(df.NMHC, errors='coerce')
df["CH4"] = pd.to_numeric(df.CH4, errors='coerce')
df["WindSpeed"] = pd.to_numeric(df.WindSpeed, errors='coerce')
df["TEMP"] = pd.to_numeric(df.TEMP, errors='coerce')
df["Humidity"] = pd.to_numeric(df.Humidity, errors='coerce')
Check the data types
df.dtypes
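Note that errors='coerce' replaces any value that cannot be parsed with NaN, and the scikit-learn estimators used below will raise an error on missing values. The original notebook does not show this step; a minimal cleanup sketch, assuming it is acceptable to simply drop incomplete rows:
# 'coerce' leaves NaN where parsing failed; drop those rows before modeling
# (assumed cleanup step, not shown in the original notebook)
df = df.dropna()
df.shape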
Set the plotting style
import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = [16,9]
Remove PM2.5 from the inputs (it is the value we want to predict, so it must not appear among the features)
X = df[['SO2', 'CO', 'O3', 'Nox', 'NO', 'NO2', 'THC', 'NMHC', 'CH4', 'WindSpeed', 'TEMP', 'Humidity']]
y = df['PM25']
Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=12)
Check the number of samples in the training and test sets
X_train.shape, y_train.shape
X_test.shape, y_test.shape
The feature columns used for training
X.columns
Linear Regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Create the linear regression object and train it
regr = LinearRegression()
regr.fit(X_train, y_train)
# Make predictions on the test set and score the model
lin_pred = regr.predict(X_test)
linear_regression_score = regr.score(X_test, y_test)
Evaluate the prediction accuracy
from math import sqrt
# The coefficients
print('Coefficients: \n', regr.coef_)
# The root mean squared error
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, lin_pred)))
# The mean absolute error
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, lin_pred))
# Explained variance score: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, lin_pred))
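For reference, the R-squared reported by score and r2_score is the coefficient of determination, the fraction of variance in the target that the model explains:

$$R^2 = 1 - \frac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$$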
Neural Network
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Create the MLPRegressor object and train it
mlp = MLPRegressor()
mlp.fit(X_train, y_train)
# Score the model
neural_network_regression_score = mlp.score(X_test, y_test)
neural_network_regression_score
# Make predictions using the testing set
nnr_pred = mlp.predict(X_test)
Evaluate the prediction accuracy
# The root mean squared error
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, nnr_pred)))
# The mean absolute error
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, nnr_pred))
# Explained variance score: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, nnr_pred))
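The MLP above is trained on raw feature values, but multi-layer perceptrons are sensitive to feature scale, so standardizing the inputs usually helps convergence. A minimal sketch using a Pipeline (an illustrative addition, not part of the original notebook):
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Standardize features before the MLP; raise max_iter so the solver can converge
scaled_mlp = make_pipeline(StandardScaler(), MLPRegressor(max_iter=500))
scaled_mlp.fit(X_train, y_train)
scaled_mlp.score(X_test, y_test)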
Lasso
from sklearn.linear_model import Lasso
lasso = Lasso()
lasso.fit(X_train, y_train)
# Score the model
lasso_score = lasso.score(X_test, y_test)
lasso_score
# Make predictions using the testing set
lasso_pred = lasso.predict(X_test)
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, lasso_pred)))
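Lasso() above uses the default alpha=1.0. The regularization strength strongly affects the fit; a sketch of choosing alpha by cross-validation with LassoCV (an addition for illustration):
from sklearn.linear_model import LassoCV
# 5-fold cross-validation over a grid of alpha values
lasso_cv = LassoCV(cv=5, random_state=12)
lasso_cv.fit(X_train, y_train)
lasso_cv.alpha_, lasso_cv.score(X_test, y_test)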
ElasticNet
from sklearn.linear_model import ElasticNet
elasticnet = ElasticNet()
elasticnet.fit(X_train, y_train)
# Score the model
elasticnet_score = elasticnet.score(X_test, y_test)
elasticnet_score
# Make predictions using the testing set
elasticnet_pred = elasticnet.predict(X_test)
# The root mean squared error
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, elasticnet_pred)))
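ElasticNet mixes the L1 and L2 penalties through l1_ratio (0.5 by default). Both alpha and l1_ratio can be tuned together with ElasticNetCV; a sketch (again an illustrative addition):
from sklearn.linear_model import ElasticNetCV
# Let cross-validation pick alpha for each candidate L1/L2 mix
en_cv = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, 1], cv=5, random_state=12)
en_cv.fit(X_train, y_train)
en_cv.alpha_, en_cv.l1_ratio_, en_cv.score(X_test, y_test)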
Random Forest
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Create the Random Forest Regressor object
regr_rf = RandomForestRegressor(n_estimators=200, random_state=1234)
# Train the model using the training sets
regr_rf.fit(X_train, y_train)
# Score the model
decision_forest_score = regr_rf.score(X_test, y_test)
decision_forest_score
# Make predictions using the testing set
regr_rf_pred = regr_rf.predict(X_test)
print("Root mean squared error: %.2f"
% sqrt(mean_squared_error(y_test, regr_rf_pred)))
# The absolute squared error
print("Mean absolute error: %.2f"
% mean_absolute_error(y_test, regr_rf_pred))
# Explained variance score: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, regr_rf_pred))X.columnsfeatures = X.columns
importances = regr_rf.feature_importances_
indices = np.argsort(importances)
# Plot the feature importances
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
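Impurity-based feature_importances_ can be biased toward features with many distinct values. On scikit-learn 0.22 or newer, permutation importance computed on the test set is a useful cross-check; a sketch (not in the original notebook):
from sklearn.inspection import permutation_importance
# Shuffle each feature in turn and measure the drop in test-set score
perm = permutation_importance(regr_rf, X_test, y_test, n_repeats=10, random_state=1234)
for name, imp in sorted(zip(X.columns, perm.importances_mean), key=lambda t: t[1]):
    print("%-10s %.3f" % (name, imp))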
Extra Trees
from sklearn.ensemble import ExtraTreesRegressor
extra_tree = ExtraTreesRegressor(n_estimators=200, random_state=1234)
extra_tree.fit(X_train, y_train)
# Score the model
extratree_score = extra_tree.score(X_test, y_test)
extratree_score
# Make predictions using the testing set
extratree_pred = extra_tree.predict(X_test)
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, extratree_pred)))
features = X.columns
importances = extra_tree.feature_importances_
indices = np.argsort(importances)
# Plot the feature importances
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
Decision Tree & Boosted Tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# Create a plain Decision Tree Regressor and an AdaBoost-boosted version
tree_1 = DecisionTreeRegressor()
tree_2 = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=200, learning_rate=.1)
# Train the models using the training sets
tree_1.fit(X_train, y_train)
tree_2.fit(X_train, y_train)
# Score the decision tree model
tree_1.score(X_test, y_test)
# Score the boosted decision tree model
boosted_tree_score = tree_2.score(X_test, y_test)
boosted_tree_score
# Make predictions using the testing set
tree_1_pred = tree_1.predict(X_test)
tree_2_pred = tree_2.predict(X_test)
# The root mean squared error
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, tree_2_pred)))
# The mean absolute error
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, tree_2_pred))
# Explained variance score: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, tree_2_pred))
features = X.columns
importances = tree_2.feature_importances_
indices = np.argsort(importances)
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
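tree_1's predictions are computed above but never evaluated; one extra line (an addition) makes the comparison between the plain tree and the boosted one explicit:
# RMSE of the single unboosted tree, for comparison with tree_2
print("Decision tree RMSE: %.2f"
      % sqrt(mean_squared_error(y_test, tree_1_pred)))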
XGBoost
from xgboost.sklearn import XGBRegressor
# Fit the XGB regressor
xboost = XGBRegressor(n_estimators=200)
xboost.fit(X_train, y_train)
# Score the model
xgb_score = xboost.score(X_test, y_test)
xgb_score
# Predict on the test set
xboost_pred = xboost.predict(X_test)
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, xboost_pred)))
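XGBoost can also stop adding trees once the held-out error stops improving. A sketch using the eval_set and early_stopping_rounds arguments of the scikit-learn wrapper (these existed in xgboost releases of this era; newer versions moved early stopping into the constructor, and a proper setup would use a validation split rather than reusing the test set):
# Stop when held-out RMSE has not improved for 20 rounds
xboost_es = XGBRegressor(n_estimators=1000)
xboost_es.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
              early_stopping_rounds=20,
              verbose=False)
xboost_es.score(X_test, y_test)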
Summary
print("Scores:")
print("Linear regression score: ", linear_regression_score)
print("Neural network regression score: ", neural_network_regression_score)
print("Lasso regression score: ", lasso_score)
print("ElasticNet regression score: ", elasticnet_score)
print("Decision forest score: ", decision_forest_score)
print("Extra Trees score: ", extratree_score)
print("Boosted decision tree score: ", boosted_tree_score)
print("XGBoost score:", xgb_score)
print("\n")
print("RMSE:")
print("Linear regression RMSE: %.2f"
% sqrt(mean_squared_error(y_test, lin_pred)))
print("Neural network RMSE: %.2f"
% sqrt(mean_squared_error(y_test, nnr_pred)))
print("Lasso RMSE: %.2f"
% sqrt(mean_squared_error(y_test, lasso_pred)))
print("ElasticNet RMSE: %.2f"
% sqrt(mean_squared_error(y_test, elasticnet_pred)))
print("Decision forest RMSE: %.2f"
% sqrt(mean_squared_error(y_test, regr_rf_pred)))
print("Extra Trees RMSE: %.2f"
% sqrt(mean_squared_error(y_test, extratree_pred)))
print("Boosted decision tree RMSE: %.2f"
% sqrt(mean_squared_error(y_test, tree_2_pred)))
print("XGBoost RMSE: %.2f"
% sqrt(mean_squared_error(y_test, xboost_pred)))
Scores:
Linear regression score: 0.9101721045818417
Neural network regression score: 0.8532397997401182
Lasso regression score: 0.8873130463758103
ElasticNet regression score: 0.888862421794889
Decision forest score: 0.9023960871039343
Extra Trees score: 0.9167571974098775
Boosted decision tree score: 0.892229896203619
XGBoost score: 0.8902505097601974
RMSE:
Linear regression RMSE: 3.74
Neural network RMSE: 4.78
Lasso RMSE: 4.19
ElasticNet RMSE: 4.16
Decision forest RMSE: 3.90
Extra Trees RMSE: 3.60
Boosted decision tree RMSE: 4.09
XGBoost RMSE: 4.13
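Extra Trees gives the best R-squared (0.917) and the lowest RMSE (3.60) here, with plain linear regression a close second. As a wrap-up, a small optional sketch that gathers the results into a DataFrame so the models can be sorted side by side:
results = pd.DataFrame({
    'model': ['Linear regression', 'Neural network', 'Lasso', 'ElasticNet',
              'Random Forest', 'Extra Trees', 'Boosted decision tree', 'XGBoost'],
    'R2': [linear_regression_score, neural_network_regression_score, lasso_score,
           elasticnet_score, decision_forest_score, extratree_score,
           boosted_tree_score, xgb_score],
    'RMSE': [sqrt(mean_squared_error(y_test, p)) for p in
             [lin_pred, nnr_pred, lasso_pred, elasticnet_pred,
              regr_rf_pred, extratree_pred, tree_2_pred, xboost_pred]],
})
results.sort_values('R2', ascending=False)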