Python Machine Learning Notes (7): Comparing the Accuracy of Various Algorithms with Scikit-Learn

Yanwei Liu
21 min read · Mar 28, 2019


The following algorithms are compared:

Linear Regression, Neural Network, Lasso, ElasticNet, Random Forest, Extra Trees, Decision Tree, Boosted Decision Tree, XGBoost

Importing the modules

import pandas as pd
import numpy as np
import seaborn as sns
%matplotlib inline
pd.set_option("display.max_rows", 1000) #設定最大能顯示1000rows
pd.set_option("display.max_columns", 1000) #設定最大能顯示1000columns
from pylab import mpl
mpl.rcParams['font.sans-serif'] = ['Microsoft YaHei']
# 指定默認字形:解決plot不能顯示中文問題
mpl.rcParams['axes.unicode_minus'] = False

Loading the data

df = pd.read_excel('KH-1982-2018.xlsx')
df.head()

Listing the column labels

df.columns

Converting data types

df["SO2"] = pd.to_numeric(df.SO2, errors='coerce')
df["CO"] = pd.to_numeric(df.CO, errors='coerce')
df["CO2"] = pd.to_numeric(df.CO2, errors='coerce')
df["O3"] = pd.to_numeric(df.O3, errors='coerce')
df["PM25"] = pd.to_numeric(df.PM25, errors='coerce')
df["Nox"] = pd.to_numeric(df.Nox, errors='coerce')
df["NO"] = pd.to_numeric(df.NO, errors='coerce')
df["NO2"] = pd.to_numeric(df.NO2, errors='coerce')
df["THC"] = pd.to_numeric(df.THC, errors='coerce')
df["NMHC"] = pd.to_numeric(df.NMHC, errors='coerce')
df["CH4"] = pd.to_numeric(df.CH4, errors='coerce')
df["WindSpeed"] = pd.to_numeric(df.WindSpeed, errors='coerce')
df["TEMP"] = pd.to_numeric(df.TEMP, errors='coerce')
df["Humidity"] = pd.to_numeric(df.Humidity, errors='coerce')

Checking the data types

df.dtypes
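Since errors='coerce' turns any unparseable entry into NaN, and most of the regressors below cannot handle missing values, it is worth checking how much was lost. A minimal sketch (dropping incomplete rows here; imputation is an alternative):

# Count NaN values introduced by the coercion above
print(df.isna().sum())
# Simplest option: drop incomplete rows
df = df.dropna()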

Setting the plot style

import matplotlib.pyplot as plt
plt.style.use('ggplot')
plt.rcParams["figure.figsize"] = [16,9]

Separating out PM2.5 (it is the value to predict, so it must be removed from the features)

X = df[['SO2', 'CO', 'O3', 'Nox', 'NO', 'NO2', 'THC', 'NMHC', 'CH4', 'WindSpeed', 'TEMP', 'Humidity']]
y = df['PM25']

Splitting into training and test sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=12)

Checking the number of samples in the training and test sets

X_train.shape, y_train.shape
X_test.shape, y_test.shape

The feature columns used for training

X.columns

Linear Regression

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Create linear regression object
regr = LinearRegression()
# Train the model using the training sets
regr.fit(X_train, y_train)
# Make predictions using the testing set
lin_pred = regr.predict(X_test)
# Score the model
linear_regression_score = regr.score(X_test, y_test)

Evaluating prediction accuracy

from math import sqrt
# The coefficients
print('Coefficients: \n', regr.coef_)
# The root mean squared error
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, lin_pred)))
# The mean absolute error
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, lin_pred))
# Explained variance score: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, lin_pred))
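A single 70/30 split makes these numbers somewhat noisy; k-fold cross-validation averages over several splits and gives a more stable estimate. A minimal sketch using the same estimator:

from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R-squared for the linear model
cv_scores = cross_val_score(LinearRegression(), X, y, cv=5, scoring='r2')
print("CV R-squared: %.2f (+/- %.2f)" % (cv_scores.mean(), cv_scores.std()))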

Neural Network

from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Create MLPRegressor object
mlp = MLPRegressor()
mlp.fit(X_train, y_train)
# Score the model
neural_network_regression_score = mlp.score(X_test, y_test)
neural_network_regression_score
# Make predictions using the testing set
nnr_pred = mlp.predict(X_test)

Evaluating prediction accuracy

# The root mean squared error
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, nnr_pred)))
# The mean absolute error
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, nnr_pred))
# Explained variance score: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, nnr_pred))
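MLPRegressor is sensitive to the scale of its inputs, which is one likely reason it trails the linear model here. Wrapping it in a Pipeline with StandardScaler usually helps; a sketch (max_iter=500 is an illustrative choice, not from the original):

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features before they reach the network
mlp_scaled = make_pipeline(StandardScaler(), MLPRegressor(max_iter=500))
mlp_scaled.fit(X_train, y_train)
print("Scaled MLP score:", mlp_scaled.score(X_test, y_test))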

Lasso

from sklearn.linear_model import Lasso

lasso = Lasso()
lasso.fit(X_train, y_train)
# Score the model
lasso_score = lasso.score(X_test, y_test)
lasso_score
# Make predictions using the testing set
lasso_pred = lasso.predict(X_test)
print("Root mean squared error: %.2f"
% sqrt(mean_squared_error(y_test, lasso_pred)))
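Lasso is fit here with its default alpha=1.0. LassoCV chooses alpha by cross-validation instead, which is usually a better starting point; a minimal sketch:

from sklearn.linear_model import LassoCV

# Pick alpha by 5-fold cross-validation over an automatic grid
lasso_cv = LassoCV(cv=5)
lasso_cv.fit(X_train, y_train)
print("Best alpha:", lasso_cv.alpha_)
print("LassoCV score:", lasso_cv.score(X_test, y_test))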

ElasticNet

from sklearn.linear_model import ElasticNet

elasticnet = ElasticNet()
elasticnet.fit(X_train, y_train)
elasticnet_score = elasticnet.score(X_test, y_test)
elasticnet_score
elasticnet_pred = elasticnet.predict(X_test)
# The root mean squared error
print("Root mean squared error: %.2f"
% sqrt(mean_squared_error(y_test, elasticnet_pred)))
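The same applies to ElasticNet, which additionally mixes L1 and L2 penalties through l1_ratio; ElasticNetCV can search both at once. A sketch (the l1_ratio grid is illustrative):

from sklearn.linear_model import ElasticNetCV

# Search alpha and the L1/L2 mix jointly
enet_cv = ElasticNetCV(l1_ratio=[.1, .5, .9], cv=5)
enet_cv.fit(X_train, y_train)
print("Best alpha:", enet_cv.alpha_, "best l1_ratio:", enet_cv.l1_ratio_)
print("ElasticNetCV score:", enet_cv.score(X_test, y_test))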

Random Forest

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


# Create Random Forest Regressor object
regr_rf = RandomForestRegressor(n_estimators=200, random_state=1234)
# Train the model using the training sets
regr_rf.fit(X_train, y_train)
# Score the model
decision_forest_score = regr_rf.score(X_test, y_test)
decision_forest_score
# Make predictions using the testing set
regr_rf_pred = regr_rf.predict(X_test)
# The root mean squared error
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, regr_rf_pred)))
# The mean absolute error
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, regr_rf_pred))
# Explained variance score: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, regr_rf_pred))
features = X.columns
importances = regr_rf.feature_importances_
indices = np.argsort(importances)

# Plot feature importances
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
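Impurity-based importances like feature_importances_ can overstate correlated features. Permutation importance on the test set is a common cross-check; a sketch (permutation_importance requires scikit-learn 0.22 or newer):

from sklearn.inspection import permutation_importance

# Shuffle each feature in turn and measure the drop in test score
perm = permutation_importance(regr_rf, X_test, y_test, n_repeats=10, random_state=1234)
for i in perm.importances_mean.argsort()[::-1]:
    print("%s: %.3f" % (X.columns[i], perm.importances_mean[i]))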

Extra Trees

from sklearn.ensemble import ExtraTreesRegressor

extra_tree = ExtraTreesRegressor(n_estimators=200, random_state=1234)
extra_tree.fit(X_train, y_train)
extratree_score = extra_tree.score(X_test, y_test)
extratree_score
extratree_pred = extra_tree.predict(X_test)
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, extratree_pred)))
features = X.columns
importances = extra_tree.feature_importances_
indices = np.argsort(importances)
# Plot feature importances
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()

Decision Tree & Boosted Tree

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Create Decision Tree Regressor object
tree_1 = DecisionTreeRegressor()

tree_2 = AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=200, learning_rate=.1)
# Train the model using the training sets
tree_1.fit(X_train, y_train)
tree_2.fit(X_train, y_train)
# Score the decision tree model
tree_1.score(X_test, y_test)
# Score the boosted decision tree model
boosted_tree_score = tree_2.score(X_test, y_test)
boosted_tree_score
# Make predictions using the testing set
tree_1_pred = tree_1.predict(X_test)
tree_2_pred = tree_2.predict(X_test)
# The root mean squared error
print("Root mean squared error: %.2f"
      % sqrt(mean_squared_error(y_test, tree_2_pred)))
# The mean absolute error
print("Mean absolute error: %.2f"
      % mean_absolute_error(y_test, tree_2_pred))
# Explained variance score: 1 is perfect prediction
print('R-squared: %.2f' % r2_score(y_test, tree_2_pred))
features = X.columns
importances = tree_2.feature_importances_
indices = np.argsort(importances)

plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), features[indices])
plt.xlabel('Relative Importance')
plt.show()
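AdaBoost's learning_rate and the depth of its base tree interact strongly, so a small grid search over both is a natural next step. A sketch (the grid values are illustrative; base_estimator is the parameter name in scikit-learn versions of that era, renamed to estimator in recent releases):

from sklearn.model_selection import GridSearchCV

# Illustrative grid over boosting rate and base-tree depth
param_grid = {
    'learning_rate': [0.05, 0.1, 0.5],
    'base_estimator__max_depth': [3, 5, None],
}
grid = GridSearchCV(AdaBoostRegressor(DecisionTreeRegressor(), n_estimators=200),
                    param_grid, cv=3)
grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)
print("Best CV score:", grid.best_score_)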

XGBoost

from xgboost.sklearn import XGBRegressor

# Fit the XGB regressor
xboost = XGBRegressor(n_estimators=200)
xboost.fit(X_train, y_train)
xgb_score = xboost.score(X_test, y_test)
xgb_score
#Predict
xboost_pred = xboost.predict(X_test)
print("Root mean squared error: %.2f"
% sqrt(mean_squared_error(y_test, xboost_pred)))
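Rather than fixing n_estimators in advance, XGBoost can stop adding trees once the held-out error stops improving. A sketch using the fit-time early-stopping arguments of the xgboost versions current at the time (newer releases move early_stopping_rounds to the constructor); strictly, a separate validation split would be cleaner than reusing the test set here:

# Large tree budget, cut short by early stopping
xboost_es = XGBRegressor(n_estimators=1000, learning_rate=0.1)
xboost_es.fit(X_train, y_train,
              eval_set=[(X_test, y_test)],
              early_stopping_rounds=10,
              verbose=False)
print("Best iteration:", xboost_es.best_iteration)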

Summary

print("Scores:")
print("Linear regression score: ", linear_regression_score)
print("Neural network regression score: ", neural_network_regression_score)
print("Lasso regression score: ", lasso_score)
print("ElasticNet regression score: ", elasticnet_score)
print("Decision forest score: ", decision_forest_score)
print("Extra Trees score: ", extratree_score)
print("Boosted decision tree score: ", boosted_tree_score)
print("XGBoost score:", xgb_score)
print("\n")
print("RMSE:")
print("Linear regression RMSE: %.2f"
% sqrt(mean_squared_error(y_test, lin_pred)))
print("Neural network RMSE: %.2f"
% sqrt(mean_squared_error(y_test, nnr_pred)))
print("Lasso RMSE: %.2f"
% sqrt(mean_squared_error(y_test, lasso_pred)))
print("ElasticNet RMSE: %.2f"
% sqrt(mean_squared_error(y_test, elasticnet_pred)))
print("Decision forest RMSE: %.2f"
% sqrt(mean_squared_error(y_test, regr_rf_pred)))
print("Extra Trees RMSE: %.2f"
% sqrt(mean_squared_error(y_test, extratree_pred)))
print("Boosted decision tree RMSE: %.2f"
% sqrt(mean_squared_error(y_test, tree_2_pred)))
print("XGBoost RMSE: %.2f"
% sqrt(mean_squared_error(y_test, xboost_pred)))
Scores:
Linear regression score: 0.9101721045818417
Neural network regression score: 0.8532397997401182
Lasso regression score: 0.8873130463758103
ElasticNet regression score: 0.888862421794889
Decision forest score: 0.9023960871039343
Extra Trees score: 0.9167571974098775
Boosted decision tree score: 0.892229896203619
XGBoost score: 0.8902505097601974
RMSE:
Linear regression RMSE: 3.74
Neural network RMSE: 4.78
Lasso RMSE: 4.19
ElasticNet RMSE: 4.16
Decision forest RMSE: 3.90
Extra Trees RMSE: 3.60
Boosted decision tree RMSE: 4.09
XGBoost RMSE: 4.13
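For side-by-side reading, the same numbers can be collected into a DataFrame and plotted; a minimal sketch (the keys mirror the prints above):

# Gather the R-squared scores computed above into one table
results = pd.DataFrame({'score': {
    'Linear regression': linear_regression_score,
    'Neural network': neural_network_regression_score,
    'Lasso': lasso_score,
    'ElasticNet': elasticnet_score,
    'Random Forest': decision_forest_score,
    'Extra Trees': extratree_score,
    'Boosted decision tree': boosted_tree_score,
    'XGBoost': xgb_score,
}}).sort_values('score', ascending=False)
print(results)
results['score'].plot.barh()
plt.xlabel('R-squared on test set')
plt.show()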
