Python機器學習筆記(三):使用Scikit-Learn進行線性迴歸預測

Yanwei Liu
16 min readMar 5, 2019

--

本文分兩部分

Part1為純粹線性迴歸

Part2為「所有」的線性迴歸

Part1

以下ipynb更新於2019年3月13日:

本文發部於2019年3月:

一元線性回歸

引入模組

import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

載入資料

dataset = pd.read_csv('200811-201811.csv')

查看資料概況

dataset.describe()

進行繪圖

dataset.plot(x='PM25', y='CO', style='o')  
plt.title('PM25 vs CO')
plt.xlabel('PM25')
plt.ylabel('CO')
plt.show()

資料轉換

X = dataset['PM25'].values.reshape(-1,1)
y = dataset['CO'].values.reshape(-1,1)
#reshape(-1,1):換成單一列(垂直)
#reshape(1,-1):換成單一行(水平)

建立訓練集和資料集

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)#80%為Training set,20%為Test set

建立並訓練模型

regressor = LinearRegression()  
regressor.fit(X_train, y_train) #訓練模型

進行預測

df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})df  #顯示實際值和預測值df1 = df.head(25)

視覺化:呈現實際值和預測值的差距

df1.plot(kind='bar',figsize=(10,5))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

視覺化:建立迴歸線

plt.scatter(X_test, y_test,  color='gray')
plt.plot(X_test, y_pred, color='red', linewidth=2)
plt.show()

平均絕對誤差(MAE)、均方誤差(MSE)、均方根誤差(RMSE)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

檢查準確度

regressor.score(X_train, y_train)

多元線性迴歸

引入模組

import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline

載入資料

dataset = pd.read_csv('200811-201811.csv')

查看資料概況

dataset.describe()

檢查缺失值

dataset.isnull().any()

移除缺失值

dataset = dataset.fillna(method='ffill')

將資料分成屬性及標籤

X = dataset[['SO2', 'CO', 'O3', 'Nox', 'NO', 'NO2', 'THC', 'NMHC', 'CH4', 'WindSpeed','TEMP','Humidity']].valuesy = dataset['PM25'].values#X是要拿來預測的資料
#Y是想預測的結果

視覺化:查看PM2.5的資料分佈

plt.figure(figsize=(10,5))
plt.tight_layout()
seabornInstance.distplot(dataset['PM25'])

建立訓練集和資料集

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)##80%為Training set,20%為Test set

建立並訓練模型

regressor = LinearRegression()  
regressor.fit(X_train, y_train) #訓練模型

進行預測

df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})df1 = df.head(25) #顯示實際值和預測值

視覺化:呈現實際值和預測值的差距

df1.plot(kind='bar',figsize=(10,8))
plt.grid(which='major', linestyle='-', linewidth='0.5', color='green')
plt.grid(which='minor', linestyle=':', linewidth='0.5', color='black')
plt.show()

平均絕對誤差(MAE)、均方誤差(MSE)、均方根誤差(RMSE)

print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))  
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

檢查準確度

regressor.score(X_train, y_train)

Part2

一次掌握所有的迴歸演算法

原文: https://machinelearningmastery.com/spot-check-regression-machine-learning-algorithms-python-scikit-learn/

線性機器學習演算法

1.線性迴歸

# Linear Regression
import pandas
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = LinearRegression()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

2.岭迴歸

# Ridge Regression
import pandas
from sklearn import model_selection
from sklearn.linear_model import Ridge
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = Ridge()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

3. LASSO 迴歸

# Lasso Regression
import pandas
from sklearn import model_selection
from sklearn.linear_model import Lasso
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = Lasso()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

4.ElasticNet彈性網路迴歸

# ElasticNet Regression
import pandas
from sklearn import model_selection
from sklearn.linear_model import ElasticNet
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = ElasticNet()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

非線性機器學習演算法

1. KNN迴歸

# KNN Regression
import pandas
from sklearn import model_selection
from sklearn.neighbors import KNeighborsRegressor
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = KNeighborsRegressor()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

2.決策樹迴歸

# Decision Tree Regression
import pandas
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = DecisionTreeRegressor()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

3.支持項量機迴歸(SVR)

# SVM Regression
import pandas
from sklearn import model_selection
from sklearn.svm import SVR
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
dataframe = pandas.read_csv(url, delim_whitespace=True, names=names)
array = dataframe.values
X = array[:,0:13]
Y = array[:,13]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
model = SVR()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
print(results.mean())

--

--