Python機器學習筆記(三):使用Scikit-Learn進行線性迴歸預測
16 min readMar 5, 2019
本文分兩部分
Part1為純粹線性迴歸
Part2為「所有」的迴歸演算法(包含線性與非線性)
Part1
以下ipynb更新於2019年3月13日:
本文發布於2019年3月:
一元線性迴歸
引入模組
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
載入資料
# Load the measurements from a CSV file resolved relative to the working directory.
# Later cells read the 'PM25' and 'CO' columns from this frame.
dataset = pd.read_csv('200811-201811.csv')
查看資料概況
# Summary statistics of the columns; as the last expression of a notebook
# cell the resulting table is displayed.
dataset.describe()
進行繪圖
# Scatter-style plot ('o' markers, no connecting line) of CO against PM25.
# The Axes returned by pandas is labelled directly instead of going through
# the pyplot "current axes" helpers.
scatter_axes = dataset.plot(x='PM25', y='CO', style='o')
scatter_axes.set_title('PM25 vs CO')
scatter_axes.set_xlabel('PM25')
scatter_axes.set_ylabel('CO')
plt.show()
資料轉換
# scikit-learn expects 2-D inputs of shape (n_samples, n_features).
# Selecting with a one-element column list keeps the data two-dimensional,
# giving the same (n, 1) column array as .values.reshape(-1, 1).
# (reshape(1, -1) would instead produce a single row of shape (1, n).)
X = dataset[['PM25']].values
y = dataset[['CO']].values
建立訓練集和測試集
# Hold out 20% of the rows as the test set and train on the remaining 80%;
# random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
建立並訓練模型
# Build an ordinary least-squares model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
regressor = LinearRegression().fit(X_train, y_train)
進行預測
# Predict CO for the held-out rows; the plots and error metrics below all
# compare y_test against this y_pred.
y_pred = regressor.predict(X_test)

# Side-by-side table of actual and predicted values. flatten() turns the
# (n, 1) column arrays into 1-D arrays so they can be DataFrame columns.
df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})

# Keep only the first 25 rows so the bar chart below stays readable.
df1 = df.head(25)

# Show the full actual-vs-predicted table (displayed when run as a notebook cell).
df
視覺化:呈現實際值和預測值的差距
# Grouped bar chart of the actual and predicted values in df1.
bar_axes = df1.plot.bar(figsize=(10, 5))

# Solid green grid on major ticks, dotted black grid on minor ticks.
for tick_kind, dash, shade in (('major', '-', 'green'), ('minor', ':', 'black')):
    bar_axes.grid(which=tick_kind, linestyle=dash, linewidth='0.5', color=shade)
plt.show()
視覺化:建立迴歸線
# Test observations as gray points with the fitted predictions drawn on top
# as a red line, both on the current axes.
line_axes = plt.gca()
line_axes.scatter(X_test, y_test, color='gray')
line_axes.plot(X_test, y_pred, color='red', linewidth=2)
plt.show()
平均絕對誤差(MAE)、均方誤差(MSE)、均方根誤差(RMSE)
# Error metrics on the test split. MSE is computed once and reused for RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
):
    print(label, value)
檢查準確度
# For a regressor, score() returns the coefficient of determination (R^2).
# NOTE(review): this is evaluated on the *training* split, whereas the error
# metrics above use the test split; score(X_test, y_test) would measure
# generalization instead - confirm which one is intended.
regressor.score(X_train, y_train)
多元線性迴歸
引入模組
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as seabornInstance
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
%matplotlib inline
載入資料
# Load the measurements from a CSV file resolved relative to the working directory.
# Later cells read the pollutant/weather columns and 'PM25' from this frame.
dataset = pd.read_csv('200811-201811.csv')
查看資料概況
# Summary statistics of the columns; as the last expression of a notebook
# cell the resulting table is displayed.
dataset.describe()
檢查缺失值
# One boolean per column: True when that column contains at least one missing value.
dataset.isnull().any()
填補缺失值(以前一筆資料向前填補)
# Forward-fill: each missing value is replaced by the last valid value above
# it in the same column. Missing values at the very top of a column have
# nothing to copy from and stay missing.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
dataset = dataset.ffill()
將資料分成屬性及標籤
# X: the attributes used to make the prediction.
# y: the label to be predicted (PM2.5).
feature_columns = [
    'SO2', 'CO', 'O3', 'Nox', 'NO', 'NO2', 'THC', 'NMHC', 'CH4',
    'WindSpeed', 'TEMP', 'Humidity',
]
X = dataset[feature_columns].values
y = dataset['PM25'].values
視覺化:查看PM2.5的資料分佈
# Distribution of the PM2.5 target on a 10x5 inch figure.
plt.figure(figsize=(10, 5))
# NOTE(review): distplot is deprecated in recent seaborn releases; histplot
# (with kde=True) or displot is the suggested replacement - confirm the
# installed version before switching.
seabornInstance.distplot(dataset['PM25'])
# tight_layout() only has something to adjust once the axes exist, so it is
# called after plotting rather than on the empty figure.
plt.tight_layout()
建立訓練集和測試集
# Hold out 20% of the rows as the test set and train on the remaining 80%;
# random_state=0 makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
建立並訓練模型
# Build an ordinary least-squares model and train it on the training split.
# fit() returns the estimator itself, so construction and training are chained.
regressor = LinearRegression().fit(X_train, y_train)
進行預測
# Predict PM2.5 for the held-out rows; the chart and error metrics below
# compare y_test against this y_pred.
y_pred = regressor.predict(X_test)

# Side-by-side table of actual and predicted values (flatten() guarantees
# 1-D arrays for the DataFrame columns).
df = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})

# Keep only the first 25 rows so the bar chart below stays readable.
df1 = df.head(25)
視覺化:呈現實際值和預測值的差距
# Grouped bar chart of the actual and predicted values in df1.
bar_axes = df1.plot.bar(figsize=(10, 8))

# Solid green grid on major ticks, dotted black grid on minor ticks.
for tick_kind, dash, shade in (('major', '-', 'green'), ('minor', ':', 'black')):
    bar_axes.grid(which=tick_kind, linestyle=dash, linewidth='0.5', color=shade)
plt.show()
平均絕對誤差(MAE)、均方誤差(MSE)、均方根誤差(RMSE)
# Error metrics on the test split. MSE is computed once and reused for RMSE.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

for label, value in (
    ('Mean Absolute Error:', mae),
    ('Mean Squared Error:', mse),
    ('Root Mean Squared Error:', rmse),
):
    print(label, value)
檢查準確度
# For a regressor, score() returns the coefficient of determination (R^2).
# NOTE(review): this is evaluated on the *training* split, whereas the error
# metrics above use the test split; score(X_test, y_test) would measure
# generalization instead - confirm which one is intended.
regressor.score(X_train, y_train)
Part2
一次掌握所有的迴歸演算法
線性機器學習演算法
1.線性迴歸
# Linear Regression
import pandas
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
# Housing data: whitespace-separated with no header row, so the column
# names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' is the equivalent of the deprecated delim_whitespace=True.
dataframe = pandas.read_csv(url, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first 13 columns (CRIM .. LSTAT) are the features
Y = array[:, 13]    # last column (MEDV) is the target
seed = 7
# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = LinearRegression()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Mean over the 10 folds of the negated MSE (closer to 0 is better).
print(results.mean())
2.岭迴歸
# Ridge Regression
import pandas
from sklearn import model_selection
from sklearn.linear_model import Ridge
# Housing data: whitespace-separated with no header row, so the column
# names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' is the equivalent of the deprecated delim_whitespace=True.
dataframe = pandas.read_csv(url, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first 13 columns (CRIM .. LSTAT) are the features
Y = array[:, 13]    # last column (MEDV) is the target
seed = 7
# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = Ridge()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Mean over the 10 folds of the negated MSE (closer to 0 is better).
print(results.mean())
3. LASSO 迴歸
# Lasso Regression
import pandas
from sklearn import model_selection
from sklearn.linear_model import Lasso
# Housing data: whitespace-separated with no header row, so the column
# names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' is the equivalent of the deprecated delim_whitespace=True.
dataframe = pandas.read_csv(url, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first 13 columns (CRIM .. LSTAT) are the features
Y = array[:, 13]    # last column (MEDV) is the target
seed = 7
# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = Lasso()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Mean over the 10 folds of the negated MSE (closer to 0 is better).
print(results.mean())
4.ElasticNet彈性網路迴歸
# ElasticNet Regression
import pandas
from sklearn import model_selection
from sklearn.linear_model import ElasticNet
# Housing data: whitespace-separated with no header row, so the column
# names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' is the equivalent of the deprecated delim_whitespace=True.
dataframe = pandas.read_csv(url, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first 13 columns (CRIM .. LSTAT) are the features
Y = array[:, 13]    # last column (MEDV) is the target
seed = 7
# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = ElasticNet()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Mean over the 10 folds of the negated MSE (closer to 0 is better).
print(results.mean())
非線性機器學習演算法
1. KNN迴歸
# KNN Regression
import pandas
from sklearn import model_selection
from sklearn.neighbors import KNeighborsRegressor
# Housing data: whitespace-separated with no header row, so the column
# names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' is the equivalent of the deprecated delim_whitespace=True.
dataframe = pandas.read_csv(url, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first 13 columns (CRIM .. LSTAT) are the features
Y = array[:, 13]    # last column (MEDV) is the target
seed = 7
# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = KNeighborsRegressor()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Mean over the 10 folds of the negated MSE (closer to 0 is better).
print(results.mean())
2.決策樹迴歸
# Decision Tree Regression
import pandas
from sklearn import model_selection
from sklearn.tree import DecisionTreeRegressor
# Housing data: whitespace-separated with no header row, so the column
# names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' is the equivalent of the deprecated delim_whitespace=True.
dataframe = pandas.read_csv(url, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first 13 columns (CRIM .. LSTAT) are the features
Y = array[:, 13]    # last column (MEDV) is the target
seed = 7
# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
# NOTE(review): the tree itself is built without a random_state, so scores
# may vary slightly between runs - confirm whether that is acceptable.
model = DecisionTreeRegressor()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Mean over the 10 folds of the negated MSE (closer to 0 is better).
print(results.mean())
3.支持向量機迴歸(SVR)
# SVM Regression
import pandas
from sklearn import model_selection
from sklearn.svm import SVR
# Housing data: whitespace-separated with no header row, so the column
# names are supplied explicitly.
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.data"
names = ['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV']
# sep=r'\s+' is the equivalent of the deprecated delim_whitespace=True.
dataframe = pandas.read_csv(url, sep=r'\s+', names=names)
array = dataframe.values
X = array[:, 0:13]  # first 13 columns (CRIM .. LSTAT) are the features
Y = array[:, 13]    # last column (MEDV) is the target
seed = 7
# random_state only has an effect when shuffle=True; recent scikit-learn
# raises ValueError if it is set without shuffling.
kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
model = SVR()
scoring = 'neg_mean_squared_error'
results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
# Mean over the 10 folds of the negated MSE (closer to 0 is better).
print(results.mean())