Python機器學習筆記(四):使用Scikit-Learn進行邏輯迴歸分析

Yanwei Liu
6 min readMar 13, 2019

--

引入套件

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

載入資料集

# Load the dataset from CSV and preview its first rows.
train = pd.read_csv('200811-201811.csv')
train.head()
# How the 'Danger' label is defined:
# a PM2.5 value of 35.5 or above counts as "unhealthy for sensitive groups".
# Danger is 1 when PM2.5 is above 35.5 and 0 when it is below
# (the original note does not say which label an exact 35.5 gets).

用heatmap(.isnull())來找出缺失的資料在哪些欄位

# Heatmap of the boolean missing-value mask: row labels and the colour bar are hidden.
sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')

用countplot來看Nox是否影響健康

# Count rows per 'Danger' class, split by 'Nox' via hue.
# NOTE(review): 'Nox' looks continuous (it is plotted as a 40-bin histogram elsewhere in this file),
# so hue may create one bar per distinct value — consider binning it first.
sns.countplot(x='Danger',hue='Nox',data=train,palette='RdBu_r')

用直方圖看PM2.5的分佈。缺失資料在此不計。

# Histogram of 'PM25' in 30 bins with the KDE curve turned off; missing values are dropped first.
sns.distplot(train['PM25'].dropna(),kde=False,bins=30)

用直方圖看Nox的分佈

# Histogram of 'Nox' in 40 bins on a 10x4 figure.
train['Nox'].hist(bins=40,figsize=(10,4))

建立邏輯迴歸模型並預測

# Features are every column except the label; the label is the binary 'Danger' column.
# NOTE(review): if 'PM25' is still a column of X, the model is given the very value
# 'Danger' was derived from — confirm this is intended before trusting the scores.
X = train.drop('Danger', axis=1)
y = train['Danger']
# Helper that splits the data into a training set and a test set.
from sklearn.model_selection import train_test_split
# Random seed 42; hold out 25% of the rows as the test set.
# (The original passed test_size=0.75, which contradicted this stated intent.)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Logistic regression estimator.
from sklearn.linear_model import LogisticRegression
# Fit the logistic regression model on the training set.
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train, y_train)
# Predict labels for the held-out test set.
predictions = logmodel.predict(X_test)
# classification_report compares predictions with the true labels and reports
# precision, recall, f1-score and support per class.
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))
# Model scores.
from sklearn import metrics
# Accuracy on the training data; printed so the value is not silently discarded.
print("Training accuracy:", logmodel.score(X_train, y_train))
print("Accuracy:", metrics.accuracy_score(y_test, predictions))
print("Precision:", metrics.precision_score(y_test, predictions))
print("Recall:", metrics.recall_score(y_test, predictions))
#預測準確度如下所示
              precision    recall  f1-score   support

0(健康) 0.96 0.98 0.97 560
1(不健康) 0.98 0.96 0.97 526

micro avg 0.97 0.97 0.97 1086
macro avg 0.97 0.97 0.97 1086
weighted avg 0.97 0.97 0.97 1086
# Build the confusion matrix from the true test labels and the predictions.
# The import had been fused into the comment line, which left confusion_matrix undefined.
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, predictions)
#Confusion Matrix如下所示
array([[549,  11],
       [ 20, 506]], dtype=int64)
Confusion Matrix 預測 (No) 預測(Yes)
實際(No) TN=549 FP=11
實際(Yes) FN=20 TP=506
#混淆矩陣說明
TN=True Negative
FP=False Positive
FN=False Negative
TP=True Positive
precision = TP / (TP + FP)
recall = TP / (TP + FN)
accuracy = (TP + TN) / (TN + FP + FN + TP)
F-beta score可視為precision和recall的加權調和平均(weighted harmonic mean),數值介於0-1,最好是1。
support代表在測試組y實際值的發生次數。
# Visualize the confusion matrix as a heatmap.
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# IPython/Jupyter magic; this line is not valid in a plain .py script.
%matplotlib inline
class_names=[0,1] # name of classes
fig, ax = plt.subplots()
# One tick position per class, applied to both axes before the heatmap is drawn.
# NOTE(review): the heatmap call below may replace these ticks with its own — confirm.
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
# annot=True writes each cell's count; fmt='g' formats the annotations with the general number format.
sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
# Put the x-axis label above the plot instead of below it.
ax.xaxis.set_label_position("top")
plt.tight_layout()
# y=1.1 raises the title, presumably to clear the top-positioned x-axis label.
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
# ROC curve.
# Take the second column of predict_proba as the positive-class score.
# The original referenced an undefined name `logreg`; the fitted model is `logmodel`.
y_pred_proba = logmodel.predict_proba(X_test)[:, 1]
# False/true positive rates over the score thresholds; the thresholds themselves are not needed.
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
# Area under the ROC curve, shown in the legend.
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr, tpr, label="data 1, auc=" + str(auc))
# loc=4 places the legend in the lower-right corner.
plt.legend(loc=4)
plt.show()

--

--

No responses yet