# Python機器學習筆記(四)：使用Scikit-Learn進行邏輯迴歸分析

```python
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
```

```python
train = pd.read_csv('200811-201811.csv')
train.head()
#Danger分類點說明
#對敏感族群不健康為PM2.5數值在35.5以上
#若>35.5則為1，若<35.5則為0
```

`sns.heatmap(train.isnull(),yticklabels=False,cbar=False,cmap='viridis')`

`sns.countplot(x='Danger',hue='Nox',data=train,palette='RdBu_r')`

`sns.distplot(train['PM25'].dropna(),kde=False,bins=30)`

`train['Nox'].hist(bins=40,figsize=(10,4))`

```python
X = train.drop('Danger', axis=1)
y = train['Danger']
#載入訓練組及測試組分組套件
from sklearn.model_selection import train_test_split
#設定隨機種子為42，測試組比例佔25%
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25,random_state=42)
#載入邏輯迴歸
from sklearn.linear_model import LogisticRegression
#建立訓練組邏輯迴歸模型
logmodel = LogisticRegression(solver='liblinear')
logmodel.fit(X_train,y_train)
#使用模型來預測
predictions = logmodel.predict(X_test)
#載入classification_report來看預測和實際數值的差異，包含precision、recall、f1-score及support
from sklearn.metrics import classification_report
print(classification_report(y_test,predictions))
#模型分數
from sklearn import metrics
logmodel.score(X_train,y_train)
print("Accuracy:",metrics.accuracy_score(y_test, predictions))
print("Precision:",metrics.precision_score(y_test, predictions))
print("Recall:",metrics.recall_score(y_test, predictions))
#預測準確度如下所示
```

```text
              precision    recall   f1-score   support
  0(健康)          0.96      0.98      0.97       560
  1(不健康)        0.98      0.96      0.97       526
   micro avg       0.97      0.97      0.97      1086
   macro avg       0.97      0.97      0.97      1086
weighted avg       0.97      0.97      0.97      1086
```

```python
#建立confusion_matrix
from sklearn.metrics import confusion_matrix
matrix=confusion_matrix(y_test,predictions)
#Confusion Matrix如下所示
```

```text
array([[549,  11],
       [ 20, 506]], dtype=int64)
```

| Confusion Matrix | 預測（No） | 預測（Yes） |
| --- | --- | --- |
| 實際（No） | TN=549 | FP=11 |
| 實際（Yes） | FN=20 | TP=506 |

#混淆矩陣說明

```text
TN=True Negative
FP=False Positive
FN=False Negative
TP=True Positive
precision = TP / (TP + FP)
recall = TP / (TP + FN)
accuracy = (TP + TN) / (TN + FP + FN + TP)
```

F-beta score可視為precision和recall的加權調和平均值，數值介於0-1，最好是1。

support代表在測試組y實際值的發生次數。

```python
#混淆矩陣視覺化(Heatmap)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')
#ROC曲線
y_pred_proba = logmodel.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()
```

