# 深度學習筆記(5)：鐵達尼號資料集的生存分析

`import pandas as pdimport numpy as npseed = 7np.random.seed(seed)# 載入資料集df = pd.read_csv("./titanic_data.csv")# 刪除不需要的欄位df = df.drop(["name", "ticket", "cabin"], axis=1)# 處理遺失資料df[["age"]] = df[["age"]].fillna(value=df[["age"]].mean())df[["fare"]] = df[["fare"]].fillna(value=df[["fare"]].mean())df[["embarked"]] = df[["embarked"]].fillna(value=df["embarked"].value_counts().idxmax())print(df["embarked"].value_counts())print(df["embarked"].value_counts().idxmax())# 轉換分類資料df["sex"] = df["sex"].map( {"female": 1, "male": 0} ).astype(int)# Embarked欄位的One-hot編碼enbarked_one_hot = pd.get_dummies(df["embarked"], prefix="embarked")df = df.drop("embarked", axis=1)df = df.join(enbarked_one_hot)# 將標籤的 survived 欄位移至最後df_survived = df.pop("survived")df["survived"] = df_survivedprint(df.head())df.head().to_html("Ch6_2_2.html")# 分割成訓練(80%)和測試(20%)資料集mask = np.random.rand(len(df)) < 0.8df_train = df[mask]df_test = df[~mask]print("Train:", df_train.shape)print("Test:", df_test.shape)# 儲存處理後的資料df_train.to_csv("titanic_train.csv", index=False)df_test.to_csv("titanic_test.csv", index=False)`

`import numpy as npimport pandas as pdfrom keras.models import Sequentialfrom keras.layers import Denseseed = 7np.random.seed(seed)# 載入Titanic的訓練和測試資料集df_train = pd.read_csv("./titanic_train.csv")df_test = pd.read_csv("./titanic_test.csv")dataset_train = df_train.valuesdataset_test = df_test.values# 分割成特徵資料和標籤資料X_train = dataset_train[:, 0:9]Y_train = dataset_train[:, 9]X_test = dataset_test[:, 0:9]Y_test = dataset_test[:, 9]# 特徵標準化X_train -= X_train.mean(axis=0)X_train /= X_train.std(axis=0)X_test -= X_test.mean(axis=0)X_test /= X_test.std(axis=0)# 定義模型model = Sequential()model.add(Dense(11, input_dim=X_train.shape[1], activation="relu"))model.add(Dense(11, activation="relu"))model.add(Dense(1, activation="sigmoid"))# 編譯模型model.compile(loss="binary_crossentropy", optimizer="adam",metrics=["accuracy"])# 訓練模型print("Training ...")model.fit(X_train, Y_train, epochs=18, batch_size=10, verbose=0)# 評估模型print("\nTesting ...")loss, accuracy = model.evaluate(X_train, Y_train)print("訓練資料集的準確度 = {:.2f}".format(accuracy))loss, accuracy = model.evaluate(X_test, Y_test)print("測試資料集的準確度 = {:.2f}".format(accuracy))# 儲存Keras模型print("Saving Model: titanic.h5 ...")model.save("titanic.h5")`

`import numpy as npimport pandas as pdfrom keras.models import Sequentialfrom keras.models import load_modelseed = 7np.random.seed(seed)# 載入Titanic的測試資料集df_test = pd.read_csv("./titanic_test.csv")dataset_test = df_test.values# 分割成特徵資料和標籤資料X_test = dataset_test[:, 0:9]Y_test = dataset_test[:, 9]# 特徵標準化X_test -= X_test.mean(axis=0)X_test /= X_test.std(axis=0)# 建立Keras的Sequential模型model = Sequential()model = load_model("titanic.h5")# 編譯模型model.compile(loss="binary_crossentropy", optimizer="adam",metrics=["accuracy"])# 評估模型print("\nTesting ...")loss, accuracy = model.evaluate(X_test, Y_test)print("測試資料集的準確度 = {:.2f}".format(accuracy))# 計算分類的預測值print("\nPredicting ...")Y_pred = model.predict_classes(X_test)print(Y_pred[:,0])print(Y_test.astype(int))# 顯示混淆矩陣tb = pd.crosstab(Y_test.astype(int), Y_pred[:,0],rownames=["label"], colnames=["predict"])print(tb)tb.to_html("Ch6_2_4.html")`

Written by