Part1：特徵選擇

# 1.單變量選擇

# Feature Extraction with Univariate Statistical Tests (chi-squared for classification)
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]   # the 8 predictor columns
Y = array[:, 8]     # the class label

# feature extraction: keep the 4 features with the highest chi2 scores
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)

# summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)

# summarize selected features
print(features[0:5, :])

# 2.RFE遞迴特徵消除

# Feature Extraction with RFE (Recursive Feature Elimination)
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]   # the 8 predictor columns
Y = array[:, 8]     # the class label

# feature extraction: recursively drop features until only 3 remain.
# Note: modern scikit-learn requires the keyword form n_features_to_select=3
# (the old positional RFE(model, 3) was removed).
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

# The original used the Python-2 idiom print("...") % value, which raises
# a TypeError under Python 3 — the % formatting must happen inside print().
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

# 3.主成分分析(PCA)

# Feature Extraction with PCA
import numpy
from pandas import read_csv
from sklearn.decomposition import PCA

# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]   # the 8 predictor columns
Y = array[:, 8]     # the class label (unused by PCA, kept for symmetry with the other examples)

# feature extraction: project onto the first 3 principal components
pca = PCA(n_components=3)
fit = pca.fit(X)

# summarize components.
# The original used the Python-2 idiom print("...") % value, which raises
# a TypeError under Python 3 — the % formatting must happen inside print().
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

# 4.特徵重要度

# --- Feature importance with XGBoost ---
import xgboost as xgb

# X and y are assumed to be defined earlier (feature matrix and target)
housing_dmatrix = xgb.DMatrix(X, y)

# Create the parameter dictionary: params.
# "reg:squarederror" replaces the deprecated "reg:linear" objective,
# and max_depth must be an integer (the original passed the string "4").
params = {"objective": "reg:squarederror", "max_depth": 4}

# Train the model: xg_reg
xg_reg = xgb.train(dtrain=housing_dmatrix, params=params, num_boost_round=10)

# Plot the feature importances
xgb.plot_importance(xg_reg)

# -------------------------------------------------------
# --- Feature Importance with Extra Trees Classifier ---
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier

# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:, 0:8]
Y = array[:, 8]

# feature extraction: tree-based importances come for free after fitting
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)

Part2：特徵處理

# 匯入模組

# The two statements were fused onto one line by the extraction; split them.
import pandas as pd
import numpy as np

# 1.處理遺失值

# Common strategies for filling missing values:
# (1) a fixed constant  (2) the mean  (3) the mode
# (4) the previous/next observation (ffill/bfill)  (5) interpolation
# (6) KNN imputation  (7) random-forest imputation

threshold = 0.7

# Drop columns whose missing-value ratio exceeds 0.7
data = data[data.columns[data.isnull().mean() < threshold]]

# Drop rows whose missing-value ratio exceeds 0.7
data = data.loc[data.isnull().mean(axis=1) < threshold]

# Fill missing values with 0
data = data.fillna(0)

# Fill missing values with each column's median
data = data.fillna(data.median())

# Fill a categorical column's missing values with its most frequent value
data['column_name'].fillna(data['column_name'].value_counts().idxmax(), inplace=True)

# Drop any column that contains even a single missing value
data = data.dropna(axis=1)

# --------------------------------------------------------------------
# Missing-value imputation with scikit-learn.
# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22;
# SimpleImputer is its replacement (it has no `axis` parameter and
# always imputes column-wise, which matches the old axis=0 behavior).
import numpy as np
from sklearn.impute import SimpleImputer

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df)
imputed_data = imr.transform(df.values)
# Other strategy options: 'median' or 'most_frequent';
# 'most_frequent' is an effective way to fill categorical features.

# 2.處理離群值

# Drop outliers using the standard deviation
factor = 3  # values more than 3 standard deviations from the mean count as outliers
upper_lim = data['column'].mean() + data['column'].std() * factor
lower_lim = data['column'].mean() - data['column'].std() * factor
data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# "Drop" outliers using percentiles
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)
data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]

# "Keep" outliers using percentiles (cap them at the percentile limits).
# The original mixed `df` and `data` and used an unquoted `column` here;
# one frame and a quoted column name are used consistently instead.
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)
data.loc[data['column'] > upper_lim, 'column'] = upper_lim
data.loc[data['column'] < lower_lim, 'column'] = lower_lim

# Inspect a feature for outliers (values that appear only once)
df['想處理的特徵'].sort_values()
# Remove the outlier rows by index; replace XXXXXXXXX with the actual indices
df = df.drop([XXXXXXXXX - 1])
df = df.drop([XXXXXXXXX - 2])
df = df.drop([XXXXXXXXX - 3])

# 3.區間分化

# Numerical binning: bucket continuous values into labelled ranges
data['bin'] = pd.cut(data['value'], bins=[0, 30, 70, 100], labels=["Low", "Mid", "High"])
#    value   bin
# 0      2   Low
# 1     45   Mid
# 2      7   Low
# 3     85  High
# 4     28   Low

# Categorical binning: map many categories onto coarser groups
#      Country
# 0      Spain
# 1      Chile
# 2  Australia
# 3      Italy
# 4     Brazil
conditions = [
    data['Country'].str.contains('Spain'),
    data['Country'].str.contains('Italy'),
    data['Country'].str.contains('Chile'),
    data['Country'].str.contains('Brazil'),
]
choices = ['Europe', 'Europe', 'South America', 'South America']
# np.select picks the first matching condition; unmatched rows get 'Other'
data['Continent'] = np.select(conditions, choices, default='Other')
#      Country      Continent
# 0      Spain         Europe
# 1      Chile  South America
# 2  Australia          Other
# 3      Italy         Europe
# 4     Brazil  South America

# 4.對數轉換

Log(x+1)

# Example: log transform. np.log produces NaN for non-positive inputs,
# so shift every value by -min(x)+1 first to make the whole column positive.
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})
data['log+1'] = (data['value'] + 1).transform(np.log)  # negative values become NaN
data['log'] = (data['value'] - data['value'].min() + 1).transform(np.log)
#    value  log(x+1)  log(x-min(x)+1)
# 0      2   1.09861          3.25810
# 1     45   3.82864          4.23411
# 2    -23       nan          0.00000
# 3     85   4.45435          4.69135
# 4     28   3.36730          3.95124
# 5      2   1.09861          3.25810
# 6     35   3.58352          4.07754
# 7    -12       nan          2.48491

# 5.One-hot encoding

# One-hot encode 'column', then replace the original column with the dummies
encoded_columns = pd.get_dummies(data['column'])
data = data.join(encoded_columns).drop('column', axis=1)

# 6.合併群集

# Pick the most frequent label within each group.
# (The original line ended with a stray trailing '.', a syntax error once unwrapped.)
data.groupby('id').agg(lambda x: x.value_counts().index[0])

# Pivot table
data.pivot_table(index='column_to_group', columns='column_to_encode',
                 values='aggregation_column', aggfunc=np.sum, fill_value=0)

# Aggregate numerical columns: sums and means per group, side by side
grouped = data.groupby('column_to_group')
sums = grouped[sum_cols].sum().add_suffix('_sum')
avgs = grouped[mean_cols].mean().add_suffix('_avg')
new_df = pd.concat([sums, avgs], axis=1)

# 7.特徵分割

# data.name
# 0  Luther N. Gonzalez
# 1    Charles M. Young
# 2        Terry Lawson
# 3       Kristen White
# 4      Thomas Logsdon

# Extracting first names (first whitespace-separated token)
data.name.str.split(" ").map(lambda x: x[0])
# 0     Luther
# 1    Charles
# 2      Terry
# 3    Kristen
# 4     Thomas

# Extracting last names (last token, so middle initials are skipped)
data.name.str.split(" ").map(lambda x: x[-1])
# 0    Gonzalez
# 1       Young
# 2      Lawson
# 3       White
# 4     Logsdon

# 8.調整資料規模

# Normalization: rescale values into the [0, 1] range
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})
data['normalized'] = (data['value'] - data['value'].min()) / (data['value'].max() - data['value'].min())
#    value  normalized
# 0      2        0.23
# 1     45        0.63
# 2    -23        0.00
# 3     85        1.00
# 4     28        0.47
# 5      2        0.23
# 6     35        0.54
# 7    -12        0.10

# Standardization: shift to zero mean and scale to unit variance
data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})
data['standardized'] = (data['value'] - data['value'].mean()) / data['value'].std()
#    value  standardized
# 0      2         -0.52
# 1     45          0.70
# 2    -23         -1.23
# 3     85          1.84
# 4     28          0.22
# 5      2         -0.52
# 6     35          0.42
# 7    -12         -0.92

# 9.提取日期

from datetime import date

data = pd.DataFrame({'date': [
    '01-01-2017',
    '04-12-2008',
    '23-06-1988',
    '25-08-1999',
    '20-02-1993',
]})

# Parse the strings into datetimes (day-month-year format)
data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# Extract the year
data['year'] = data['date'].dt.year

# Extract the month
data['month'] = data['date'].dt.month

# Years elapsed since each date (relative to today, so this output drifts over time)
data['passed_years'] = date.today().year - data['date'].dt.year

# Months elapsed since each date
data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 \
    + date.today().month - data['date'].dt.month

# Name of the weekday for each date
data['day_name'] = data['date'].dt.day_name()
#         date  year  month  passed_years  passed_months   day_name
# 0 2017-01-01  2017      1             2             26     Sunday
# 1 2008-12-04  2008     12            11            123   Thursday
# 2 1988-06-23  1988      6            31            369   Thursday
# 3 1999-08-25  1999      8            20            235  Wednesday
# 4 1993-02-20  1993      2            26            313   Saturday

Written by