Python Machine Learning Notes (8): Feature Engineering Techniques for Machine Learning

This article is divided into Part 1 and Part 2, covering feature selection and feature processing respectively.

Part 1: Feature Selection

1. Univariate Selection

Use the chi-squared (chi²) test to score each feature against the target and keep the k best.

# Feature Extraction with Univariate Statistical Tests (Chi-squared for classification)
import pandas
import numpy
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(X, Y)
# summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)
features = fit.transform(X)
# summarize selected features
print(features[0:5,:])

2. Recursive Feature Elimination (RFE)

RFE repeatedly removes the least influential features, shrinking the training feature set until the specified number of features remains.

# Feature Extraction with RFE
from pandas import read_csv
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = LogisticRegression(solver='liblinear')  # liblinear converges reliably on this small dataset
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)
print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

3. Principal Component Analysis (PCA)

PCA reduces dimensionality by projecting the data onto its top principal components.

# Feature Extraction with PCA
import numpy
from pandas import read_csv
from sklearn.decomposition import PCA
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
pca = PCA(n_components=3)
fit = pca.fit(X)
# summarize components
print("Explained Variance: %s") % fit.explained_variance_ratio_
print(fit.components_)

4. Feature Importance

Tree-based algorithms such as decision trees, random forests, or gradient boosting can estimate feature importance.

# Feature Importance with XGBoost
import xgboost as xgb
import matplotlib.pyplot as plt
# reuse X and Y from the snippets above (the pima target is binary)
dmatrix = xgb.DMatrix(X, Y)
# Create the parameter dictionary: params
params = {"objective": "binary:logistic", "max_depth": 4}
# Train the model: xg_reg
xg_reg = xgb.train(params=params, dtrain=dmatrix, num_boost_round=10)
# Plot the feature importances
xgb.plot_importance(xg_reg)
plt.show()
# Feature Importance with Extra Trees Classifier
from pandas import read_csv
from sklearn.ensemble import ExtraTreesClassifier
# load data
url = "https://raw.githubusercontent.com/jbrownlee/Datasets/master/pima-indians-diabetes.data.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']
dataframe = read_csv(url, names=names)
array = dataframe.values
X = array[:,0:8]
Y = array[:,8]
# feature extraction
model = ExtraTreesClassifier()
model.fit(X, Y)
print(model.feature_importances_)
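
The raw score array can be hard to read on its own. A small follow-on sketch, reusing names and model from the block above, pairs each importance with its column name:

# names[:8] are the eight feature columns loaded earlier
for name, score in sorted(zip(names[:8], model.feature_importances_),
                          key=lambda t: t[1], reverse=True):
    print("%s: %.3f" % (name, score))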

Part 2: Feature Processing

Import the modules

import pandas as pd
import numpy as np

1. Handling Missing Values

Common strategies: (1) fill with a fixed value, (2) fill with the mean, (3) fill with the mode, (4) fill with the previous or next observation, (5) interpolation, (6) KNN, (7) random forest. Strategies (4) through (7) are sketched after the scikit-learn example below.

threshold = 0.7
# drop columns whose fraction of missing values exceeds 0.7
data = data[data.columns[data.isnull().mean() < threshold]]

# drop rows whose fraction of missing values exceeds 0.7
data = data.loc[data.isnull().mean(axis=1) < threshold]
# fill missing values with 0
data = data.fillna(0)
# fill missing values with the column median
data = data.fillna(data.median())
# fill missing values in a categorical column with its most frequent value
data['column_name'].fillna(data['column_name'].value_counts().idxmax(), inplace=True)
# drop any column that contains even one missing value
data = data.dropna(axis=1)

Handling missing values with scikit-learn (the old Imputer class was removed in newer releases; SimpleImputer replaces it):

from sklearn.impute import SimpleImputer
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df)
imputed_data = imr.transform(df.values)
# other strategy options: 'median' or 'most_frequent'
# 'most_frequent' is an effective way to fill categorical features
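
Strategies (4) through (7) from the list above are not covered by the snippets so far. A minimal sketch on a hypothetical two-column frame (the column names and values are placeholders); note IterativeImputer is still experimental in scikit-learn:

import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

df = pd.DataFrame({'a': [1.0, 2.0, np.nan, 4.0, 5.0],
                   'b': [2.0, np.nan, 6.0, 8.0, 10.0]})

# (4) fill with the previous / next observation
df_ffill = df.ffill()
df_bfill = df.bfill()

# (5) linear interpolation between the known values
df_interp = df.interpolate()

# (6) KNN: estimate each missing value from the k most similar rows
df_knn = pd.DataFrame(KNNImputer(n_neighbors=2).fit_transform(df), columns=df.columns)

# (7) model-based imputation with a random forest
df_rf = pd.DataFrame(IterativeImputer(estimator=RandomForestRegressor(n_estimators=10),
                                      random_state=0).fit_transform(df),
                     columns=df.columns)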

2. Handling Outliers

# drop outliers using the standard deviation
factor = 3  # values beyond 3 standard deviations count as outliers
upper_lim = data['column'].mean() + data['column'].std() * factor
lower_lim = data['column'].mean() - data['column'].std() * factor

data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]
# drop outliers using percentiles
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)

data = data[(data['column'] < upper_lim) & (data['column'] > lower_lim)]
# cap (keep) outliers at percentile limits
upper_lim = data['column'].quantile(.95)
lower_lim = data['column'].quantile(.05)
data.loc[data['column'] > upper_lim, 'column'] = upper_lim
data.loc[data['column'] < lower_lim, 'column'] = lower_lim
# inspect a feature for outliers (values that appear only once)
df['feature_to_process'].sort_values()
df = df.drop([XXXXXXXXX-1])
df = df.drop([XXXXXXXXX-2])
df = df.drop([XXXXXXXXX-3])
df = df.drop([............])  # drop the outlier rows by index

3. Binning

# Numerical binning
data['bin'] = pd.cut(data['value'], bins=[0,30,70,100], labels=["Low", "Mid", "High"])

   value   bin
0      2   Low
1     45   Mid
2      7   Low
3     85  High
4     28   Low
# Categorical binning
     Country
0      Spain
1      Chile
2  Australia
3      Italy
4     Brazil
conditions = [
data['Country'].str.contains('Spain'),
data['Country'].str.contains('Italy'),
data['Country'].str.contains('Chile'),
data['Country'].str.contains('Brazil')]

choices = ['Europe', 'Europe', 'South America', 'South America']

data['Continent'] = np.select(conditions, choices, default='Other')
     Country      Continent
0      Spain         Europe
1      Chile  South America
2  Australia          Other
3      Italy         Europe
4     Brazil  South America

4. Log Transform

Apply log(x+1); if the column contains negative values, shift by the minimum first so every input is positive.

# Example
data = pd.DataFrame({'value':[2, 45, -23, 85, 28, 2, 35, -12]})
data['log+1'] = (data['value'] + 1).transform(np.log)  # negative inputs produce NaN
data['log'] = (data['value'] - data['value'].min() + 1).transform(np.log)
   value  log(x+1)  log(x-min(x)+1)
0      2   1.09861          3.25810
1     45   3.82864          4.23411
2    -23       nan          0.00000
3     85   4.45435          4.69135
4     28   3.36730          3.95124
5      2   1.09861          3.25810
6     35   3.58352          4.07754
7    -12       nan          2.48491

5.One-hot encoding

encoded_columns = pd.get_dummies(data['column'])
data = data.join(encoded_columns).drop('column', axis=1)
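
When the same encoding must be reapplied to new data (e.g. a test set), scikit-learn's OneHotEncoder remembers the categories it saw during fitting. A minimal sketch, again with 'column' as a placeholder name:

from sklearn.preprocessing import OneHotEncoder

# handle_unknown='ignore' encodes unseen categories as all-zero rows
encoder = OneHotEncoder(handle_unknown='ignore')
encoded = encoder.fit_transform(data[['column']]).toarray()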

6. Grouping and Aggregation

# pick the most frequent label within each group
data.groupby('id').agg(lambda x: x.value_counts().index[0])
# pivot table
data.pivot_table(index='column_to_group', columns='column_to_encode', values='aggregation_column', aggfunc=np.sum, fill_value=0)
# aggregate numeric columns per group
# (sum_cols and mean_cols are placeholder lists of column names)
grouped = data.groupby('column_to_group')

sums = grouped[sum_cols].sum().add_suffix('_sum')
avgs = grouped[mean_cols].mean().add_suffix('_avg')

new_df = pd.concat([sums, avgs], axis=1)

7. Feature Splitting

data.name
0    Luther N. Gonzalez
1      Charles M. Young
2          Terry Lawson
3         Kristen White
4        Thomas Logsdon

# Extracting first names
data.name.str.split(" ").map(lambda x: x[0])
0     Luther
1    Charles
2      Terry
3    Kristen
4     Thomas

# Extracting last names
data.name.str.split(" ").map(lambda x: x[-1])
0    Gonzalez
1       Young
2      Lawson
3       White
4    Logsdon

8. Scaling

#Normalization
data = pd.DataFrame({'value':[2,45, -23, 85, 28, 2, 35, -12]})

data['normalized'] = (data['value'] - data['value'].min()) / (data['value'].max() - data['value'].min())
   value  normalized
0      2        0.23
1     45        0.63
2    -23        0.00
3     85        1.00
4     28        0.47
5      2        0.23
6     35        0.54
7    -12        0.10
# Standardization
data = pd.DataFrame({'value':[2, 45, -23, 85, 28, 2, 35, -12]})

data['standardized'] = (data['value'] - data['value'].mean()) / data['value'].std()
   value  standardized
0      2         -0.52
1     45          0.70
2    -23         -1.23
3     85          1.84
4     28          0.22
5      2         -0.52
6     35          0.42
7    -12         -0.92
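
Both transforms are also available in scikit-learn, which is convenient when the fitted parameters (min/max, mean/std) must be reused on a test set. A minimal sketch; note that StandardScaler uses the population standard deviation, so its output differs slightly from the pandas table above:

from sklearn.preprocessing import MinMaxScaler, StandardScaler

data = pd.DataFrame({'value': [2, 45, -23, 85, 28, 2, 35, -12]})
# min-max normalization to [0, 1]
data['normalized'] = MinMaxScaler().fit_transform(data[['value']]).ravel()
# standardization: zero mean, unit variance (population std)
data['standardized'] = StandardScaler().fit_transform(data[['value']]).ravel()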

9. Extracting Dates

from datetime import date

data = pd.DataFrame({'date':
['01-01-2017',
'04-12-2008',
'23-06-1988',
'25-08-1999',
'20-02-1993',
]})

# convert the strings to datetime
data['date'] = pd.to_datetime(data.date, format="%d-%m-%Y")

# extract the year
data['year'] = data['date'].dt.year

# extract the month
data['month'] = data['date'].dt.month

# extract how many years have passed
data['passed_years'] = date.today().year - data['date'].dt.year

# extract how many months have passed
data['passed_months'] = (date.today().year - data['date'].dt.year) * 12 + date.today().month - data['date'].dt.month

# get the day of the week from the date
data['day_name'] = data['date'].dt.day_name()
        date  year  month  passed_years  passed_months   day_name
0 2017-01-01  2017      1             2             26     Sunday
1 2008-12-04  2008     12            11            123   Thursday
2 1988-06-23  1988      6            31            369   Thursday
3 1999-08-25  1999      8            20            235  Wednesday
4 1993-02-20  1993      2            26            313   Saturday
[Figure: data preprocessing]
[Figure: feature selection]
[Figure: feature dimensionality reduction]
