特征编码
- 1. 离散变量编码
- 1.1 标签专用 sklearn.preprocessing.LabelEncoder
- 1.1.1 单列编码
- 1.1.2 多标签特征同时编码(封装类方式)
- 1.2 特征专用(不能是一维) sklearn.preprocessing.OrdinalEncoder
- 1.3 独热编码(离散变量编码) sklearn.preprocessing.OneHotEncoder
- 1.3.1 原理 & 过程
- 1.3.2 封装函数
- 1.3.3 多维数据编码
- 1.3.4 一维数据编码 .reshape(-1, 1)
- 1.3.5 keras实现编码 keras.utils.np_utils.to_categorical(data)
- 1.4 string特征转int (df['f1'] == 'Yes').astype(int)
- 2. 连续变量分箱(连续变量编码)
- 2.1 原理
- 2.2 等宽分箱 KBinsDiscretizer(strategy='uniform')
- 2.3 等频分箱 KBinsDiscretizer(strategy='quantile')
- 2.4 聚类分箱 KBinsDiscretizer(strategy='kmeans')
- 2.5 连续变量二值化
- 2.5.1 sklearn.preprocessing.Binarizer
- 2.5.2 DataFrame简单操作 (df['f1'] > 阈值).astype(int)
- 2.6 Regression Class Cutoff(连续值转整数寻找最佳划分阈值)
1. 离散变量编码
import pandas as pd
# index_col=0 第0列是索引
data = pd.read_csv('./data_preprocessing.csv', index_col=0)
data_ = data.copy()
1.1 标签专用 sklearn.preprocessing.LabelEncoder
1.1.1 单列编码
# 标签专用
from sklearn.preprocessing import LabelEncoder
# data.loc[:,'Survived']
# .iloc[] indexloc不能用列标签
y = data_.iloc[:,-1]
le = LabelEncoder()
label = le.fit_transform(y)
data_.iloc[:,-1] = label
data_
Age | Sex | Embarked | Survived | |
0 | 22.0 | male | S | 0 |
1 | 38.0 | female | C | 2 |
2 | 26.0 | female | S | 2 |
3 | NaN | female | S | 2 |
4 | 35.0 | male | S | 0 |
5 | NaN | male | Q | 1 |
6 | 58.0 | male | Q | 0 |
7 | 20.0 | female | C | 2 |
8 | 2.0 | female | S | 1 |
1.1.2 多标签特征同时编码(封装类方式)
from sklearn.preprocessing import LabelEncoder
class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column is encoded independently with a fresh
    ``LabelEncoder``.

    NOTE(review): ``fit`` stores nothing, so each ``transform`` call re-fits
    the encoders on whatever data it receives. Transforming train and test
    separately can therefore map the same value to different integers --
    confirm this is acceptable for the caller.
    """

    def __init__(self, columns=None):
        # Column names to encode; None means "encode every column".
        self.columns = columns

    def fit(self, X, y=None):
        """Return ``self`` unchanged; present only for a fit/transform-style API."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        If ``self.columns`` was given, only those columns are encoded;
        otherwise every column of ``X`` is encoded in turn. ``X`` itself is
        not modified.
        """
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            # DataFrame.iteritems() was removed in pandas 2.0; items() is the
            # equivalent that exists in both old and new pandas versions.
            for colname, col in output.items():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self, X, y=None):
        """Call ``fit`` and then ``transform`` on the same data."""
        return self.fit(X, y).transform(X)
label_encoder = MultiColumnLabelEncoder(columns=columns_with_strings_as_values)
train = label_encoder.fit_transform(train)
test = label_encoder.transform(test)
1.2 特征专用(不能是一维) sklearn.preprocessing.OrdinalEncoder
# 特征专用(不能是一维)
from sklearn.preprocessing import OrdinalEncoder
# data_.iloc[:,-1] 范围最后一列n
# data_.iloc[:,1:-1] 范围[1, n)
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])
data_
Age | Sex | Embarked | Survived | |
0 | 22.0 | 1.0 | 2.0 | 0 |
1 | 38.0 | 0.0 | 0.0 | 2 |
2 | 26.0 | 0.0 | 2.0 | 2 |
3 | NaN | 0.0 | 2.0 | 2 |
4 | 35.0 | 1.0 | 2.0 | 0 |
5 | NaN | 1.0 | 1.0 | 1 |
6 | 58.0 | 1.0 | 1.0 | 0 |
7 | 20.0 | 0.0 | 0.0 | 2 |
8 | 2.0 | 0.0 | 2.0 | 1 |
1.3 独热编码(离散变量编码) sklearn.preprocessing.OneHotEncoder
1.3.1 原理 & 过程
- 原理
'''
名义变量:没有联系,不可互相计算。OneHotEncoder。舱门(S,C,Q)、性别;
有序变量:有联系,不可互相计算。OrdinalEncoder。学历(小学,高中,大学);
有距变量:有联系,可互相计算。重量(2kg,4kg,6kg).
单纯转为(1,2,3),自带数学性质影响建模。
OneHotEncoder独热编码,名义变量>>>哑变量
'S'[0, 'S'[[1,0,0],
'Q' 1, >>> 'Q' [0,1,0],
'C' 2] 'C' [0,0,1]]
'''
'''
二分类离散变量独热编码后,其中一列取值已知则另一列取值也随之确定(两列信息冗余)
OneHotEncoder(drop='if_binary') 对二分类变量只保留一列(0/1),多分类离散变量仍展开为多列
ID Gender ID Gender_F Gender_M
1 F 1 1 0
2 M >>> 2 0 1
3 M 3 0 1
4 F 4 1 0
ID Gender Income ID Gender Income_High Income_medium Income_Low
1 F High 1 0 1 0 0
2 M Medium >>> 2 1 0 1 0
3 M High 3 1 1 0 0
4 F Low 4 0 0 0 1
'''
- 数据
X = pd.DataFrame({'Gender': ['F', 'M', 'M', 'F'],
'Income': ['High', 'Medium', 'High', 'Low']})
X
Gender | Income | |
0 | F | High |
1 | M | Medium |
2 | M | High |
3 | F | Low |
- 代码
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(drop='if_binary')
enc.fit_transform(X).toarray()
'''
array([[0., 1., 0., 0.],
[1., 0., 0., 1.],
[1., 1., 0., 0.],
[0., 0., 1., 0.]])
'''
# 转换规则
'''
二分类 F >>> 0,M >>> 1
多分类 第一列High,第二列Low,第三列Medium
'''
enc.categories_
'''
[array(['F', 'M'], dtype=object),
array(['High', 'Low', 'Medium'], dtype=object)]
'''
# 编码后命名列 原列名_字段取值
# 原始列名
cate_cols = X.columns.tolist()
cate_cols
'''
['Gender', 'Income']
'''
# Collects the column names of the one-hot encoded output
cate_cols_new = []
# Walk the original columns together with their position in enc.categories_
for idx, colname in enumerate(cate_cols):
    if len(enc.categories_[idx]) != 2:
        # Multi-class column: one output column per category value
        for f in enc.categories_[idx]:
            feature_name = colname + '_' + f
            cate_cols_new.append(feature_name)
    else:
        # Binary column: a single output column that keeps the original name
        cate_cols_new.append(colname)
cate_cols_new
'''
['Gender', 'Income_High', 'Income_Low', 'Income_Medium']
'''
# 组合成新DataFrame
pd.DataFrame(enc.fit_transform(X).toarray()
,columns=cate_cols_new)
Gender | Income_High | Income_Low | Income_Medium | |
0 | 0.0 | 1.0 | 0.0 | 0.0 |
1 | 1.0 | 0.0 | 0.0 | 1.0 |
2 | 1.0 | 1.0 | 0.0 | 0.0 |
3 | 0.0 | 0.0 | 1.0 | 0.0 |
1.3.2 封装函数
def cate_colName(Transformer, category_cols, drop='if_binary'):
    """Build the output column names for a fitted one-hot encoder.

    :param Transformer: fitted one-hot encoder; only its ``categories_``
        attribute (one sequence of category values per input column) is read
    :param category_cols: original column names, in the same order as
        ``Transformer.categories_``
    :param drop: the ``drop`` argument the encoder was created with
    :return: list of names; a two-category column keeps its original name
        when ``drop == 'if_binary'``, every other column expands to
        ``<column>_<category>`` for each category value
    """
    cate_cols_new = []
    col_value = Transformer.categories_

    # Iterate the names passed in by the caller rather than a module-level
    # variable, so the function works outside the notebook session.
    for idx, colname in enumerate(category_cols):
        if len(col_value[idx]) == 2 and drop == 'if_binary':
            # Binary column collapsed to a single 0/1 column
            cate_cols_new.append(colname)
        else:
            # str() keeps this working for numeric category values
            for f in col_value[idx]:
                cate_cols_new.append(colname + '_' + str(f))

    return cate_cols_new
cate_colName(enc, cate_cols)
'''
['Gender', 'Income_High', 'Income_Low', 'Income_Medium']
'''
1.3.3 多维数据编码
# 独热编码
from sklearn.preprocessing import OneHotEncoder
X = data.iloc[:,1:-1] # 不能是一维
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
result # 5列 Sex2+Embarked3
'''
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.],
[1., 0., 0., 0., 1.],
[0., 1., 0., 0., 1.],
[0., 1., 0., 1., 0.],
[0., 1., 0., 1., 0.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
'''
# 更新到原数据
# axis=1 在1维相连
newdata = pd.concat([data, pd.DataFrame(result)], axis=1)
newdata.drop(['Sex','Embarked'], axis=1, inplace=True)
newdata
Age | Survived | 0 | 1 | 2 | 3 | 4 | |
0 | 22.0 | No | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
1 | 38.0 | Yes | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
2 | 26.0 | Yes | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | NaN | Yes | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 35.0 | No | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
5 | NaN | Unknown | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
6 | 58.0 | No | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
7 | 20.0 | Yes | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
8 | 2.0 | Unknown | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. The encoder was fitted on a
# DataFrame, so the generated names are prefixed with the original column
# names (e.g. 'Sex_female') instead of the positional 'x0_' / 'x1_' prefixes
# that the old method printed.
print(enc.get_feature_names_out())
newdata.columns = ['Age','Survived','female','male','Embarked_C','Embarked_Q','Embarked_S']
newdata
[‘x0_female’ ‘x0_male’ ‘x1_C’ ‘x1_Q’ ‘x1_S’]
Age | Survived | female | male | Embarked_C | Embarked_Q | Embarked_S | |
0 | 22.0 | No | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
1 | 38.0 | Yes | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
2 | 26.0 | Yes | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
3 | NaN | Yes | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
4 | 35.0 | No | 0.0 | 1.0 | 0.0 | 0.0 | 1.0 |
5 | NaN | Unknown | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
6 | 58.0 | No | 0.0 | 1.0 | 0.0 | 1.0 | 0.0 |
7 | 20.0 | Yes | 1.0 | 0.0 | 1.0 | 0.0 | 0.0 |
8 | 2.0 | Unknown | 1.0 | 0.0 | 0.0 | 0.0 | 1.0 |
1.3.4 一维数据编码 .reshape(-1, 1)
from sklearn.preprocessing import OneHotEncoder
# (n, )维数据.reshape(-1, 1) >>> (n, 1)
onehot = OneHotEncoder().fit_transform(Y_train.reshape(-1, 1))
# 转array类型, 才是需要的结果
onehot = onehot.toarray()
print('onehot', onehot)
1.3.5 keras实现编码 keras.utils.np_utils.to_categorical(data)
from sklearn.preprocessing import LabelEncoder
from keras.utils import np_utils
# Y holds the raw class labels 1, 2, 3
Y
# Map the labels to consecutive integers 0, 1, 2
encoder = LabelEncoder()
Y_labelencoded = encoder.fit_transform(Y)
# One-hot encode the integer labels; the result is a numpy.ndarray.
# Pass the variable assigned above (the original referenced an undefined name).
# NOTE(review): recent Keras releases expose this function as
# keras.utils.to_categorical; np_utils may be unavailable there -- confirm
# against the installed version.
Y_onehot = np_utils.to_categorical(Y_labelencoded)
# 和上面方法的结果是完全一样的
1.4 string特征转int (df['f1'] == 'Yes').astype(int)
'''
== 'Yes'返回一串TrueFalse
.astype(int)转换成1 0
'''
original_data['Attrition'] = (original_data['Attrition'] == 'Yes').astype(int)
2. 连续变量分箱(连续变量编码)
2.1 原理
'''
字段 连续型 >>> 离散型
减少异常值影响,消除特征量纲影响
对于线性模型来说引入非线性因素,提升模型表现
对于树模型来说损失连续变量信息,影响模型效果
[0,30)->0 [30,60)->1 [60,inf)->2
ID Income ID Income_Level
1 0 1 0
2 10 2 0
3 180 >>> 3 2
4 30 4 1
5 55 5 1
'''
'''
等宽分箱 uniform 一定程度受异常值影响
等频分箱 quantile 完全忽略异常值影响
聚类分箱 kmeans 兼顾变量原始数值分布,优先考虑
'''
2.2 等宽分箱 KBinsDiscretizer(strategy='uniform')
# 等宽分箱
# 根据连续变量的取值范围,划分宽度相等的区间
income = np.array([0, 10, 180, 30, 55, 35, 25, 75, 80, 10]).reshape(-1, 1)
income
'''
array([[ 0],
[ 10],
[180],
[ 30],
[ 55],
[ 35],
[ 25],
[ 75],
[ 80],
[ 10]])
'''
from sklearn.preprocessing import KBinsDiscretizer
'''
KBinsDiscretizer转化器 (discrete离散的)
n_bins 分箱个数
strategy 分箱方式
'uniform' 等宽分箱
'quantile' 等频分箱
'kmeans' 聚类分箱
encode 分箱后的离散字段进一步编码方式
'ordinal' 自然数编码(每个箱子对应一个整数,返回一列)
'onehot' 独热编码(每个箱子对应一列,返回稀疏矩阵;'onehot-dense' 返回稠密数组)
'''
dis = KBinsDiscretizer(n_bins=3, strategy='uniform', encode='ordinal')
dis.fit_transform(income)
'''
array([[0.],
[0.],
[2.],
[0.],
[0.],
[0.],
[0.],
[1.],
[1.],
[0.]])
'''
# 查看分箱边界
dis.bin_edges_
'''
array([array([ 0., 60., 120., 180.])], dtype=object)
'''
2.3 等频分箱 KBinsDiscretizer(strategy='quantile')
'''
根据分箱数和连续变量数,划分样本数量相等的区间
若样本数无法整除箱数,最后一个箱子包含余数样本(10/3 -> 3/3/4).
'''
np.sort(income.flatten(), axis=0) # 分两个箱的话会以32.5划分
'''
array([ 0, 10, 10, 25, 30, 35, 55, 75, 80, 180])
'''
dis = KBinsDiscretizer(n_bins=3, strategy='quantile', encode='ordinal')
dis.fit_transform(income)
'''
array([[0.],
[0.],
[2.],
[1.],
[1.],
[1.],
[0.],
[2.],
[2.],
[0.]])
'''
# 查看分箱边界
dis.bin_edges_
'''
array([array([ 0., 25., 55., 180.])], dtype=object)
'''
2.4 聚类分箱 KBinsDiscretizer(strategy='kmeans')
# 对连续变量进行聚类(多KMeans聚类),按样本所属类别作为标记代替原始值
from sklearn import cluster
kmeans = cluster.KMeans(n_clusters=3)
kmeans.fit(income)
kmeans.labels_
'''
array([0, 0, 1, 0, 2, 0, 0, 2, 2, 0], dtype=int32)
'''
dis = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
dis.fit_transform(income) # 分类结果和上面相同但更合理,小数字更能体现收入水平低
'''
array([[0.],
[0.],
[2.],
[0.],
[1.],
[0.],
[0.],
[1.],
[1.],
[0.]])
'''
dis.bin_edges_
'''
array([array([ 0. , 44.16666667, 125. , 180. ])],
dtype=object)
'''
2.5 连续变量二值化
2.5.1 sklearn.preprocessing.Binarizer
# Age二值化
data_2 = data.copy()
data_2.loc[:,'Age'] = data_2.loc[:,'Age'].fillna(data_2.loc[:,'Age'].mean())
X = data_2.iloc[:,0].values.reshape(-1,1)
X
'''
array([[22. ],
[38. ],
[26. ],
[28.71428571],
[35. ],
[28.71428571],
[58. ],
[20. ],
[ 2. ]])
'''
from sklearn.preprocessing import Binarizer
transformer = Binarizer(threshold=25).fit_transform(X)
transformer # 25为边界
'''
array([[0.],
[1.],
[1.],
[1.],
[1.],
[1.],
[1.],
[0.],
[0.]])
'''
2.5.2 DataFrame简单操作 (df['f1'] > 阈值).astype(int)
student['age'] = (student['age'] > 12).astype(int)
2.6 Regression Class Cutoff(连续值转整数寻找最佳划分阈值)
【notebook】xgboost, cudf, Regression Class Cutoff(连续值转类别整数划分最佳阈值)
from functools import partial
'''
回归预测的连续值通过划分归为固定类别
找到最佳的划分阈值
'''
class OptimizedRounder(object):
    """Find the thresholds that turn continuous predictions into classes 3-8.

    Five thresholds split the prediction axis into six classes (3, 4, 5, 6,
    7, 8). ``fit`` searches for the thresholds that maximise the quadratic
    weighted Cohen's kappa between the resulting classes and the true labels.
    """

    def __init__(self):
        # Placeholder until fit() stores the optimizer result here.
        self.coef_ = 0

    @staticmethod
    def _apply_thresholds(X, coef):
        """Map each value of ``X`` to a class in 3..8 using ``coef[0..4]``.

        A value gets the class of the first threshold it is strictly below
        (3 for ``coef[0]`` ... 7 for ``coef[4]``); values below none of the
        thresholds, including NaN, get class 8. Thresholds are tested in
        index order, so the result is well defined even when the optimizer
        proposes unsorted thresholds.
        """
        values = np.asarray(X)
        below = [values < coef[k] for k in range(5)]
        # np.select picks the choice of the first true condition per element,
        # which matches an if/elif chain evaluated in threshold order.
        return np.select(below, [3, 4, 5, 6, 7], default=8)

    def _kappa_loss(self, coef, X, y):
        """Return the negated quadratic weighted kappa for thresholds ``coef``.

        The score is negated because the optimizer minimises its objective,
        while a higher kappa means better agreement.
        """
        X_p = self._apply_thresholds(X, coef)
        ll = cohen_kappa_score(y, X_p, weights='quadratic')
        return -ll

    def fit(self, X, y):
        """Optimise the thresholds on predictions ``X`` against labels ``y``.

        ``X`` and ``y`` are bound into the loss so that only the thresholds
        vary. The search starts from 3.5, 4.5, 5.5, 6.5, 7.5 and uses the
        Nelder-Mead method; the full optimizer result is stored in
        ``self.coef_``. Returns ``self`` so calls can be chained.
        """
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        return self

    def predict(self, X, coef=None):
        """Return integer class labels for predictions ``X``.

        :param X: continuous predictions
        :param coef: five thresholds; when omitted, the thresholds found by
            ``fit`` are used
        :return: integer array of classes in 3..8
        """
        if coef is None:
            coef = self.coefficients()
        return self._apply_thresholds(X, coef).astype('int')

    def coefficients(self):
        """Return the fitted thresholds (the ``'x'`` entry of the optimizer result)."""
        return self.coef_['x']
下面代码是在10折交叉验证中的某一个循环
# Fit the class-boundary thresholds on this fold's predictions and true targets
optR = OptimizedRounder()
optR.fit(xgb_valid_preds, val_target)
# Convert this fold's validation predictions to class labels using the fitted thresholds
temp_oof = optR.predict(xgb_valid_preds, optR.coefficients())