python离散变量等频分箱 python离散变量编码

转载

小鱼儿 2023-09-05 13:44:42

文章标签 python离散变量等频分箱 python pandas sklearn High 文章分类 Python 后端开发

特征编码

1. 离散变量编码

1.1 标签专用 sklearn.preprocessing.LabelEncoder

1.1.1 单列编码
1.1.2 多标签特征同时编码(封装类方式)

1.2 特征专用(不能是一维) sklearn.preprocessing.OrdinalEncoder
1.3 独热编码(离散变量编码) sklearn.preprocessing.OneHotEncoder

1.3.1 原理 & 过程
1.3.2 封装函数
1.3.3 多维数据编码
1.3.4 一维数据编码 .reshape(-1, 1)
1.3.5 keras实现编码 keras.utils.np_utils.to_categorical(data)

1.4 string特征转int (df['f1'] == 'Yes').astype(int)

2. 连续变量分箱(连续变量编码)

2.1 原理
2.2 等宽分箱 KBinsDiscretizer(strategy='uniform')
2.3 等频分箱 KBinsDiscretizer(strategy='quantile')
2.4 聚类分箱 KBinsDiscretizer(strategy='kmeans')
2.5 连续变量二值化

2.5.1 sklearn.preprocessing.Binarizer
2.5.2 DataFrame简单操作 (df['f1'] > 阈值).astype(int)

2.6 Regression Class Cutoff(连续值转整数寻找最佳划分阈值)

1. 离散变量编码

import pandas as pd
# index_col=0: use the first CSV column as the row index
data = pd.read_csv('./data_preprocessing.csv', index_col=0)
# Keep `data` as loaded; this copy is the frame that gets modified in place
data_ = data.copy()

1.1 标签专用 sklearn.preprocessing.LabelEncoder

1.1.1 单列编码

# LabelEncoder is intended for the label (target) column
from sklearn.preprocessing import LabelEncoder

# Label-based alternative: data.loc[:,'Survived']
# .iloc[] is position-based and cannot be given column labels
y = data_.iloc[:,-1]
le = LabelEncoder()
label = le.fit_transform(y)
# Overwrite the last column with its encoded values
data_.iloc[:,-1] = label
data_

	Age	Sex	Embarked	Survived
0	22.0	male	S	0
1	38.0	female	C	2
2	26.0	female	S	2
3	NaN	female	S	2
4	35.0	male	S	0
5	NaN	male	Q	1
6	58.0	male	Q	0
7	20.0	female	C	2
8	2.0	female	S	1

1.1.2 多标签特征同时编码(封装类方式)

from sklearn.preprocessing import LabelEncoder


class MultiColumnLabelEncoder:
    """Label-encode several DataFrame columns in one call.

    Every selected column gets its own ``LabelEncoder``. The encoders are
    learned in ``fit`` and reused by ``transform``, so a frame transformed
    later (for example a test set) receives the same integer codes as the
    frame the encoder was fitted on.
    """

    def __init__(self, columns=None):
        # Names of the columns to encode; None means "every column".
        self.columns = columns

    def _target_columns(self, X):
        """Return the columns of ``X`` that this encoder is responsible for."""
        return list(X.columns) if self.columns is None else self.columns

    def fit(self, X, y=None):
        """Learn one ``LabelEncoder`` per selected column of ``X``.

        :param X: DataFrame containing the columns to encode
        :param y: unused, accepted for scikit-learn style call compatibility
        :return: self
        """
        self.encoders_ = {
            col: LabelEncoder().fit(X[col]) for col in self._target_columns(X)
        }
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the selected columns label-encoded.

        If ``columns`` was given, exactly those columns are encoded;
        otherwise every column of ``X`` is encoded. A column for which an
        encoder was learned in ``fit`` is encoded with that encoder (the
        underlying ``LabelEncoder`` rejects values it did not see during
        fit). A column without a fitted encoder is fitted on the fly, which
        is what happens when ``transform`` is called on an unfitted instance.

        :param X: DataFrame to encode; it is not modified
        :return: encoded copy of ``X``
        """
        output = X.copy()
        encoders = getattr(self, 'encoders_', {})
        for col in self._target_columns(output):
            encoder = encoders.get(col)
            if encoder is None:
                # No mapping learned for this column: encode it by itself.
                output[col] = LabelEncoder().fit_transform(output[col])
            else:
                output[col] = encoder.transform(output[col])
        return output

    def fit_transform(self, X, y=None):
        """Fit on ``X`` and return its encoded copy."""
        return self.fit(X, y).transform(X)

label_encoder = MultiColumnLabelEncoder(columns=columns_with_strings_as_values)
train = label_encoder.fit_transform(train)
test = label_encoder.transform(test)

1.2 特征专用(不能是一维) sklearn.preprocessing.OrdinalEncoder

# OrdinalEncoder is intended for feature columns (input must not be 1-D)
from sklearn.preprocessing import OrdinalEncoder

# data_.iloc[:,-1]   selects the last column
# data_.iloc[:,1:-1] selects the columns from position 1 up to, but excluding, the last one
data_.iloc[:,1:-1] = OrdinalEncoder().fit_transform(data_.iloc[:,1:-1])
data_

	Age	Sex	Embarked	Survived
0	22.0	1.0	2.0	0
1	38.0	0.0	0.0	2
2	26.0	0.0	2.0	2
3	NaN	0.0	2.0	2
4	35.0	1.0	2.0	0
5	NaN	1.0	1.0	1
6	58.0	1.0	1.0	0
7	20.0	0.0	0.0	2
8	2.0	0.0	2.0	1

1.3 独热编码(离散变量编码) sklearn.preprocessing.OneHotEncoder

1.3.1 原理 & 过程

原理

'''
名义变量：没有联系，不可互相计算。OneHotEncoder。舱门(S,C,Q)、性别;
有序变量：有联系，不可互相计算。OrdinalEncoder。学历(小学,高中,大学);
有距变量：有联系，可互相计算。重量(2kg,4kg,6kg).
单纯转为(1,2,3),自带数学性质影响建模。
OneHotEncoder独热编码，名义变量>>>哑变量
'S'[0,          'S'[[1,0,0],
'Q' 1,   >>>    'Q' [0,1,0], 
'C' 2]          'C' [0,0,1]]
'''

'''
二分类离散变量做独热编码后，一列取值已知则另一列取值也随之确定，因此只需保留一列；
OneHotEncoder(drop='if_binary') 对二分类变量只保留一列(0/1)，多分类变量仍展开为多列
ID Gender     ID Gender_F Gender_M
1  F          1  1        0
2  M     >>>  2  0        1
3  M          3  0        1
4  F          4  1        0
ID Gender Income     ID Gender Income_High Income_Medium Income_Low
1  F      High       1  0      1           0             0 
2  M      Medium >>> 2  1      0           1             0
3  M      High       3  1      1           0             0
4  F      Low        4  0      0           0             1
'''

数据

X = pd.DataFrame({'Gender': ['F', 'M', 'M', 'F'],
                  'Income': ['High', 'Medium', 'High', 'Low']})
X

	Gender	Income
0	F	High
1	M	Medium
2	M	High
3	F	Low

代码

from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(drop='if_binary')
enc.fit_transform(X).toarray()
'''
    array([[0., 1., 0., 0.],
           [1., 0., 0., 1.],
           [1., 1., 0., 0.],
           [0., 0., 1., 0.]])
'''

# 转换规则
'''
二分类 F >>> 0,M >>> 1
多分类 第一列High，第二列Low，第三列Medium
'''
enc.categories_
'''
    [array(['F', 'M'], dtype=object),
     array(['High', 'Low', 'Medium'], dtype=object)]
'''

# 编码后命名列 原列名_字段取值
# 原始列名
cate_cols = X.columns.tolist()
cate_cols
'''
    ['Gender', 'Income']
'''

# Names of the columns produced by the one-hot encoder, in output order
cate_cols_new = []
for idx, colname in enumerate(cate_cols):
    levels = enc.categories_[idx]
    if len(levels) != 2:
        # Multi-class variable: one output column per category value
        cate_cols_new.extend(colname + '_' + level for level in levels)
        continue
    # Binary variable: a single output column that keeps the original name
    cate_cols_new.append(colname)
cate_cols_new
'''
    ['Gender', 'Income_High', 'Income_Low', 'Income_Medium']
'''

# 组合成新DataFrame
pd.DataFrame(enc.fit_transform(X).toarray()
            ,columns=cate_cols_new)

	Gender	Income_High	Income_Low	Income_Medium
0	0.0	1.0	0.0	0.0
1	1.0	0.0	0.0	1.0
2	1.0	1.0	0.0	0.0
3	0.0	0.0	1.0	0.0

1.3.2 封装函数

def cate_colName(Transformer, category_cols, drop='if_binary'):
    """
    Build the column names of a one-hot encoded frame.

    A two-category column keeps its original name when ``drop`` is
    ``'if_binary'`` (it is encoded as a single column); every other column
    yields one ``<column>_<category>`` name per category.

    :param Transformer: fitted one-hot encoder; only ``categories_`` is read
    :param category_cols: original column names, in the encoder's column order
    :param drop: the ``drop`` argument the encoder was created with
    :return: list with the names of the encoded columns
    """
    cate_cols_new = []
    col_value = Transformer.categories_
    # Iterate over the names passed in, not over a module-level variable.
    for idx, colname in enumerate(category_cols):
        levels = col_value[idx]
        if len(levels) == 2 and drop == 'if_binary':
            cate_cols_new.append(colname)
        else:
            # format() also copes with non-string categories (e.g. integers)
            cate_cols_new.extend('{}_{}'.format(colname, level) for level in levels)
    return cate_cols_new

cate_colName(enc, cate_cols)
'''
    ['Gender', 'Income_High', 'Income_Low', 'Income_Medium']
'''

1.3.3 多维数据编码

# 独热编码
from sklearn.preprocessing import OneHotEncoder
X = data.iloc[:,1:-1] # 不能是一维
enc = OneHotEncoder(categories='auto').fit(X)
result = enc.transform(X).toarray()
result # 5列 Sex2+Embarked3 
'''
   array([[0., 1., 0., 0., 1.],
           [1., 0., 1., 0., 0.],
           [1., 0., 0., 0., 1.],
           [1., 0., 0., 0., 1.],
           [0., 1., 0., 0., 1.],
           [0., 1., 0., 1., 0.],
           [0., 1., 0., 1., 0.],
           [1., 0., 1., 0., 0.],
           [1., 0., 0., 0., 1.]])
'''

# Attach the encoded columns to the original data
# axis=1 concatenates along axis 1, i.e. side by side as new columns
newdata = pd.concat([data, pd.DataFrame(result)], axis=1)
# Remove the original text columns that the encoded columns replace
newdata.drop(['Sex','Embarked'], axis=1, inplace=True)
newdata

	Age	Survived	0	1	2	3	4
0	22.0	No	0.0	1.0	0.0	0.0	1.0
1	38.0	Yes	1.0	0.0	1.0	0.0	0.0
2	26.0	Yes	1.0	0.0	0.0	0.0	1.0
3	NaN	Yes	1.0	0.0	0.0	0.0	1.0
4	35.0	No	0.0	1.0	0.0	0.0	1.0
5	NaN	Unknown	0.0	1.0	0.0	1.0	0.0
6	58.0	No	0.0	1.0	0.0	1.0	0.0
7	20.0	Yes	1.0	0.0	1.0	0.0	0.0
8	2.0	Unknown	1.0	0.0	0.0	0.0	1.0

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. NOTE(review): for an encoder fitted on a DataFrame the
# returned names are expected to carry the input column names (Sex_..., Embarked_...)
# instead of the legacy x0_/x1_ prefixes - confirm against the installed version.
print(enc.get_feature_names_out())
newdata.columns = ['Age','Survived','female','male','Embarked_C','Embarked_Q','Embarked_S']
newdata

['x0_female' 'x0_male' 'x1_C' 'x1_Q' 'x1_S']

	Age	Survived	female	male	Embarked_C	Embarked_Q	Embarked_S
0	22.0	No	0.0	1.0	0.0	0.0	1.0
1	38.0	Yes	1.0	0.0	1.0	0.0	0.0
2	26.0	Yes	1.0	0.0	0.0	0.0	1.0
3	NaN	Yes	1.0	0.0	0.0	0.0	1.0
4	35.0	No	0.0	1.0	0.0	0.0	1.0
5	NaN	Unknown	0.0	1.0	0.0	1.0	0.0
6	58.0	No	0.0	1.0	0.0	1.0	0.0
7	20.0	Yes	1.0	0.0	1.0	0.0	0.0
8	2.0	Unknown	1.0	0.0	0.0	0.0	1.0

1.3.4 一维数据编码 .reshape(-1, 1)

from sklearn.preprocessing import OneHotEncoder

# reshape(-1, 1) turns data of shape (n,) into shape (n, 1)
onehot = OneHotEncoder().fit_transform(Y_train.reshape(-1, 1))
# Convert the encoder output to an array, which is the form needed here
onehot = onehot.toarray()
print('onehot', onehot)

1.3.5 keras实现编码 keras.utils.np_utils.to_categorical(data)

from sklearn.preprocessing import LabelEncoder
# to_categorical is exposed directly on keras.utils; the np_utils module is
# not shipped by recent Keras releases.
from keras.utils import to_categorical

# Y holds the class labels 1, 2, 3
Y

# Map the labels onto 0, 1, 2
encoder = LabelEncoder()
Y_labelencoded = encoder.fit_transform(Y)
# One-hot encode the integer labels (the variable fitted just above);
# the result is a numpy.ndarray
Y_onehot = to_categorical(Y_labelencoded)

# 和上面方法的结果是完全一样的

1.4 string特征转int (df['f1'] == 'Yes').astype(int)

'''
== 'Yes'返回一串TrueFalse 
.astype(int)转换成1 0
'''
original_data['Attrition'] = (original_data['Attrition'] == 'Yes').astype(int)

2. 连续变量分箱(连续变量编码)

2.1 原理

'''
字段 连续型 >>> 离散型
减少异常值影响，消除特征量纲影响
对于线性模型来说引入非线性因素，提升模型表现
对于树模型来说损失连续变量信息，影响模型效果

[0,30)->0 [30,60)->1 [60,inf)->2
ID Income       ID Income_Level
1  0            1  0
2  10           2  0
3  180   >>>    3  2
4  30           4  1
5  55           5  1
'''
'''
等宽分箱 uniform 一定程度受异常值影响
等频分箱 quantile 完全忽略异常值影响
聚类分箱 kmeans 兼顾变量原始数值分布,优先考虑
'''

2.2 等宽分箱 KBinsDiscretizer(strategy='uniform')

import numpy as np

# Equal-width binning:
# split the value range of a continuous variable into intervals of equal width.
# The sample data is a single feature, shaped (n, 1) as the discretizer expects.
income = np.array([0, 10, 180, 30, 55, 35, 25, 75, 80, 10]).reshape(-1, 1)
income
'''
    array([[  0],
           [ 10],
           [180],
           [ 30],
           [ 55],
           [ 35],
           [ 25],
           [ 75],
           [ 80],
           [ 10]])
'''

from sklearn.preprocessing import KBinsDiscretizer
'''
KBinsDiscretizer转化器 (discrete离散的)
    n_bins 分箱个数
    strategy 分箱方式
        'uniform' 等宽分箱
        'quantile' 等频分箱
        'kmeans' 聚类分箱
    encode 分箱后的离散字段进一步编码方式
        'ordinal' 二分类-自然数编码
        'onehot' 多分类-独热编码
'''

dis = KBinsDiscretizer(n_bins=3, strategy='uniform', encode='ordinal')
dis.fit_transform(income)
'''
    array([[0.],
           [0.],
           [2.],
           [0.],
           [0.],
           [0.],
           [0.],
           [1.],
           [1.],
           [0.]])
'''

# 查看分箱边界
dis.bin_edges_
'''
    array([array([  0.,  60., 120., 180.])], dtype=object)
'''

2.3 等频分箱 KBinsDiscretizer(strategy='quantile')

'''
根据分箱数和连续变量数，划分样本数量相等的区间
若样本数无法整除箱数，最后一个箱子包含余数样本(10/3 -> 3/3/4).
'''
np.sort(income.flatten(), axis=0) # 分两个箱的话会以32.5划分
'''
    array([  0,  10,  10,  25,  30,  35,  55,  75,  80, 180])
'''

dis = KBinsDiscretizer(n_bins=3, strategy='quantile', encode='ordinal')
dis.fit_transform(income)
'''
    array([[0.],
           [0.],
           [2.],
           [1.],
           [1.],
           [1.],
           [0.],
           [2.],
           [2.],
           [0.]])
'''

# 查看分箱边界
dis.bin_edges_
'''
    array([array([  0.,  25.,  55., 180.])], dtype=object)
'''

2.4 聚类分箱 KBinsDiscretizer(strategy='kmeans')

# 对连续变量进行聚类(多KMeans聚类)，按样本所属类别作为标记代替原始值
from sklearn import cluster

kmeans = cluster.KMeans(n_clusters=3)
kmeans.fit(income)
kmeans.labels_
'''
    array([0, 0, 1, 0, 2, 0, 0, 2, 2, 0], dtype=int32)
'''

dis = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='kmeans')
dis.fit_transform(income) # 分类结果和上面相同但更合理，小数字更能体现收入水平低
'''
    array([[0.],
           [0.],
           [2.],
           [0.],
           [1.],
           [0.],
           [0.],
           [1.],
           [1.],
           [0.]])
'''

dis.bin_edges_
'''
    array([array([  0.        ,  44.16666667, 125.        , 180.        ])],
          dtype=object)
'''

2.5 连续变量二值化

2.5.1 sklearn.preprocessing.Binarizer

# Age二值化
data_2 = data.copy()
data_2.loc[:,'Age'] = data_2.loc[:,'Age'].fillna(data_2.loc[:,'Age'].mean())
X = data_2.iloc[:,0].values.reshape(-1,1)
X
'''
    array([[22.        ],
           [38.        ],
           [26.        ],
           [28.71428571],
           [35.        ],
           [28.71428571],
           [58.        ],
           [20.        ],
           [ 2.        ]])
'''

from sklearn.preprocessing import Binarizer

transformer = Binarizer(threshold=25).fit_transform(X)
transformer # 25为边界
'''

    array([[0.],
           [1.],
           [1.],
           [1.],
           [1.],
           [1.],
           [1.],
           [0.],
           [0.]])
'''

2.5.2 DataFrame简单操作 (df['f1'] > 阈值).astype(int)

student['age'] = (student['age'] > 12).astype(int)

2.6 Regression Class Cutoff(连续值转整数寻找最佳划分阈值)

【notebook】xgboost, cudf, Regression Class Cutoff(连续值转类别整数划分最佳阈值)

from functools import partial

import numpy as np
from scipy import optimize


class OptimizedRounder(object):
    """Search for the thresholds that turn continuous predictions into classes.

    Continuous regression outputs are bucketed into the integer classes
    3..8 by five thresholds. ``fit`` looks for the thresholds that maximise
    the quadratic-weighted Cohen's kappa between the bucketed predictions
    and the true labels.
    """

    def __init__(self):
        # Replaced by the optimizer result in fit(); 0 means "not fitted yet".
        self.coef_ = 0

    @staticmethod
    def _bucketize(X, coef):
        """Return a copy of ``X`` with each value replaced by its class.

        A value gets class 3 if it is below ``coef[0]``, otherwise class 4 if
        it is below ``coef[1]``, and so on; values not below any of the five
        thresholds get class 8. The result keeps the dtype of ``X``.
        """
        X_p = np.copy(X)
        # np.select picks the first matching condition, which mirrors an
        # if/elif chain and therefore does not need the thresholds sorted.
        below = [X_p < coef[k] for k in range(5)]
        return np.select(below, [3, 4, 5, 6, 7], default=8).astype(X_p.dtype)

    def _kappa_loss(self, coef, X, y):
        """Return the negated quadratic-weighted kappa for thresholds ``coef``.

        :param coef: the five candidate thresholds
        :param X: continuous predictions
        :param y: true class labels
        """
        # Imported here so that defining the class and calling predict() do
        # not require scikit-learn to be importable.
        from sklearn.metrics import cohen_kappa_score

        X_p = self._bucketize(X, coef)
        # Negated because the optimizer minimises: a smaller loss is a larger kappa.
        return -cohen_kappa_score(y, X_p, weights='quadratic')

    def fit(self, X, y):
        """Optimise the thresholds on predictions ``X`` and true labels ``y``.

        The optimizer result is stored in ``coef_``.

        :return: self
        """
        # Fix X and y so that the optimizer only varies the thresholds.
        loss_partial = partial(self._kappa_loss, X=X, y=y)
        initial_coef = [3.5, 4.5, 5.5, 6.5, 7.5]
        self.coef_ = optimize.minimize(loss_partial, initial_coef, method='nelder-mead')
        return self

    def predict(self, X, coef=None):
        """Bucket the continuous predictions ``X`` into integer classes.

        :param X: continuous predictions
        :param coef: the five thresholds; defaults to the fitted ones
        :return: integer array of classes in 3..8
        """
        if coef is None:
            coef = self.coefficients()
        return self._bucketize(X, coef).astype('int')

    def coefficients(self):
        """Return the thresholds found by ``fit`` (only valid after ``fit``)."""
        return self.coef_['x']

下面代码是在10折交叉验证中的某一个循环

# Find the best class thresholds from the predictions and the true values
optR = OptimizedRounder()
optR.fit(xgb_valid_preds, val_target)
# This round's predictions on the validation set, bucketed with the fitted thresholds
temp_oof = optR.predict(xgb_valid_preds, optR.coefficients())

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：mysql 大表关联小表 mysql数据表关联

下一篇：android nfc权限请求 nfc无法获取root权限

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯

python离散变量等频分箱 python离散变量编码

python离散变量等频分箱 python离散变量编码

特征编码

1. 离散变量编码

1.1 标签专用 sklearn.preprocessing.LabelEncoder

1.1.1 单列编码

1.1.2 多标签特征同时编码(封装类方式)

1.2 特征专用(不能是一维) sklearn.preprocessing.OrdinalEncoder

1.3 独热编码(离散变量编码) sklearn.preprocessing.OneHotEncoder

1.3.1 原理 & 过程

1.3.2 封装函数

1.3.3 多维数据编码

1.3.4 一维数据编码 .reshape(-1, 1)

1.3.5 keras实现编码 keras.utils.np_utils.to_categorical(data)

1.4 string特征转int (df['f1'] == 'Yes').astype(int)

2. 连续变量分箱(连续变量编码)

2.1 原理

2.2 等宽分箱 KBinsDiscretizer(strategy='uniform')

2.3 等频分箱 KBinsDiscretizer(strategy='quantile')

2.4 聚类分箱 KBinsDiscretizer(strategy='kmeans')

2.5 连续变量二值化

2.5.1 sklearn.preprocessing.Binarizer

2.5.2 DataFrame简单操作 (df['f1'] > 阈值).astype(int)

2.6 Regression Class Cutoff(连续值转整数寻找最佳划分阈值)

51CTO博客