特征预处理

转载

mob604756ec7b7c 2016-08-10 20:28:00

# -*- coding: utf-8 -*-
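"""
Feature preprocessing examples with scikit-learn.

Walks through standardization, min-max scaling, normalization, binarization,
one-hot encoding, missing-value imputation, polynomial features and
FunctionTransformer, printing the result of each step.
"""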

import numpy as np
from sklearn.preprocessing import StandardScaler

# Module 1: standardization
# Making features dimensionless brings data measured on different scales onto
# a common scale. The usual approaches are standardization and interval
# (min-max) scaling.
# Standardization assumes each feature is normally distributed; after the
# transform the feature follows the standard normal distribution.

x = np.array([[1, 0, 1], [0, 1, 0], [1, 0, 0]], dtype=float)

# fit_transform learns the per-column mean and standard deviation and applies
# (value - mean) / std in one call.
y = StandardScaler().fit_transform(x)

# print() with a single argument behaves identically on Python 2 and Python 3.
print(y)

# Expected output:
# [[ 0.70710678 -0.70710678  1.41421356]
#  [-1.41421356  1.41421356 -0.70710678]
#  [ 0.70710678 -0.70710678 -0.70710678]]

# Module 2: interval (min-max) scaling
# Interval scaling uses the boundary values of each feature to rescale it
# into a specific range, e.g. [0, 1].
from sklearn.preprocessing import MinMaxScaler

print('\n222222222222222222222222222222222\n')
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]], dtype=float)

# fit_transform = fit + transform in a single call.
y = MinMaxScaler().fit_transform(x)
print(y)

# fit() alone only learns and stores the per-column min/max; it returns the
# fitted scaler object and does not transform any data.
y1 = MinMaxScaler().fit(x)
print(y1)

# transform() applies the stored parameters and performs the actual rescaling.
y2 = y1.transform(x)
print(y2)

# Expected output (each column spans 1..7, 2..8, 3..9, so the rows map to
# 0, 0.5 and 1; the estimator repr depends on the scikit-learn version):
# [[ 0.   0.   0. ]
#  [ 0.5  0.5  0.5]
#  [ 1.   1.   1. ]]
# MinMaxScaler(copy=True, feature_range=(0, 1))
# [[ 0.   0.   0. ]
#  [ 0.5  0.5  0.5]
#  [ 1.   1.   1. ]]

# Module 3: normalization
# Normalize the data with the Normalizer class from sklearn.preprocessing.
from sklearn.datasets import load_iris
from sklearn.preprocessing import Normalizer

iris = load_iris()
# Show the raw feature matrix and the class labels before normalizing.
print(iris.data)
print(iris.target)

# Normalizer rescales each sample (row) independently; with the default
# settings every row ends up with unit L2 norm.
print(Normalizer().fit_transform(iris.data))

# Module 4: binarization
from sklearn.preprocessing import Binarizer

print('\n4\n')
X = [[1., -1., 2.], [2., 0., 0.], [0., 1., -1.]]

# Binarizer is stateless: fit() learns nothing and just returns the estimator.
# On older scikit-learn its repr is Binarizer(copy=True, threshold=0.0).
binarizer = Binarizer().fit(X)

# Values strictly greater than the threshold (default 0.0) become 1, the
# rest become 0.
print(binarizer.transform(X))
# Expected output:
# [[ 1.  0.  1.]
#  [ 1.  0.  0.]
#  [ 0.  1.  0.]]

# It is possible to adjust the threshold of the binarizer:
binarizer = Binarizer(threshold=1.1)
print(binarizer.transform(X))
# Expected output:
# [[ 0.  0.  1.]
#  [ 1.  0.  0.]
#  [ 0.  0.  0.]]
# Module 5: OneHotEncoder
#
# 4.3.4. Encoding categorical features
# Often features are not given as continuous values but categorical.
# For example a person could have features
#   ["male", "female"],
#   ["from Europe", "from US", "from Asia"],
#   ["uses Firefox", "uses Chrome", "uses Safari", "uses Internet Explorer"].
# Such features can be efficiently coded as integers, for instance
# ["male", "from US", "uses Internet Explorer"] could be expressed as
# [0, 1, 3] while ["female", "from Asia", "uses Chrome"] would be [1, 2, 1].
# Such an integer representation cannot be used directly with scikit-learn
# estimators, as these expect continuous input and would interpret the
# categories as being ordered, which is often not desired (i.e. the set of
# browsers was ordered arbitrarily).
# One possibility to convert categorical features to features that can be
# used with scikit-learn estimators is a one-of-K or one-hot encoding, which
# is implemented in OneHotEncoder. This estimator transforms each categorical
# feature with m possible values into m binary features, with only one active.
# Continuing the example above:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder()
# fit() returns the fitted encoder, so this prints the estimator's repr.
print(enc.fit([[0, 0, 3], [1, 1, 0], [0, 2, 1], [1, 0, 2]]))

# transform() returns a sparse matrix; toarray() densifies it for printing.
# The fitted data has 2 + 3 + 4 distinct values, so each row has 9 columns.
print(enc.transform([[0, 1, 3]]).toarray())  # [[1 0 | 0 1 0 | 0 0 0 1]]
print(enc.transform([[1, 1, 0]]).toarray())  # [[0 1 | 0 1 0 | 1 0 0 0]]
print(enc.transform([[0, 2, 1]]).toarray())  # [[1 0 | 0 0 1 | 0 1 0 0]]
print(enc.transform([[1, 0, 2]]).toarray())  # [[0 1 | 1 0 0 | 0 0 1 0]]

# By default, how many values each feature can take is inferred automatically
# from the dataset. It is possible to specify this explicitly using the
# parameter n_values (replaced by `categories` in later scikit-learn
# releases).
# There are two genders, three possible continents and four web browsers in
# our dataset. Then we fit the estimator, and transform a data point.
# In the result, the first two numbers encode the gender, the next set of
# three numbers the continent and the last four the web browser.

# Module 6: filling in missing values
print('\n666666666666666666666666666666666666666\n')
import numpy as np

# sklearn.preprocessing.Imputer was removed in scikit-learn 0.22; its
# replacement is sklearn.impute.SimpleImputer, which always imputes per
# column (the old axis=0 behaviour). Fall back to the legacy class only when
# the installed release predates SimpleImputer.
try:
    from sklearn.impute import SimpleImputer
except ImportError:
    from sklearn.preprocessing import Imputer
    imp = Imputer(missing_values='NaN', strategy='mean', axis=0)
else:
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit() learns the column means (ignoring NaN) and returns the estimator, so
# this prints the imputer's repr (its exact text depends on the version).
print(imp.fit([[1, 2], [np.nan, 3], [7, 6]]))

x = [[np.nan, 2], [6, np.nan], [7, 6]]
print(x)
# [[nan, 2], [6, nan], [7, 6]]
# x is a plain list; transform() accepts it and returns an ndarray.
print(imp.transform(x))
# Expected output:
# [[ 4.          2.        ]
#  [ 6.          3.66666667]
#  [ 7.          6.        ]]

# The missing value in column 0 becomes 4 = (1 + 7) / 2, the mean of that
# column in the data the imputer was fitted on.
# The missing value in column 1 becomes 3.6666 = (2 + 3 + 6) / 3, again the
# fitted column mean.




# Module 7: polynomial features
# Generating polynomial features
from sklearn.preprocessing import PolynomialFeatures

print('\n777777777777777777777777777777\n')
X = np.arange(6).reshape(3, 2)
print(X)

# Degree-2 expansion: (x1, x2) -> (1, x1, x2, x1^2, x1*x2, x2^2).
# The expansion is computed once, directly inside the print call.
poly = PolynomialFeatures(2)
print(poly.fit_transform(X))
# Expected output:
# [[0 1]
#  [2 3]
#  [4 5]]
# [[  1.   0.   1.   0.   0.   1.]
#  [  1.   2.   3.   4.   6.   9.]
#  [  1.   4.   5.  16.  20.  25.]]

# interaction_only=True keeps only products of distinct features, so
# (x1, x2, x3) -> (1, x1, x2, x3, x1*x2, x1*x3, x2*x3, x1*x2*x3).
X = np.arange(9).reshape(3, 3)
poly = PolynomialFeatures(degree=3, interaction_only=True)
print(poly.fit_transform(X))
# Expected output:
# [[   1.    0.    1.    2.    0.    0.    2.    0.]
#  [   1.    3.    4.    5.   12.   15.   20.   60.]
#  [   1.    6.    7.    8.   42.   48.   56.  336.]]

# Module 8: function transformation
# FunctionTransformer
print('\n888888888888888888888888888888\n')
# Often, you will want to convert an existing Python function into a
# transformer to assist in data cleaning or processing. You can implement a
# transformer from an arbitrary function with FunctionTransformer.
# For example, to build a transformer that applies a log transformation in a
# pipeline, do:
import numpy as np
from sklearn.preprocessing import FunctionTransformer

# np.log1p computes log(1 + x) element-wise, so the zero entry maps to 0.
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
print(transformer.transform(X))
# Expected output:
# [[ 0.          0.69314718]
#  [ 1.09861229  1.38629436]]