In [19]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
# Number of digits shown after the decimal point when NumPy arrays are printed.
PRINT_PRECISION = 4
np.set_printoptions(precision=PRINT_PRECISION)
×
…
In [20]:
# scikit-learn's preprocessing module supplies the scale() and normalize()
# helpers called in the cells below.
from sklearn import preprocessing
×
…
In [21]:
# Column labels assigned to the file's four fields: three numeric features
# followed by the class label.
column_names = ['feature_1', 'feature_2', 'feature_3', 'label']

# read_table uses its default delimiter (tab); because explicit names are
# supplied, the first line of the file is read as data, not as a header.
dataSet_df = pd.read_table('datingTestSet2.txt', names=column_names)
dataSet_df
×
Out[21]:
feature_1 | feature_2 | feature_3 | label | |
0 | 40920 | 8.326976 | 0.953952 | 3 |
1 | 14488 | 7.153469 | 1.673904 | 2 |
2 | 26052 | 1.441871 | 0.805124 | 1 |
3 | 75136 | 13.147394 | 0.428964 | 1 |
4 | 38344 | 1.669788 | 0.134296 | 1 |
5 | 72993 | 10.141740 | 1.032955 | 1 |
6 | 35948 | 6.830792 | 1.213192 | 3 |
7 | 42666 | 13.276369 | 0.543880 | 3 |
8 | 67497 | 8.631577 | 0.749278 | 1 |
9 | 35483 | 12.273169 | 1.508053 | 3 |
10 | 50242 | 3.723498 | 0.831917 | 1 |
11 | 63275 | 8.385879 | 1.669485 | 1 |
12 | 5569 | 4.875435 | 0.728658 | 2 |
13 | 51052 | 4.680098 | 0.625224 | 1 |
14 | 77372 | 15.299570 | 0.331351 | 1 |
15 | 43673 | 1.889461 | 0.191283 | 1 |
16 | 61364 | 7.516754 | 1.269164 | 1 |
17 | 69673 | 14.239195 | 0.261333 | 1 |
18 | 15669 | 0.000000 | 1.250185 | 2 |
19 | 28488 | 10.528555 | 1.304844 | 3 |
20 | 6487 | 3.540265 | 0.822483 | 2 |
21 | 37708 | 2.991551 | 0.833920 | 1 |
22 | 22620 | 5.297865 | 0.638306 | 2 |
23 | 28782 | 6.593803 | 0.187108 | 3 |
24 | 19739 | 2.816760 | 1.686209 | 2 |
25 | 36788 | 12.458258 | 0.649617 | 3 |
26 | 5741 | 0.000000 | 1.656418 | 2 |
27 | 28567 | 9.968648 | 0.731232 | 3 |
28 | 6808 | 1.364838 | 0.640103 | 2 |
29 | 41611 | 0.230453 | 1.151996 | 1 |
… | … | … | … | … |
970 | 53711 | 12.149747 | 1.389088 | 3 |
971 | 64371 | 9.149678 | 0.874905 | 1 |
972 | 9289 | 9.666576 | 1.370330 | 2 |
973 | 60613 | 3.620110 | 0.287767 | 1 |
974 | 18338 | 5.238800 | 1.253646 | 2 |
975 | 22845 | 14.715782 | 1.503758 | 3 |
976 | 74676 | 14.445740 | 1.211160 | 1 |
977 | 34143 | 13.609528 | 0.364240 | 3 |
978 | 14153 | 3.141585 | 0.424280 | 2 |
979 | 9327 | 0.000000 | 0.120947 | 2 |
980 | 18991 | 0.454750 | 1.033280 | 2 |
981 | 9193 | 0.510310 | 0.016395 | 2 |
982 | 2285 | 3.864171 | 0.616349 | 2 |
983 | 9493 | 6.724021 | 0.563044 | 2 |
984 | 2371 | 4.289375 | 0.012563 | 2 |
985 | 13963 | 0.000000 | 1.437030 | 2 |
986 | 2299 | 3.733617 | 0.698269 | 2 |
987 | 5262 | 2.002589 | 1.380184 | 2 |
988 | 4659 | 2.502627 | 0.184223 | 2 |
989 | 17582 | 6.382129 | 0.876581 | 2 |
990 | 27750 | 8.546741 | 0.128706 | 3 |
991 | 9868 | 2.694977 | 0.432818 | 2 |
992 | 18333 | 3.951256 | 0.333300 | 2 |
993 | 3780 | 9.856183 | 0.329181 | 2 |
994 | 18190 | 2.068962 | 0.429927 | 2 |
995 | 11145 | 3.410627 | 0.631838 | 2 |
996 | 68846 | 9.974715 | 0.669787 | 1 |
997 | 26575 | 10.650102 | 0.866627 | 3 |
998 | 48111 | 9.134528 | 0.728045 | 3 |
999 | 43757 | 7.882601 | 1.332446 | 3 |
1000 rows × 4 columns
…
In [33]:
# ----------------------
# Standardization (z-score)
#
# Formula: (X - mean) / std, evaluated separately for every feature (column).
# Each column has its own mean subtracted and is then divided by its own
# standard deviation, so the values of every column end up centred on 0 with
# unit variance.
standardize_x = preprocessing.scale(
    dataSet_df.iloc[:, :-1].values,  # every column except the trailing label
    axis=0,          # work column by column (the default)
    with_mean=True,  # subtract the column mean (the default)
    with_std=True,   # divide by the column standard deviation (the default)
)
standardize_x
×
Out[33]:
array([[ 0.3319, 0.4166, 0.2452],
[-0.8725, 0.1399, 1.6939],
[-0.3455, -1.2067, -0.0542],
...,
[-0.3217, 0.9643, 0.0695],
[ 0.6596, 0.607 , -0.2093],
[ 0.4612, 0.3118, 1.0068]])
…
In [52]:
# Sanity check: the overall mean of the standardized features should be 0.
# round() is applied before int() so the value is rounded to the nearest
# integer; int() alone truncates toward zero and would report 0 for any mean
# strictly between -1 and 1.
int(round(standardize_x.mean()))
×
Out[52]:
0
…
In [53]:
# Sanity check: the overall variance of the standardized features should be 1.
# round() is applied before int() because int() truncates: a floating-point
# result such as 0.9999999999999998 would otherwise be reported as 0.
int(round(standardize_x.var()))
×
Out[53]:
1
…
In [34]:
# ----------------------
# Normalization
#
# preprocessing.normalize rescales every sample (row) independently so that
# the row has unit L2 norm. The result is dimensionless, and when all inputs
# are non-negative every output value lies in [0, 1].
#
# NOTE(review): this is NOT per-feature min-max scaling. Each row is divided
# by its own length, so a feature with a much larger magnitude than the others
# dominates the row and comes out close to 1. If the goal is to map every
# feature (column) onto [0, 1], preprocessing.minmax_scale is the matching
# helper -- confirm the intent before relying on normalize_x.
normalize_x = preprocessing.normalize(
    dataSet_df.iloc[:, :-1].values,  # every column except the trailing label
    norm='l2',  # unit Euclidean length (the default)
    axis=1,     # normalise each row, i.e. each sample (the default)
)
normalize_x
×
Out[34]:
array([[ 1.0000e+00, 2.0349e-04, 2.3313e-05],
[ 1.0000e+00, 4.9375e-04, 1.1554e-04],
[ 1.0000e+00, 5.5346e-05, 3.0904e-05],
...,
[ 1.0000e+00, 4.0076e-04, 3.2611e-05],
[ 1.0000e+00, 1.8986e-04, 1.5133e-05],
[ 1.0000e+00, 1.8014e-04, 3.0451e-05]])
…
In [48]:
# Count the entries of normalize_x that are greater than or equal to 0.
int(np.count_nonzero(normalize_x >= 0))
×
Out[48]:
3000
…
In [54]:
# Count the entries of normalize_x that are less than or equal to 1.
int(np.count_nonzero(normalize_x <= 1))
×
Out[54]:
3000
…