In [19]:


import numpy as np
import pandas as pd
from pandas import Series, DataFrame
np.set_printoptions(precision=4)

×


In [20]:

# Scaling/normalization utilities used below (scale, normalize).
# Removed a stray bare `x` left in this cell: it would raise NameError
# on a fresh kernel (Restart & Run All), since no `x` is defined anywhere.
from sklearn import preprocessing


×





In [21]:

# Load the dating data set: 1000 rows, three numeric feature columns plus an
# integer class label (values 1-3 per the rendered output), tab-separated,
# no header row. NOTE(review): path is relative to the notebook's CWD —
# 'datingTestSet2.txt' must sit next to the notebook; confirm provenance.
# `pd.read_csv(sep='\t')` is the documented idiom for TSV (read_table is a
# legacy alias).
dataSet_df = pd.read_csv('datingTestSet2.txt', sep='\t',
                         names=['feature_1', 'feature_2', 'feature_3', 'label'])
dataSet_df



×


Out[21]:



feature_1

feature_2

feature_3

label

0

40920

8.326976

0.953952

3

1

14488

7.153469

1.673904

2

2

26052

1.441871

0.805124

1

3

75136

13.147394

0.428964

1

4

38344

1.669788

0.134296

1

5

72993

10.141740

1.032955

1

6

35948

6.830792

1.213192

3

7

42666

13.276369

0.543880

3

8

67497

8.631577

0.749278

1

9

35483

12.273169

1.508053

3

10

50242

3.723498

0.831917

1

11

63275

8.385879

1.669485

1

12

5569

4.875435

0.728658

2

13

51052

4.680098

0.625224

1

14

77372

15.299570

0.331351

1

15

43673

1.889461

0.191283

1

16

61364

7.516754

1.269164

1

17

69673

14.239195

0.261333

1

18

15669

0.000000

1.250185

2

19

28488

10.528555

1.304844

3

20

6487

3.540265

0.822483

2

21

37708

2.991551

0.833920

1

22

22620

5.297865

0.638306

2

23

28782

6.593803

0.187108

3

24

19739

2.816760

1.686209

2

25

36788

12.458258

0.649617

3

26

5741

0.000000

1.656418

2

27

28567

9.968648

0.731232

3

28

6808

1.364838

0.640103

2

29

41611

0.230453

1.151996

1






970

53711

12.149747

1.389088

3

971

64371

9.149678

0.874905

1

972

9289

9.666576

1.370330

2

973

60613

3.620110

0.287767

1

974

18338

5.238800

1.253646

2

975

22845

14.715782

1.503758

3

976

74676

14.445740

1.211160

1

977

34143

13.609528

0.364240

3

978

14153

3.141585

0.424280

2

979

9327

0.000000

0.120947

2

980

18991

0.454750

1.033280

2

981

9193

0.510310

0.016395

2

982

2285

3.864171

0.616349

2

983

9493

6.724021

0.563044

2

984

2371

4.289375

0.012563

2

985

13963

0.000000

1.437030

2

986

2299

3.733617

0.698269

2

987

5262

2.002589

1.380184

2

988

4659

2.502627

0.184223

2

989

17582

6.382129

0.876581

2

990

27750

8.546741

0.128706

3

991

9868

2.694977

0.432818

2

992

18333

3.951256

0.333300

2

993

3780

9.856183

0.329181

2

994

18190

2.068962

0.429927

2

995

11145

3.410627

0.631838

2

996

68846

9.974715

0.669787

1

997

26575

10.650102

0.866627

3

998

48111

9.134528

0.728045

3

999

43757

7.882601

1.332446

3

1000 rows × 4 columns


In [33]:

# ----------------------
# Standardization (z-score)
#
# Formula: (X - mean) / std, computed column-wise (per feature).
# After scaling, each column has mean ~0 and unit variance.
#
# NOTE(review): the original comment said each column is divided by its
# *variance*; `preprocessing.scale` actually divides by the *standard
# deviation* — corrected here.
#
# The label column is excluded via iloc[:, :-1]; only the 3 features are scaled.
standardize_x = preprocessing.scale(dataSet_df.iloc[:, :-1].values)
standardize_x



×

Out[33]:



array([[ 0.3319,  0.4166,  0.2452],
       [-0.8725,  0.1399,  1.6939],
       [-0.3455, -1.2067, -0.0542],
       ..., 
       [-0.3217,  0.9643,  0.0695],
       [ 0.6596,  0.607 , -0.2093],
       [ 0.4612,  0.3118,  1.0068]])





In [52]:

# Sanity check: standardized data should have mean ~0.
# The original `int(standardize_x.mean())` truncates toward zero, so ANY mean
# in (-1, 1) — e.g. 0.9 — would print 0 and mask a broken standardization.
# A tolerance check actually verifies the property.
bool(np.isclose(standardize_x.mean(), 0.0))



×


Out[52]:



0





In [53]:



# Sanity check: standardized data should have variance ~1 per column.
# The original `int(standardize_x.var())` truncates, so any variance in
# [1, 2) — e.g. 1.9 — would print 1 and hide a defect; use a tolerance check.
bool(np.isclose(standardize_x.var(), 1.0))

×


Out[53]:

1





In [34]:



# ----------------------
# Normalization
#
# NOTE(review): `preprocessing.normalize` scales each ROW (sample) to unit
# L2 norm, i.e. X[i] / ||X[i]||_2 — it is NOT min-max scaling. The original
# comment described mapping values into [0, 1]; if that is the intent, use
# `preprocessing.minmax_scale` / `MinMaxScaler` instead.
#
# Behavior is kept as-is (unit-norm rows, matching the displayed output).
# Because all three features are non-negative here, the unit-norm values do
# happen to fall in [0, 1], which the range checks below rely on.
normalize_x = preprocessing.normalize(dataSet_df.iloc[:, :-1].values)
normalize_x

×



Out[34]:


array([[  1.0000e+00,   2.0349e-04,   2.3313e-05],
       [  1.0000e+00,   4.9375e-04,   1.1554e-04],
       [  1.0000e+00,   5.5346e-05,   3.0904e-05],
       ..., 
       [  1.0000e+00,   4.0076e-04,   3.2611e-05],
       [  1.0000e+00,   1.8986e-04,   1.5133e-05],
       [  1.0000e+00,   1.8014e-04,   3.0451e-05]])




In [48]:


# Count of non-negative elements — expected 3000 (1000 rows x 3 features).
# Counting directly on the boolean mask avoids materializing a filtered copy
# of the array just to take its length.
int((normalize_x >= 0).sum())


×

Out[48]:



3000





In [54]:



# Count of elements <= 1 — expected 3000 (1000 rows x 3 features).
# Same idiom as the non-negative check: count on the boolean mask directly
# instead of len() over a filtered copy.
int((normalize_x <= 1).sum())


×

Out[54]:

3000