For API lookups, see the official docs: API Reference — scikit-learn 1.2.2 documentation

scikit-learn ships with several built-in datasets, the most famous being the Iris dataset.


The third and fourth columns of the dataset hold the petal length and petal width; the class-label column has already been encoded as integers: 0 = Iris-Setosa, 1 = Iris-Versicolor, 2 = Iris-Virginica.

1. Importing Python libraries and the dataset

from IPython.display import Image
%matplotlib inline
# Version check: the train_test_split import path changed in scikit-learn 0.18
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

from sklearn import datasets
import numpy as np
iris = datasets.load_iris() 
#http://scikit-learn.org/stable/auto_examples/datasets/plot_iris_dataset.html
X = iris.data[:, [2, 3]]   # take petal length and petal width (columns 2 and 3)
y = iris.target            # the species column (class labels)
print('Class labels:', np.unique(y))
#Output:Class labels: [0 1 2]
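As a quick cross-check (added here, not part of the original run), the meaning of columns 2 and 3 can be confirmed from the dataset's own metadata:

iris.feature_names   # expected: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
iris.target_names    # expected: array(['setosa', 'versicolor', 'virginica'], ...)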

2. Splitting the dataset

Split the dataset into a training set and a test set, here 70% training and 30% test.

if Version(sklearn_version) < '0.18':
    from sklearn.cross_validation import train_test_split
else:
    from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)   # train_test_split splits the dataset

X_train.shape
#Output:(105, 2)
X_test.shape
#Output:(45, 2)
X.shape
#Output:(150, 2)
y_train.shape
#Output: (105,)
y_test.shape
#Output: (45,)
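A side note (not in the original): with random_state=0 and no stratification, the class proportions in the two splits can differ slightly. The per-class counts are easy to inspect, and train_test_split also accepts a stratify argument that preserves the 50/50/50 class ratio:

print(np.bincount(y_train), np.bincount(y_test))   # per-class sample counts in each split
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)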

3. Standardizing the features

Non-tree models generally require feature standardization so that differences in feature scale do not dominate the result. After the transform, every feature dimension has zero mean and unit variance. This is also called z-score normalization (zero-mean normalization): subtract the mean from each feature value and divide by the standard deviation.

#scaler = sklearn.preprocessing.StandardScaler().fit(train)
#scaler.transform(train);scaler.transform(test)
#fit() learns the parameters, transform() applies them
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()   # create a scaler object to transform the dataset
sc.fit(X_train)         # fit on X_train and store the fitted parameters
#Output:StandardScaler(copy=True, with_mean=True, with_std=True)
#type(sc.fit(X_train))
#Output:sklearn.preprocessing.data.StandardScaler
sc.scale_  # per-feature standard deviation (sc.std_ gave the same result in older versions)
#Output:array([ 1.79595918,  0.77769705])
sc.mean_
#Output:array([ 3.82857143,  1.22666667])

import numpy as np
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
#verify the standardization by inverting it: multiply by the std and add back the mean
at = X_train_std[:5] * sc.scale_ + sc.mean_
a = X_train[:5]
at == a
#Output:
#array([[ True,  True],
#       [ True,  True],
#       [ True,  True],
#       [ True,  True],
#       [ True,  True]], dtype=bool)
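The exact == comparison on floats happens to hold here but is fragile in general; np.allclose, or the scaler's built-in inverse_transform, is the robust way to verify the round trip. A minimal sketch:

np.allclose(X_train_std * sc.scale_ + sc.mean_, X_train)   # expected: True
np.allclose(sc.inverse_transform(X_train_std), X_train)    # same check via the built-in inverse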

4. Classification and visualization with various algorithms

In each of the algorithms below, the plot_decision_regions function visualizes the decision regions so the classification result can be inspected at a glance.

from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import warnings
def versiontuple(v):   # parse a NumPy version string into a comparable tuple
    return tuple(map(int, (v.split("."))))
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # Plot the decision boundary: X holds the features, y the labels; classifier is a fitted model, test_idx are the indices of the test samples
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1   # range of the first feature (horizontal axis)
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1   # range of the second feature (vertical axis)
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))  # resolution is the grid step; xx1 and xx2 have the same shape
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    # ravel flattens the grids; the classifier predicts a class for every grid point
    Z = Z.reshape(xx1.shape)   # reshape the flat predictions back onto the grid
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    # contourf fills the regions between the contour levels of Z with different colors
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=[cmap(idx)],
                    marker=markers[idx], label=cl)   # scatter the full dataset, one color/marker per class

    # highlight test samples
    if test_idx:
        # plot all samples
        if not versiontuple(np.__version__) >= versiontuple('1.9.0'):
            X_test, y_test = X[list(test_idx), :], y[list(test_idx)]
            warnings.warn('Please update to NumPy 1.9.0 or newer')
        else:
            X_test, y_test = X[test_idx, :], y[test_idx]   # two feature columns and labels of the test samples

        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolors='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')   # hollow circles mark the test samples, without distinguishing their classes

Classification with scikit-learn's Perceptron (three classes)

from sklearn.linear_model import Perceptron
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html#sklearn.linear_model.Perceptron
#ppn = Perceptron(n_iter=40, eta0=0.1, random_state=0)   # n_iter was renamed max_iter in newer scikit-learn versions
ppn = Perceptron()  # linear decision function: y = w·x + b
ppn.fit(X_train_std, y_train)
#Output:Perceptron(alpha=0.0001, class_weight=None, eta0=1.0, fit_intercept=True,
#      n_iter=5, n_jobs=1, penalty=None, random_state=0, shuffle=True,
#      verbose=0, warm_start=False)
ppn.coef_  # feature weights w of the decision function, one row per class
#Output:array([[-1.48746619, -1.1229737 ],
#       [ 3.0624304 , -2.18594118],
#       [ 2.9272062 ,  2.64027405]])
ppn.intercept_  # bias terms b of the decision function
#Output:array([-1.,  0., -2.])
y_pred = ppn.predict(X_test_std)  # predict class labels for the test set
y_pred
#Output:array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
#       0, 2, 0, 0, 1, 0, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0, 0])
y_test
#Output:array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1, 0,
#       0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0, 0])
y_pred == y_test
#Output:array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
#        True, False,  True,  True,  True,  True,  True,  True,  True,
#        True,  True,  True,  True,  True,  True,  True,  True,  True,
#        True, False,  True,  True,  True,  True,  True,  True,  True,
#        True, False,  True,  True,  True,  True,  True,  True,  True], dtype=bool)
print('Misclassified samples: %d' % (y_test != y_pred).sum())
#Output:Misclassified samples: 3
from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, y_pred))  # prediction accuracy: (len(y_test) - 3) / len(y_test) = 0.9333...
#Output:Accuracy: 0.93
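The fitted parameters can also be verified by hand. For a multiclass problem, scikit-learn's Perceptron fits one linear function per class (one-vs-rest) and predicts the class with the largest score; a minimal verification sketch:

scores = X_test_std.dot(ppn.coef_.T) + ppn.intercept_   # shape (45, 3): one score per class and sample
manual_pred = scores.argmax(axis=1)                     # take the highest-scoring class
(manual_pred == ppn.predict(X_test_std)).all()          # expected: True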


Plotting the perceptron classifier trained on the standardized data


%matplotlib inline
X_combined_std = np.vstack((X_train_std, X_test_std))   # shape (150, 2)
y_combined = np.hstack((y_train, y_test))               # shape (150,)

plot_decision_regions(X=X_combined_std, y=y_combined,
                      classifier=ppn, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')

plt.tight_layout()   # compact layout to avoid overlapping elements
# plt.savefig('./figures/iris_perceptron_scikit.png', dpi=300)
plt.show()

[Figure: perceptron decision regions on the standardized combined data; circled points are the 45 test samples]

Predicting per-class probabilities with scikit-learn's LogisticRegression (three classes)

from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(C=1000.0, random_state=0)
#http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html#sklearn.linear_model.LogisticRegression
lr.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=lr, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/logistic_regression.png', dpi=300)
plt.show()
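The section title promises per-class probabilities, but the code above only draws the decision regions. LogisticRegression exposes the probabilities through predict_proba; a short sketch (the printed values are illustrative, not recorded from the original run):

probs = lr.predict_proba(X_test_std[:3])   # one row per sample, one column per class
print(probs.round(3))                      # each row sums to 1
print(lr.predict(X_test_std[:3]))          # predict() returns the class with the highest probability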

[Figure: logistic regression decision regions on the standardized combined data]

Overfitting and regularization

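The images that originally appeared here illustrated overfitting and the regularized cost function and could not be recovered. For reference, the scikit-learn documentation gives the L2-regularized logistic regression objective (binary case) as

$$\min_{w,b}\; \frac{1}{2}w^T w + C\sum_{i=1}^{n}\log\left(1 + \exp\left(-y_i\,(x_i^T w + b)\right)\right)$$

so C multiplies the data-fit term: a smaller C means stronger regularization and smaller weights, which is what the sweep below demonstrates.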

weights, params = [], []

for c in range(-5, 6):
    lr = LogisticRegression(C=10**c, random_state=0)   # L2 regularization by default; C is the inverse of the regularization strength (smaller C, smaller weights)
    lr.fit(X_train_std, y_train)
    weights.append(lr.coef_[1])   # weight row for class 1 (Iris-Versicolor)
    params.append(10**c)

weights = np.array(weights)
plt.plot(params, weights[:, 0], label='petal length')
plt.plot(params, weights[:, 1], linestyle='--', label='petal width')
plt.ylabel('weight coefficient')
plt.xlabel('C')
plt.legend(loc='upper left')
plt.xscale('log')   # logarithmic x-axis
# plt.savefig('./figures/regression_path.png', dpi=300)
plt.show()
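For this three-class problem lr.coef_ has one weight row per class; lr.coef_[1] above selects the row for class 1 (Iris-Versicolor), whose two entries are the petal-length and petal-width weights being plotted:

lr.coef_.shape   # expected: (3, 2) -- one row per class, one column per feature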

[Figure: weight coefficients of class 1 versus C, with a logarithmic x-axis]

Classification with scikit-learn's SVM (three classes)

from sklearn.svm import SVC
#http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC
svm = SVC(kernel='linear', C=1.0, random_state=0)
svm.fit(X_train_std, y_train)

plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_linear.png', dpi=300)
plt.show()

[Figure: linear SVM decision regions]

The plot above uses a linear SVM, and some points are still misclassified. As a next step, we use a kernel function for nonlinear classification.

from sklearn.svm import SVC
svm = SVC(kernel='rbf', random_state=0, gamma=0.2, C=1.0)
svm.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=svm, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/support_vector_machine_rbf_iris_1.png', dpi=300)
plt.show()
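gamma controls the width of the RBF kernel: larger values make the kernel more local, so the boundary wraps tightly around individual training points and the model overfits. A hedged sketch (gamma=100.0 is an illustrative value, not from the original):

svm_hg = SVC(kernel='rbf', random_state=0, gamma=100.0, C=1.0)   # deliberately large gamma
svm_hg.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=svm_hg, test_idx=range(105, 150))
plt.show()   # expect tight islands forming around individual training points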

[Figure: RBF-kernel SVM (gamma=0.2) decision regions]

Classification with scikit-learn's decision tree (three classes)

from sklearn.tree import DecisionTreeClassifier
#http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html#sklearn.tree.DecisionTreeClassifier
tree = DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=0)
#criterion='entropy' selects split features by information gain; max_depth caps the depth of the tree
tree.fit(X_train, y_train)
X_combined = np.vstack((X_train, X_test))
y_combined = np.hstack((y_train, y_test))
plot_decision_regions(X_combined, y_combined, classifier=tree, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/decision_tree_decision.png', dpi=300)
plt.show()
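With scikit-learn >= 0.21 the learned rules can also be printed as plain text via export_text (older versions offer export_graphviz for the same purpose); a minimal sketch:

from sklearn.tree import export_text
print(export_text(tree, feature_names=['petal length (cm)', 'petal width (cm)']))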

[Figure: decision tree decision regions on the unstandardized data]

Classification with scikit-learn's random forest (three classes)

from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier(criterion='entropy', n_estimators=10, random_state=1, n_jobs=2)
#http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html#sklearn.ensemble.RandomForestClassifier
#criterion is the feature-selection measure, n_estimators the number of trees in the forest, n_jobs the number of parallel jobs used to build them
forest.fit(X_train, y_train)
plot_decision_regions(X_combined, y_combined, classifier=forest, test_idx=range(105, 150))
plt.xlabel('petal length [cm]')
plt.ylabel('petal width [cm]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/random_forest.png', dpi=300)
plt.show()
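A useful by-product of the fitted forest (not shown in the original): feature_importances_ reports each feature's contribution to the splits.

for name, imp in zip(['petal length', 'petal width'], forest.feature_importances_):
    print('%s: %.3f' % (name, imp))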

[Figure: random forest decision regions]

Classification with scikit-learn's k-nearest neighbors (three classes)

from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, p=2, metric='minkowski')
#http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html#sklearn.neighbors.KNeighborsClassifier
#p=2 with metric='minkowski' gives the Euclidean (L2) distance
knn.fit(X_train_std, y_train)
plot_decision_regions(X_combined_std, y_combined, classifier=knn, test_idx=range(105, 150))
plt.xlabel('petal length [standardized]')
plt.ylabel('petal width [standardized]')
plt.legend(loc='upper left')
plt.tight_layout()
# plt.savefig('./figures/k_nearest_neighbors.png', dpi=300)
plt.show()
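For comparison with the perceptron's 0.93, the KNN model's test-set accuracy can be computed the same way (the value is not recorded in the original):

from sklearn.metrics import accuracy_score
print('Accuracy: %.2f' % accuracy_score(y_test, knn.predict(X_test_std)))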

[Figure: k-nearest-neighbors decision regions]