Classic Machine Learning Algorithms (20): Logistic Regression

    (1) Introduction to logistic regression:

    Logistic regression, also called logistic regression analysis, is a generalized linear regression model commonly used in data mining, automated disease diagnosis, ad targeting, spam filtering, and similar fields.

    The algorithm estimates a discrete value (for example a binary value, 0 or 1, yes or no, true or false) from a set of known independent variables. Simply put, it estimates the probability of an event by fitting the data to a logistic function, which is where the name comes from. Because it estimates a probability, its output always lies between 0 and 1, as expected.
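
    For reference (notation introduced here rather than taken from the original text): writing w for the weight vector and b for the intercept, the model estimates

    P(y = 1 | x) = sigmoid(w·x + b) = 1 / (1 + exp(-(w·x + b)))

    so the linear score w·x + b is squashed into the interval (0, 1) by the sigmoid function, the same function implemented in section (2) below.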

    Conditions under which the logistic regression model applies:

    1. The dependent variable is a binary categorical variable, or the occurrence rate of some event, recorded as a numeric variable. Note, however, that repeated-count indicators are not suitable for logistic regression.

    2. The residuals and the dependent variable are assumed to follow a binomial distribution. The binomial distribution corresponds to categorical variables, not the normal distribution, so parameter estimation and testing are handled by maximum likelihood rather than least squares (see the log-likelihood sketch after this list).

    3. The independent variables are linearly related to the logit (log-odds) of the probability.

    4. The observations are mutually independent.
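
    To make point 2 concrete (a standard derivation rather than part of the original text): with h_i = sigmoid(w·x_i), maximum likelihood picks the weights w that maximize the log-likelihood

    l(w) = sum_i [ y_i * log(h_i) + (1 - y_i) * log(1 - h_i) ]

    whose gradient with respect to w is sum_i (y_i - h_i) * x_i. Climbing this gradient is exactly what the gradAscent code in section (2) does.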

    Typical uses of logistic regression:

    Finding causes: identifying risk factors for a given disease, and so on.

    Prediction: once a logistic regression model has been built, it can predict the probability that a disease or outcome occurs under different values of the independent variables.

    Discrimination: much like prediction, this uses the logistic model to judge how likely it is that a given person has a certain disease or belongs to a certain category.

    These are the three most common uses of logistic regression, but in practice its applications are far broader; it has become one of the most common analysis methods in epidemiology and medicine.

(2) Implementation of the algorithm (adapted from Machine Learning in Action)

from numpy import *

def loadDataSet():
    dataMat = []; labelMat = []
    fr = open('testSet.txt')
    for line in fr.readlines():
        lineArr = line.strip().split()
        # prepend a constant 1.0 so the first weight acts as the intercept
        dataMat.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelMat.append(int(lineArr[2]))
    return dataMat, labelMat

def sigmoid(inX):
    return 1.0 / (1 + exp(-inX))
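
One practical caveat not raised in the book: for a large negative inX, exp(-inX) overflows float64 and NumPy emits a RuntimeWarning, although the result still saturates to 0. An optional drop-in replacement, sketched here under the same from numpy import * namespace as the code above:

def sigmoid(inX):
    # clip the argument so exp() never overflows float64; the sigmoid
    # has already saturated to 0 or 1 long before |inX| reaches 500
    return 1.0 / (1 + exp(-clip(inX, -500, 500)))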

def gradAscent(dataMatIn, classLabels):
    dataMatrix = mat(dataMatIn)              # convert to NumPy matrix
    labelMat = mat(classLabels).transpose()  # convert to NumPy matrix
    m, n = shape(dataMatrix)
    alpha = 0.001       # learning rate (step size)
    maxCycles = 500     # number of full passes over the data
    weights = ones((n, 1))
    for k in range(maxCycles):              # heavy on matrix operations
        h = sigmoid(dataMatrix * weights)   # matrix mult
        error = labelMat - h                # vector subtraction
        # alpha * X^T (y - h): alpha times the log-likelihood gradient
        weights = weights + alpha * dataMatrix.transpose() * error
    return weights

def plotBestFit(weights):
    import matplotlib.pyplot as plt
    # accept either the (n,1) matrix returned by gradAscent or a flat array
    weights = asarray(weights).flatten()
    dataMat, labelMat = loadDataSet()
    dataArr = array(dataMat)
    n = shape(dataArr)[0]
    xcord1 = []; ycord1 = []
    xcord2 = []; ycord2 = []
    for i in range(n):
        if int(labelMat[i]) == 1:
            xcord1.append(dataArr[i, 1]); ycord1.append(dataArr[i, 2])
        else:
            xcord2.append(dataArr[i, 1]); ycord2.append(dataArr[i, 2])
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(xcord1, ycord1, s=30, c='red', marker='s')
    ax.scatter(xcord2, ycord2, s=30, c='green')
    x = arange(-3.0, 3.0, 0.1)
    # the decision boundary is the line where w0 + w1*x1 + w2*x2 = 0
    y = (-weights[0] - weights[1] * x) / weights[2]
    ax.plot(x, y)
    plt.xlabel('X1'); plt.ylabel('X2')
    plt.show()
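
A minimal driver for the functions above, assuming the testSet.txt file hard-coded in loadDataSet sits in the working directory:

dataArr, labelMat = loadDataSet()
weights = gradAscent(dataArr, labelMat)  # batch gradient ascent, 500 passes
plotBestFit(weights)                     # weights flattened inside plotBestFit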

def stocGradAscent0(dataMatrix, classLabels):
    m, n = shape(dataMatrix)
    alpha = 0.01
    weights = ones(n)   # initialize to all ones
    for i in range(m):  # one weight update per training sample
        h = sigmoid(sum(dataMatrix[i] * weights))
        error = classLabels[i] - h
        weights = weights + alpha * error * dataMatrix[i]
    return weights

def stocGradAscent1(dataMatrix, classLabels, numIter=150):
    m, n = shape(dataMatrix)
    weights = ones(n)   # initialize to all ones
    for j in range(numIter):
        dataIndex = list(range(m))  # list() so del() works in Python 3
        for i in range(m):
            # alpha decreases with iteration but never reaches 0
            # because of the 0.0001 constant
            alpha = 4 / (1.0 + j + i) + 0.0001
            # pick one of the remaining samples at random to reduce
            # periodic oscillations in the weights
            randIndex = int(random.uniform(0, len(dataIndex)))
            sampleIndex = dataIndex[randIndex]
            h = sigmoid(sum(dataMatrix[sampleIndex] * weights))
            error = classLabels[sampleIndex] - h
            weights = weights + alpha * error * dataMatrix[sampleIndex]
            del(dataIndex[randIndex])
    return weights
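
Unlike gradAscent, the stochastic versions index single rows, so they expect a NumPy array rather than a list of lists; a usage sketch under the same testSet.txt assumption as above:

dataArr, labelMat = loadDataSet()
weights1 = stocGradAscent1(array(dataArr), labelMat, numIter=150)
plotBestFit(weights1)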

def classifyVector(inX, weights):
    # threshold the predicted probability at 0.5
    prob = sigmoid(sum(inX * weights))
    if prob > 0.5: return 1.0
    else: return 0.0

def colicTest():
    frTrain = open('horseColicTraining.txt'); frTest = open('horseColicTest.txt')
    trainingSet = []; trainingLabels = []
    for line in frTrain.readlines():
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):             # 21 features per sample
            lineArr.append(float(currLine[i]))
        trainingSet.append(lineArr)
        trainingLabels.append(float(currLine[21]))
    trainWeights = stocGradAscent1(array(trainingSet), trainingLabels, 1000)
    errorCount = 0; numTestVec = 0.0
    for line in frTest.readlines():
        numTestVec += 1.0
        currLine = line.strip().split('\t')
        lineArr = []
        for i in range(21):
            lineArr.append(float(currLine[i]))
        if int(classifyVector(array(lineArr), trainWeights)) != int(currLine[21]):
            errorCount += 1
    errorRate = float(errorCount) / numTestVec
    print("the error rate of this test is: %f" % errorRate)
    return errorRate

def multiTest():
    numTests = 10; errorSum = 0.0
    for k in range(numTests):
        errorSum += colicTest()
    print("after %d iterations the average error rate is: %f" % (numTests, errorSum / float(numTests)))

(3) Example using sklearn:

Using the iris dataset as an example: first reduce the data to two dimensions and plot the points, then fit a logistic regression with the one-vs-rest (ovr) strategy for the multi-class problem and inspect how the model partitions the plane.

from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np

irisData = load_iris()
X_train = irisData.data
y_train = irisData.target

# project the 4-dimensional iris features down to 2 principal components
pca = PCA(n_components=2)
X = pca.fit_transform(X_train)

# scatter the three classes in the PCA plane
f = plt.figure()
ax = f.add_subplot(1, 1, 1)
ax.plot(X[:, 0][y_train == 0], X[:, 1][y_train == 0], 'bo')
ax.scatter(X[:, 0][y_train == 1], X[:, 1][y_train == 1], c='r')
ax.scatter(X[:, 0][y_train == 2], X[:, 1][y_train == 2], c='y')
ax.set_title('Data')
plt.show()

# one-vs-rest logistic regression; the all-ones class_weight is a no-op
clf = LogisticRegression(multi_class='ovr', solver='lbfgs', class_weight={0: 1, 1: 1, 2: 1})
clf.fit(X, y_train)
score = clf.score(X, y_train)   # accuracy on the training data

# evaluate the classifier on a dense grid to draw the decision regions
x0min, x0max = X[:, 0].min(), X[:, 0].max()
x1min, x1max = X[:, 1].min(), X[:, 1].max()
h = 0.05
xx, yy = np.meshgrid(np.arange(x0min - 1, x0max + 1, h),
                     np.arange(x1min - 1, x1max + 1, h))
x_ = xx.reshape([xx.shape[0] * xx.shape[1], 1])
y_ = yy.reshape([yy.shape[0] * yy.shape[1], 1])
test_x = np.c_[x_, y_]
test_predict = clf.predict(test_x)
z = test_predict.reshape(xx.shape)
plt.contourf(xx, yy, z, cmap=plt.cm.Paired)
plt.axis('tight')
colors = 'bgy'
for i, color in zip(clf.classes_, colors):
    idx = np.where(y_train == i)
    plt.scatter(X[idx, 0], X[idx, 1], c=color, cmap=plt.cm.Paired)
plt.title("score:%s" % score)
plt.show()
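
Since logistic regression is probabilistic, clf also exposes per-class probabilities through the standard predict_proba method, not just hard labels from predict; for example, appended after the script above:

# probabilities for the first five samples: one row per sample,
# one column per class, each row summing to 1
print(clf.predict_proba(X[:5]))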