Comparing Linear Regression, Ridge Regression, and Lasso Regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,RidgeCV,LassoCV
# 50 samples, 200 features
# Underdetermined system: no unique solution (infinitely many solutions)
X = np.random.randn(50,200)
w = np.random.randn(200)
# Set 190 of the 200 coefficients to zero (the true w is sparse)
index = np.arange(0,200)
np.random.shuffle(index)
w[index[:190]] = 0
y = X.dot(w)
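With 50 equations in 200 unknowns the system is underdetermined, which is what the comment above means. A minimal sanity check of that claim, using only numpy:

# The rank of X is at most 50, far below the 200 unknowns,
# so X.dot(w) = y admits infinitely many exact solutions.
print(np.linalg.matrix_rank(X))  # 50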
import warnings
# Suppress warnings (e.g. convergence warnings from the CV models)
warnings.filterwarnings("ignore")
linear = LinearRegression(fit_intercept=False)
ridge = RidgeCV(alphas = [0.001,0.01,0.1,1,2,5,10],cv = 5,fit_intercept=False)
lasso = LassoCV(alphas = [0.001,0.01,0.1,1,2,5,10],cv = 3,fit_intercept=False)
# Alternative: lasso = LassoCV(eps = 0.001,n_alphas = 100), i.e. min alpha/max alpha = 0.001 with 100 alphas in total
linear.fit(X,y)
ridge.fit(X,y)
lasso.fit(X,y)
linear_w = linear.coef_
ridge_w = ridge.coef_
lasso_w = lasso.coef_
plt.figure(figsize = (12,9))
axes = plt.subplot(2,2,1)
axes.plot(w)
axes.set_title("True")
axes = plt.subplot(2,2,2)
axes.plot(linear_w)
axes.set_title("Linear")
axes = plt.subplot(2,2,3)
axes.plot(ridge_w)
axes.set_title("Ridge")
axes = plt.subplot(2,2,4)
axes.plot(lasso_w)
axes.set_title("Lasso")
Ridge regression shrinks the coefficients (their absolute values become smaller).
Lasso suits the case where there are fewer samples than features (the solution is sparse: most coefficients are zero and only a few are nonzero).
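To make the sparsity claim concrete, one can count how many coefficients each model drives to (near) zero; a minimal sketch, assuming the fitted coefficients from the code above are still in scope:

# Count coefficients that are effectively zero (|w| < 1e-6).
# Lasso should recover roughly 190 zeros, matching the true w;
# ridge only shrinks coefficients and rarely zeroes them out.
for name, coef in [("linear", linear_w), ("ridge", ridge_w), ("lasso", lasso_w)]:
    print(name, (np.abs(coef) < 1e-6).sum())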
Tuning alpha for Ridge Regression
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
# Hilbert matrix X[i,j] = 1/(i+j+1): badly conditioned, so the
# solution is very sensitive to the choice of alpha
X = 1/(np.arange(1,11)+np.arange(0,10).reshape(-1,1))
y = np.ones(10)
ridge = Ridge(fit_intercept = False)
alphas = np.logspace(start=-10,stop=-2,num=200)
coefs = []
for a in alphas:
    ridge.set_params(alpha = a)
    ridge.fit(X,y)
    coefs.append(ridge.coef_)
_ = plt.plot(alphas,coefs)
plt.xscale("log")
plt.ylabel("coef",fontsize = 25,c = "r",rotation = 0)
plt.xlabel("alpha",fontsize = 25)
Choose an alpha from the smooth region of the plot, where the coefficients do not fluctuate too much.
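Instead of eyeballing the plot, cross-validation can pick an alpha automatically; a minimal sketch, reusing the X, y, and alphas defined above:

from sklearn.linear_model import RidgeCV

# Let cross-validation choose alpha from the same grid used for the plot
ridge_cv = RidgeCV(alphas=alphas, fit_intercept=False)
ridge_cv.fit(X, y)
print(ridge_cv.alpha_)  # the selected regularization strength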
Using Logistic Regression and Computing Probabilities
# Despite the name, logistic regression is a classification algorithm, not a regression algorithm
import numpy as np
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn import datasets
from sklearn.model_selection import train_test_split
X,y = datasets.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
lr = LogisticRegression()
lr.fit(X_train,y_train)
# 3 classes, 4 features each: w has shape (3, 4)
w = lr.coef_
b = lr.intercept_
proba_ = lr.predict_proba(X_test)
proba_
Output:
array([[5.09606079e-01, 4.90348717e-01, 4.52032702e-05],
[9.80326848e-03, 4.50092430e-01, 5.40104302e-01],
[1.45104392e-01, 6.49836682e-01, 2.05058926e-01],
[5.51336582e-02, 5.26504746e-01, 4.18361596e-01],
[9.48111023e-02, 5.91492957e-01, 3.13695940e-01],
[2.60577624e-01, 6.45623725e-01, 9.37986509e-02],
[2.11365086e-01, 6.50469037e-01, 1.38165876e-01],
[5.10353603e-01, 4.89628899e-01, 1.74987222e-05],
[5.08742746e-01, 4.91240196e-01, 1.70586108e-05],
[1.96810364e-02, 4.79731866e-01, 5.00587098e-01],
[1.94451514e-02, 5.31561018e-01, 4.48993830e-01],
[5.10854562e-01, 4.89122127e-01, 2.33113786e-05],
[2.22292060e-04, 4.62528462e-01, 5.37249246e-01],
[3.01995025e-03, 4.41724881e-01, 5.55255169e-01],
[1.56646964e-02, 4.81293032e-01, 5.03042272e-01],
[3.40320487e-01, 6.11925850e-01, 4.77536634e-02],
[5.11071680e-01, 4.88912238e-01, 1.60819097e-05],
[1.22637070e-03, 4.50146325e-01, 5.48627305e-01],
[1.32742740e-01, 6.21872581e-01, 2.45384680e-01],
[5.11788868e-01, 4.88183011e-01, 2.81211093e-05],
[1.46877645e-02, 5.34107398e-01, 4.51204837e-01],
[1.35101782e-01, 6.27332561e-01, 2.37565657e-01],
[2.85783260e-04, 4.67009821e-01, 5.32704396e-01],
[2.13811230e-03, 4.62006103e-01, 5.35855785e-01],
[5.12675382e-01, 4.87273574e-01, 5.10444754e-05],
[1.37568735e-03, 4.51505176e-01, 5.47119137e-01],
[1.11661461e-03, 4.45720620e-01, 5.53162765e-01],
[9.24839593e-03, 5.00073269e-01, 4.90678335e-01],
[5.12965348e-01, 4.86940172e-01, 9.44806442e-05],
[3.10479637e-01, 6.04577181e-01, 8.49431822e-02]])
lr.predict(X_test)
# Or equivalently
proba_.argmax(axis = 1)
Output:
array([0, 2, 1, 1, 1, 1, 1, 0, 0, 2, 1, 0, 2, 2, 2, 1, 0, 2, 1, 0, 1, 1,
2, 2, 0, 2, 2, 1, 0, 1])
The multi_class parameter defaults to ovr, i.e. one-vs-rest (take one class out as the positive class and treat all the remaining classes as the negative class).
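Under ovr, one binary classifier is trained per class, and predict_proba normalizes the per-class sigmoid scores so each row sums to 1. A minimal sketch of that computation, assuming the w, b, and X_test from the code above:

# One-vs-rest by hand: sigmoid score of each class against the rest,
# then normalize each row so the three probabilities sum to 1
scores = 1/(1 + np.e**(-(X_test.dot(w.T) + b)))  # shape (n_samples, 3)
ovr_proba = scores/scores.sum(axis = 1).reshape(-1,1)
# ovr_proba should closely match lr.predict_proba(X_test)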
How Logistic Regression Works
It turns a classification problem into a probability problem.
Example
Suppose we have a jar containing black and white balls; we know neither the number of balls nor the ratio of the two colors. We want to know the proportion of white balls to black balls in the jar, but we cannot take all the balls out and count them. Instead, we can draw one ball at a time from the well-shaken jar, record its color, and put it back. This process can be repeated, and we can use the recorded colors to estimate the proportions. Suppose that among the first one hundred draws, seventy were white. What is the most likely proportion of white balls in the jar?
What is the proportion of white balls in the jar? Many values are possible: 10%, 5%, 95%...
Many people will immediately answer 70%. But what is the theory behind that answer?
Assume the proportion of white balls in the jar is p, so the proportion of black balls is 1-p. Each ball is returned to the jar and the jar is shaken after its color is recorded, so the draws are independent and identically distributed. We call each draw a sample. The probability of seeing seventy white balls in one hundred draws is P(Data|M), where Data is all the observed data and M is the model stating that each draw is white with probability p. Writing the first draw as x1, the second as x2, and so on, Data = (x1, x2, ..., x100). Then
P(Data|M)
= P(x1, x2, ..., x100|M)
= P(x1|M) P(x2|M) ... P(x100|M)
= p^70 (1-p)^30.
For what value of p is P(Data|M) maximized? Differentiate p^70 (1-p)^30 with respect to p and set the derivative to zero:
70 p^69 (1-p)^30 - 30 p^70 (1-p)^29 = 0
Solving this equation gives p = 0.7.
At the boundary points p = 0 and p = 1 we have P(Data|M) = 0, so P(Data|M) is maximized at p = 0.7. This matches the common-sense answer of taking the proportion observed in the sample.
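The same answer falls out of a brute-force search over p, a useful sanity check; a minimal sketch:

# Evaluate the likelihood p^70 * (1-p)^30 on a grid of p values
# and locate the maximizer: it sits at p = 0.7
grid = np.linspace(0.01, 0.99, 99)
likelihood = grid**70*(1 - grid)**30
print(grid[likelihood.argmax()])  # 0.7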
Here x is the data and y is the target value.
For logistic regression the log-likelihood is l(θ) = Σ_i [y_i log h_θ(x_i) + (1 - y_i) log(1 - h_θ(x_i))]. We want the maximum of l(θ); negating l(θ) gives J(θ), so this is equivalent to finding the minimum of J(θ).
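A minimal sketch of J(θ) in code; y_true and prob are hypothetical arrays of 0/1 labels and predicted probabilities, not variables from the examples above:

def J(y_true, prob):
    # Negative mean log-likelihood (binary cross-entropy)
    return -np.mean(y_true*np.log(prob) + (1 - y_true)*np.log(1 - prob))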
The sigmoid function is defined by sigmoid(x) = 1/(1 + e^(-x)).
Its derivative with respect to x can be expressed through the function itself: sigmoid'(x) = sigmoid(x)(1 - sigmoid(x)).
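The derivative identity is easy to verify numerically; a minimal sketch using a central finite difference:

def sigmoid(x):
    return 1/(1 + np.e**(-x))

# Compare the finite-difference slope with sigmoid(x)*(1 - sigmoid(x))
x0, eps = 0.5, 1e-6
numeric = (sigmoid(x0 + eps) - sigmoid(x0 - eps))/(2*eps)
print(numeric, sigmoid(x0)*(1 - sigmoid(x0)))  # both ~0.235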
Computing Multiclass Probabilities with Logistic Regression
(1) Binary classification probabilities
import numpy as np
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn import datasets
from sklearn.model_selection import train_test_split
X,y = datasets.load_iris(return_X_y=True)
# Keep only classes 0 and 1 to make it a binary problem
cond = y != 2
X = X[cond]
y = y[cond]
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
lr = LogisticRegression()
lr.fit(X_train,y_train)
w = lr.coef_
b = lr.intercept_
# Probabilities predicted by the algorithm
proba_ = lr.predict_proba(X_test)
proba_
Output:
array([[0.9824598 , 0.0175402 ],
[0.95560881, 0.04439119],
[0.94432792, 0.05567208],
[0.9750612 , 0.0249388 ],
[0.97827567, 0.02172433],
[0.0082129 , 0.9917871 ],
[0.01061486, 0.98938514],
[0.00916639, 0.99083361],
[0.02936961, 0.97063039],
[0.9681616 , 0.0318384 ],
[0.02180225, 0.97819775],
[0.00153131, 0.99846869],
[0.03473995, 0.96526005],
[0.0469187 , 0.9530813 ],
[0.94206507, 0.05793493],
[0.98719755, 0.01280245],
[0.00338637, 0.99661363],
[0.00509725, 0.99490275],
[0.98049792, 0.01950208],
[0.00848055, 0.99151945]])
# Compute the probabilities by hand
h = X_test.dot(w[0].T) + b
# Apply the sigmoid function to get the probability p of class 1
p = 1/(1+np.e**(-h))
# Output 1-p and p side by side (class 0 and class 1 probabilities)
np.c_[1-p,p]
Output:
array([[0.9824598 , 0.0175402 ],
[0.95560881, 0.04439119],
[0.94432792, 0.05567208],
[0.9750612 , 0.0249388 ],
[0.97827567, 0.02172433],
[0.0082129 , 0.9917871 ],
[0.01061486, 0.98938514],
[0.00916639, 0.99083361],
[0.02936961, 0.97063039],
[0.9681616 , 0.0318384 ],
[0.02180225, 0.97819775],
[0.00153131, 0.99846869],
[0.03473995, 0.96526005],
[0.0469187 , 0.9530813 ],
[0.94206507, 0.05793493],
[0.98719755, 0.01280245],
[0.00338637, 0.99661363],
[0.00509725, 0.99490275],
[0.98049792, 0.01950208],
[0.00848055, 0.99151945]])
# Binary classification: the algorithm's predicted probabilities and the hand-computed ones are identical
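A one-line check confirms the agreement to floating-point precision; a minimal sketch, reusing proba_ and p from above:

print(np.allclose(proba_, np.c_[1-p, p]))  # True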
(2) Multiclass probabilities
import numpy as np
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn import datasets
from sklearn.model_selection import train_test_split
X,y = datasets.load_iris(return_X_y=True)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2)
#'multinomial' is unavailable when solver='liblinear'
lr = LogisticRegression(multi_class = "multinomial",solver = "saga")
lr.fit(X_train,y_train)
# Multiclass probabilities computed by the algorithm
proba_ = lr.predict_proba(X_test)
proba_
Output:
array([[2.88410409e-02, 8.68430604e-01, 1.02728355e-01],
[1.35425475e-04, 3.53459725e-02, 9.64518602e-01],
[2.74366088e-02, 8.45599073e-01, 1.26964318e-01],
[9.70765988e-01, 2.92329109e-02, 1.10114994e-06],
[3.15180958e-02, 8.72366639e-01, 9.61152657e-02],
[2.18358607e-03, 4.89066995e-01, 5.08749419e-01],
[1.96830364e-02, 7.58260236e-01, 2.22056727e-01],
[9.44853451e-01, 5.51408435e-02, 5.70540180e-06],
[1.13405580e-02, 6.87002802e-01, 3.01656640e-01],
[9.79587742e-01, 2.04117299e-02, 5.27555711e-07],
[5.98951042e-05, 1.18702819e-01, 8.81237286e-01],
[3.47164044e-02, 7.33230956e-01, 2.32052639e-01],
[1.60196473e-04, 6.14016641e-02, 9.38438139e-01],
[1.03166644e-03, 2.35284113e-01, 7.63684221e-01],
[1.12150578e-01, 8.36546008e-01, 5.13034134e-02],
[3.50753564e-03, 3.58180255e-01, 6.38312209e-01],
[9.50721555e-01, 4.92754157e-02, 3.02895857e-06],
[3.22018571e-02, 7.12631766e-01, 2.55166376e-01],
[4.17156930e-02, 8.95362961e-01, 6.29213464e-02],
[7.28390583e-05, 4.82165378e-02, 9.51710623e-01],
[9.34871389e-05, 1.43184860e-01, 8.56721653e-01],
[9.57147401e-01, 4.28510583e-02, 1.54064852e-06],
[1.71849989e-02, 9.09170591e-01, 7.36444101e-02],
[1.63339316e-05, 5.16006260e-02, 9.48383040e-01],
[9.63886223e-01, 3.61103171e-02, 3.45990329e-06],
[4.59892725e-02, 8.79289824e-01, 7.47209033e-02],
[9.82485407e-01, 1.75141232e-02, 4.69953109e-07],
[3.20659441e-02, 8.68291530e-01, 9.96425258e-02],
[2.44365427e-02, 9.02853574e-01, 7.27098834e-02],
[6.67512001e-04, 1.09834800e-01, 8.89497688e-01]])
The softmax formula, with an example
x = np.array([1,3,-1,10])
# Softmax ("soft max"): turns raw scores into probabilities so they can be compared
prob = np.e**(x)/((np.e**(x)).sum())
prob
Output:
array([1.23280114e-04, 9.10923680e-04, 1.66841492e-05, 9.98949112e-01])
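One caveat: np.e**x overflows for large scores. The standard remedy is to subtract the maximum before exponentiating, which leaves the result unchanged; a minimal sketch of that trick:

def softmax(x):
    # e^(x-c)/sum(e^(x-c)) == e^x/sum(e^x) for any constant c,
    # so subtracting max(x) prevents overflow without changing the output
    e = np.exp(x - x.max())
    return e/e.sum()

print(softmax(np.array([1, 3, -1, 10])))  # same probabilities as above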
w = lr.coef_
b = lr.intercept_
h = X_test.dot(w.T) + b
# Softmax function: compute the multiclass probabilities by hand
p = np.e**(h)/((np.e**(h)).sum(axis = 1).reshape(-1,1))
p
Output:
array([[2.88410409e-02, 8.68430604e-01, 1.02728355e-01],
[1.35425475e-04, 3.53459725e-02, 9.64518602e-01],
[2.74366088e-02, 8.45599073e-01, 1.26964318e-01],
[9.70765988e-01, 2.92329109e-02, 1.10114994e-06],
[3.15180958e-02, 8.72366639e-01, 9.61152657e-02],
[2.18358607e-03, 4.89066995e-01, 5.08749419e-01],
[1.96830364e-02, 7.58260236e-01, 2.22056727e-01],
[9.44853451e-01, 5.51408435e-02, 5.70540180e-06],
[1.13405580e-02, 6.87002802e-01, 3.01656640e-01],
[9.79587742e-01, 2.04117299e-02, 5.27555711e-07],
[5.98951042e-05, 1.18702819e-01, 8.81237286e-01],
[3.47164044e-02, 7.33230956e-01, 2.32052639e-01],
[1.60196473e-04, 6.14016641e-02, 9.38438139e-01],
[1.03166644e-03, 2.35284113e-01, 7.63684221e-01],
[1.12150578e-01, 8.36546008e-01, 5.13034134e-02],
[3.50753564e-03, 3.58180255e-01, 6.38312209e-01],
[9.50721555e-01, 4.92754157e-02, 3.02895857e-06],
[3.22018571e-02, 7.12631766e-01, 2.55166376e-01],
[4.17156930e-02, 8.95362961e-01, 6.29213464e-02],
[7.28390583e-05, 4.82165378e-02, 9.51710623e-01],
[9.34871389e-05, 1.43184860e-01, 8.56721653e-01],
[9.57147401e-01, 4.28510583e-02, 1.54064852e-06],
[1.71849989e-02, 9.09170591e-01, 7.36444101e-02],
[1.63339316e-05, 5.16006260e-02, 9.48383040e-01],
[9.63886223e-01, 3.61103171e-02, 3.45990329e-06],
[4.59892725e-02, 8.79289824e-01, 7.47209033e-02],
[9.82485407e-01, 1.75141232e-02, 4.69953109e-07],
[3.20659441e-02, 8.68291530e-01, 9.96425258e-02],
[2.44365427e-02, 9.02853574e-01, 7.27098834e-02],
[6.67512001e-04, 1.09834800e-01, 8.89497688e-01]])
# The hand-computed multiclass probabilities match the algorithm's results
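As before, this can be confirmed with a one-line check; a minimal sketch, reusing proba_ and p from above:

print(np.allclose(proba_, p))  # True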