文章目录
- 普通最小二乘回归(OLS)
- 简单线性回归
- 多项式回归
- 多元线性回归
- 回归诊断
- 标准方法
- QQ图正态性检验
- 残差图
- 误差的独立性
- 成分残差图(偏残差图) 线性
- 同方差性
- 线性模型假设综合验证
- 异常观测值
- 高杠杆值
- 强影响点
- 变量添加图
- 气泡图
- 选择最佳模型
- 逐步回归
- 全子集回归
- k重交叉验证
- 相对重要性
普通最小二乘回归(OLS)
OLS回归是通过预测变量的加权和来预测量化的因变量
简单线性回归
fit <- lm(weight~height,data = women)
summary(fit) ## 模型拟合结果
fitted(fit) ## 模型的拟合值
residuals(fit) ## 拟合模型的残差
plot(women$height,women$weight, ## 简单做图
xlab = "Height",
ylab = "Weight")
abline(fit)
anova(fit) ##生成拟合模型的方差分析表
多项式回归
fit <- lm(weight~height+I(height^2),data = women) ## 一元二次回归
summary(fit)
plot(women$height,women$weight, ## 简单做图
xlab = "Height",
ylab = "Weight")
lines(women$height,fitted(fit))
fit <- lm(weight~height+I(height^2)+I(height^3),data = women) ## 一元三次回归
summary(fit)
library(car)
scatterplot(weight~height,data = women,
spread=FALSE,smoother.arg=list(lty=2),pch=19,
main="Women Age 30-39",
xlab = "Height",
ylab = "Weight")
多元线性回归
state <- as.data.frame(state.x77[,c(5,1,3,2,7)])
cor(state)
library(car)
scatterplotMatrix(state,spread=FALSE,smoother.args=list(lty=2),
main="Scatter Plot Matrix")
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state) ## 多元回归
summary(fit)
fit <- lm(mpg~hp+wt+hp:wt,data = mtcars) ##有显著交互的回归
summary(fit)
library(effects)
plot(effect("hp:wt",fit,,list(wt=c(2.2,3.2,4.2))),mutiline=TRUE)
confint(fit)
回归诊断
标准方法
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
par(mfrow=c(2,2))
plot(fit)
左上表示残差分布图
右上表示正态分布检验,落在线上表示符合正态分布
左下表示方差齐性检验,随机分布则表示方差齐性
右下是残差与杠杆图,表示可以从中鉴别离群点、高杠杆点和强影响点
QQ图正态性检验
library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
par(mfrow=c(1,1))
qqPlot(fit)
残差图
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
residplot <- function(fit,nbreaks=10){
z <- rstudent(fit)
hist(z,breaks = nbreaks,freq = FALSE,
xlab = "Studentized Residual",
main = "Distribution of Errors")
rug(jitter(z),col = "brown")
curve(dnorm(x,mean = mean(z),sd=sd(z)),
add = TRUE,col="blue",lwd=2)
lines(density(z)$x,density(z)$y,
col="red",lwd=2,lty=2)
legend("topright",
legend = c("Normal Curve","Kernel Density Curve"),
lty = 1:2,col = c("blue","red"),cex = .7)
}
residplot(fit)
误差的独立性
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
durbinWatsonTest(fit)
成分残差图(偏残差图) 线性
library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
crPlots(fit)
线性模型对该模型似乎是合适的
同方差性
library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
ncvTest(fit)
spreadLevelPlot(fit)
线性模型假设综合验证
library(gvlma)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
gvmodel <- gvlma(fit)
summary(gvmodel)
异常观测值
library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
outlierTest(fit)
高杠杆值
library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
hat.plot <- function(fit){
p <- length(coefficients(fit))
n <- length(fitted(fit))
plot(hatvalues(fit),main = "Index Plot of Hat Value")
abline(h=c(2,3)*p/n,col="red",lty=2)
identify(1:n,hatvalues(fit),names(hatvalues(fit)))
}
hat.plot(fit)
强影响点
library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
cutoff <- 4/(nrow(state)-length(fit$coefficients)-2)
plot(fit,which = 4,cook.levels = cutoff)
abline(h=cutoff,lty=2,col="red")
变量添加图
library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
avPlots(fit,ask=FALSE,id.method="identity")
气泡图
library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
influencePlot(fit,id.method="identity",main="Influence Plot",
sub="Circle size is proportional to Cook's distance")
选择最佳模型
fit1 <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
fit2 <- lm(Murder~Population+ Illiteracy,data = state)
anova(fit1,fit2)
AIC(fit1,fit2) ## Aic 方法
逐步回归
library(MASS)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
stepAIC(fit,direction = "backward")
全子集回归
library(leaps)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
leaps <- regsubsets(Murder~Population+ Illiteracy+Income+Frost,data = state,nbest = 4)
plot(leaps,scale = "adjr2")
library(car)
subsets(leaps,statistic = "cp",
main="Cp Plot for All Subsets Regression")
abline(1,1,lty=2,col="red")
k重交叉验证
shrinkage <- function(fit, k=10){
require(bootstrap)
theta.fit <- function(x,y){lsfit(x,y)}
theta.predict <- function(fit,x){cbind(1,x)%*%fit$coef}
x <- fit$model[,2:ncol(fit$model)]
y <- fit$model[,1]
results <- crossval(x, y, theta.fit, theta.predict, ngroup=k)
r2 <- cor(y, fit$fitted.values)^2
r2cv <- cor(y, results$cv.fit)^2
cat("Original R-square =", r2, "\n")
cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
cat("Change =", r2-r2cv, "\n")
}
states <- as.data.frame(state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
fit2<-lm(Murder~Population+Illiteracy,data=states)
shrinkage(fit)
shrinkage(fit2)#R平方减少得越少,预测则越精确。
相对重要性
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
zstates <- as.data.frame(scale(states))
#scale()函数将数据标准化为均值为0、标准差为1的数据集,这样用R回归即可获得标准化的回归系数。
#(注意,scale()函数返回的是一个矩阵,而lm()函数要求一个数据框,你需要用一个中间步骤来转换一下。
zfit <- lm(Murder~Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)#Illiteracy是最重要的预测变量,而Frost是最不重要的
#相对重要性~相对权重法
relweights <- function(fit,...){
R <- cor(fit$model)
nvar <- ncol(R)
rxx <- R[2:nvar, 2:nvar]
rxy <- R[2:nvar, 1]
svd <- eigen(rxx)
evec <- svd$vectors
ev <- svd$values
delta <- diag(sqrt(ev))
lambda <- evec %*% delta %*% t(evec)
lambdasq <- lambda ^ 2
beta <- solve(lambda) %*% rxy
rsquare <- colSums(beta ^ 2)
rawwgt <- lambdasq %*% beta ^ 2
import <- (rawwgt / rsquare) * 100
import <- as.data.frame(import)
row.names(import) <- names(fit$model[2:nvar])
names(import) <- "Weights"
import <- import[order(import),1, drop=FALSE]
dotchart(import$Weights, labels=row.names(import),
xlab="% of R-Square", pch=19,
main="Relative Importance of Predictor Variables",
sub=paste("Total R-Square=", round(rsquare, digits=3)),
...)
return(import)
}
states <- as.data.frame(state.x77[,c("Murder", "Population",
"Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
relweights(fit, col="blue")#在这个模型中Illiteracy比Income相对更重要