文章目录

  • 普通最小二乘回归(OLS)
  • 简单线性回归
  • 多项式回归
  • 多元线性回归
  • 回归诊断
  • 标准方法
  • QQ图正态性检验
  • 残差图
  • 误差的独立性
  • 成分残差图(偏残差图) 线性
  • 同方差性
  • 线性模型假设综合验证
  • 异常观测值
  • 高杠杆值
  • 强影响点
  • 变量添加图
  • 气泡图
  • 选择最佳模型
  • 逐步回归
  • 全子集回归
  • k重交叉验证
  • 相对重要性


普通最小二乘回归(OLS)

OLS回归是通过预测变量的加权和来预测量化的因变量

简单线性回归

fit <- lm(weight~height,data = women)
summary(fit)     ## 模型拟合结果
fitted(fit)      ## 模型的拟合值
residuals(fit)   ## 拟合模型的残差

plot(women$height,women$weight,  ## 简单做图
     xlab = "Height",
     ylab = "Weight")
abline(fit)
anova(fit)   ##生成拟合模型的方差分析表

R语言二元逻辑回归 r语言二次回归_回归

多项式回归

fit <- lm(weight~height+I(height^2),data = women)  ## 一元二次回归
summary(fit)

plot(women$height,women$weight,  ## 简单做图
     xlab = "Height",
     ylab = "Weight")
lines(women$height,fitted(fit))

fit <- lm(weight~height+I(height^2)+I(height^3),data = women)  ## 一元三次回归
summary(fit)
library(car)
scatterplot(weight~height,data = women,
            spread=FALSE,smoother.arg=list(lty=2),pch=19,
            main="Women Age 30-39",
            xlab = "Height",
            ylab = "Weight")

R语言二元逻辑回归 r语言二次回归_R语言二元逻辑回归_02

R语言二元逻辑回归 r语言二次回归_学习_03

多元线性回归

state <- as.data.frame(state.x77[,c(5,1,3,2,7)])
cor(state)
library(car)

scatterplotMatrix(state,spread=FALSE,smoother.args=list(lty=2),
                  main="Scatter Plot Matrix")

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)  ## 多元回归
summary(fit)

fit <- lm(mpg~hp+wt+hp:wt,data = mtcars)  ##有显著交互的回归
summary(fit)

library(effects)

plot(effect("hp:wt",fit,,list(wt=c(2.2,3.2,4.2))),mutiline=TRUE)

confint(fit)

R语言二元逻辑回归 r语言二次回归_学习_04

回归诊断

标准方法

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)

par(mfrow=c(2,2))
plot(fit)

R语言二元逻辑回归 r语言二次回归_回归_05


左上表示残差分布图

右上表示正态分布检验,落在线上表示符合正态分布

左下表示方差齐性检验,随机分布则表示方差齐性

右下是残差与杠杆图,表示可以从中鉴别离群点、高杠杆点和强影响点

QQ图正态性检验

library(car)

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
par(mfrow=c(1,1))
qqPlot(fit)

R语言二元逻辑回归 r语言二次回归_回归_06

残差图

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)

residplot <- function(fit,nbreaks=10){
  z <- rstudent(fit)
  hist(z,breaks = nbreaks,freq = FALSE,
       xlab = "Studentized Residual",
       main = "Distribution of Errors")
  rug(jitter(z),col = "brown")
  curve(dnorm(x,mean = mean(z),sd=sd(z)),
        add = TRUE,col="blue",lwd=2)
  lines(density(z)$x,density(z)$y,
        col="red",lwd=2,lty=2)
  legend("topright",
         legend = c("Normal Curve","Kernel Density Curve"),
         lty = 1:2,col = c("blue","red"),cex = .7)
}
residplot(fit)

R语言二元逻辑回归 r语言二次回归_R语言二元逻辑回归_07

误差的独立性

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
durbinWatsonTest(fit)

R语言二元逻辑回归 r语言二次回归_r语言_08

成分残差图(偏残差图) 线性

library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
crPlots(fit)

R语言二元逻辑回归 r语言二次回归_R语言二元逻辑回归_09


线性模型对该模型似乎是合适的

同方差性

library(car)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)

ncvTest(fit)
spreadLevelPlot(fit)

R语言二元逻辑回归 r语言二次回归_回归_10

线性模型假设综合验证

library(gvlma)
fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)

gvmodel <- gvlma(fit)
summary(gvmodel)

R语言二元逻辑回归 r语言二次回归_学习_11

异常观测值

library(car)

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)
outlierTest(fit)

R语言二元逻辑回归 r语言二次回归_学习_12

高杠杆值

library(car)

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)

hat.plot <- function(fit){
  p <- length(coefficients(fit))
  n <- length(fitted(fit))
  plot(hatvalues(fit),main = "Index Plot of Hat Value")
  abline(h=c(2,3)*p/n,col="red",lty=2)
  identify(1:n,hatvalues(fit),names(hatvalues(fit)))
}
hat.plot(fit)

强影响点

library(car)

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)

cutoff <- 4/(nrow(state)-length(fit$coefficients)-2)
plot(fit,which = 4,cook.levels = cutoff)
abline(h=cutoff,lty=2,col="red")

R语言二元逻辑回归 r语言二次回归_拟合_13

变量添加图

library(car)

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)

avPlots(fit,ask=FALSE,id.method="identity")

R语言二元逻辑回归 r语言二次回归_R语言二元逻辑回归_14

气泡图

library(car)

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)
summary(fit)

influencePlot(fit,id.method="identity",main="Influence Plot",
              sub="Circle size is proportional to Cook's distance")

R语言二元逻辑回归 r语言二次回归_回归_15

选择最佳模型

fit1 <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)

fit2 <- lm(Murder~Population+ Illiteracy,data = state)

anova(fit1,fit2)

AIC(fit1,fit2)  ## Aic 方法

逐步回归

library(MASS)

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)

stepAIC(fit,direction = "backward")

全子集回归

library(leaps)

fit <- lm(Murder~Population+ Illiteracy+Income+Frost,data = state)

leaps <- regsubsets(Murder~Population+ Illiteracy+Income+Frost,data = state,nbest = 4)

plot(leaps,scale = "adjr2")

library(car)

subsets(leaps,statistic = "cp",
        main="Cp Plot for All Subsets Regression")
abline(1,1,lty=2,col="red")

R语言二元逻辑回归 r语言二次回归_拟合_16

k重交叉验证

shrinkage <- function(fit, k=10){
  require(bootstrap)
  theta.fit <- function(x,y){lsfit(x,y)}
  theta.predict <- function(fit,x){cbind(1,x)%*%fit$coef}
  x <- fit$model[,2:ncol(fit$model)]
  y <- fit$model[,1]
  results <- crossval(x, y, theta.fit, theta.predict, ngroup=k)
  r2 <- cor(y, fit$fitted.values)^2
  r2cv <- cor(y, results$cv.fit)^2
  cat("Original R-square =", r2, "\n")
  cat(k, "Fold Cross-Validated R-square =", r2cv, "\n")
  cat("Change =", r2-r2cv, "\n")
}
states <- as.data.frame(state.x77[, c("Murder", "Population", "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data = states)
fit2<-lm(Murder~Population+Illiteracy,data=states)
shrinkage(fit)
shrinkage(fit2)#R平方减少得越少,预测则越精确。

R语言二元逻辑回归 r语言二次回归_学习_17

相对重要性

states <- as.data.frame(state.x77[,c("Murder", "Population",
                                     "Illiteracy", "Income", "Frost")])
zstates <- as.data.frame(scale(states))
#scale()函数将数据标准化为均值为0、标准差为1的数据集,这样用R回归即可获得标准化的回归系数。
#(注意,scale()函数返回的是一个矩阵,而lm()函数要求一个数据框,你需要用一个中间步骤来转换一下。
zfit <- lm(Murder~Population + Income + Illiteracy + Frost, data=zstates)
coef(zfit)#Illiteracy是最重要的预测变量,而Frost是最不重要的
#相对重要性~相对权重法
relweights <- function(fit,...){
  R <- cor(fit$model)
  nvar <- ncol(R)
  rxx <- R[2:nvar, 2:nvar]
  rxy <- R[2:nvar, 1]
  svd <- eigen(rxx)
  evec <- svd$vectors
  ev <- svd$values
  delta <- diag(sqrt(ev))
  lambda <- evec %*% delta %*% t(evec)
  lambdasq <- lambda ^ 2
  beta <- solve(lambda) %*% rxy
  rsquare <- colSums(beta ^ 2)
  rawwgt <- lambdasq %*% beta ^ 2
  import <- (rawwgt / rsquare) * 100
  import <- as.data.frame(import)
  row.names(import) <- names(fit$model[2:nvar])
  names(import) <- "Weights"
  import <- import[order(import),1, drop=FALSE]
  dotchart(import$Weights, labels=row.names(import),
           xlab="% of R-Square", pch=19,
           main="Relative Importance of Predictor Variables",
           sub=paste("Total R-Square=", round(rsquare, digits=3)),
           ...)
  return(import)
}
states <- as.data.frame(state.x77[,c("Murder", "Population",
                                     "Illiteracy", "Income", "Frost")])
fit <- lm(Murder ~ Population + Illiteracy + Income + Frost, data=states)
relweights(fit, col="blue")#在这个模型中Illiteracy比Income相对更重要

R语言二元逻辑回归 r语言二次回归_回归_18