看书标记——关于R语言
- chapter 6
- 6.2 任务实
【R语言 商务数据分析实战6】
chapter 6
P2P信用贷款风险控制(用户逾期还款概率模型)
关于数据库的应用+数据清洗+实时数据识别>>探索性分析+寻找关键因素>>建立GBM(梯度提升机)模型+ROC评价模型+参数自动调节
6.2 任务实
对数据进行初步探索,确定关键因素,所以本章节更多的是关于数据格式和数据清洗的一个训练
分析用户信息完善程度与逾期率的关系
# 设置工作目录
setwd()
df.tr.master <- read.csv("./data/Training_Master.csv") # 读取训练集
df.ts.master <- read.csv("./data/Test_Master.csv") # 读取测试集
df.ts.master["target"] <- NA # 测试集没有target,添加一列,并用NA赋值,方便后续合表
df.tr.master[df.tr.master == "不详"] <- NA
df.ts.master[df.ts.master == "不详"] <- NA
df.master <- rbind(df.tr.master, df.ts.master)
df.master[, "na.num"] <- apply(is.na(df.master), 1, sum) # 缺失值个数
# 绘制用户信息完整度和逾期率的关系图
# 绘制主表中用户信息缺失的情况,以缺失个数为纵坐标
plot(df.master[order(df.master[, "na.num"]), "na.num"],
ylab = "用户缺失信息的个数")
lines(x = c(0:50000), y = rep(2, 50001), type = "l", col = "red", lwd = 2)
lines(x = c(0:50000), y = rep(10, 50001), type = "l", col = "red", lwd = 2)
# 剔除离群点,离群点样本数较少,存在偶然性
rid.out <- which((df.master[, "na.num"] <= 10 & df.master[, "na.num"] >= 2))
rid.out.tg <- df.master[rid.out, "target"]
na.num.fre <- table(rid.out.tg, df.master[rid.out, "na.num"])
tg.fre <- na.num.fre[2,] / (na.num.fre[1,] + na.num.fre[2,]) # 计算逾期率
row.names(na.num.fre)
barplot(tg.fre)
group <- c()
for (i in 1:7) {
if (i %% 3 == 1) {
tg.fre.group <- tg.fre[i] + tg.fre[i + 1] + tg.fre[i + 2]
group <- c(group, tg.fre.group)
}
}
# 分组
barplot(group, xaxt = "n", ylab = "逾期率",
xlab = "用户信息缺失的个数", ylim = c(0, 0.25))
text.group <- c("2-4", "5-7", "8-10")
axis(1, at = c(0.7, 1.9, 3.1), labels = text.group, tick = FALSE)
分析用户信息修改情况与逾期率关系
# 分布分析
df.tr.update <- read.csv("./data/Training_Userupdate.csv") # 读取训练集
df.ts.update <- read.csv("./data/Test_Userupdate.csv") # 读取测试集
df.update <- rbind(df.tr.update, df.ts.update) # 合并update数据
# 计算用户更新信息的天数
df.update.num <- table(unique(df.update[c("Idx", "UserupdateInfo2")])$Idx)
df.update.num <- data.frame(df.update.num)
colnames(df.update.num) <- c("Idx", "update.num")
# 绘制用户修改信息天数与逾期率的关系图
# 绘制更新信息表中用户修改信息的情况,以修改的天数为纵坐标
plot(df.update.num[order(df.update.num[, "update.num"]), "update.num"],
ylab = "用户修改信息的天数")
lines(x = c(0:50000), y = rep(5, 50001), type = "l", col = "red", lwd = 2)
rid.out <- which(df.update.num[, "update.num"] <= 5) # 剔除离群点
update.num <- merge(df.master, df.update.num, by = "Idx")
rid.out.tg <- update.num[rid.out, "target"]
update.num <- table(rid.out.tg, df.update.num[rid.out, "update.num"])
update.num <- update.num[2, ] / (update.num[1,] + update.num[2, ]) # 计算逾期率
barplot(update.num, ylim = c(0, 0.12), xlab = "修改信息的天数", ylab = "逾期率")
分析用户所在区域经济发展状况与逾期率关系
# 省GDP
df.gdp.prov <- read.csv("./data/Province_GDP.csv")
library(stringr)
# 去除省或市后面的空格
df.gdp.prov[, "province"] <- str_replace_all(df.gdp.prov[, "province"], " ", "")
# 将省字去掉
df.gdp.prov[, "province"] <- str_replace_all(df.gdp.prov[, "province"], "省", "")
# 将市字去掉
df.gdp.prov[, "province"] <- str_replace_all(df.gdp.prov[, "province"], "市", "")
df.gdp.prov <- df.gdp.prov[, c("province", "provGDPpp")]
prov <- c("UserInfo_7", "UserInfo_19")
# 去掉省字
for (i in (1:length(prov))) {
df.master[, prov[i]] <- str_replace_all(df.master[, prov[i]], "省", "")
}
# 将省人均GDP加入主表中
df.master <- merge(df.master, df.gdp.prov, by.x = "UserInfo_7",
by.y = "province", all = TRUE, sort = FALSE)
# 绘制各省逾期情况图
gdp.tg <- data.frame(df.master$target, df.master$UserInfo_7,
df.master$provGDPpp)
gdp.tg <- gdp.tg[order(gdp.tg[, 3], decreasing = FALSE),]
colnames(gdp.tg) <- c("target", "province", "provGDPpp")
gdp.fre <- table(gdp.tg$target, gdp.tg$provGDPpp)
gdp.fre <- gdp.fre[2,] / (gdp.fre[1,] + gdp.fre[2,])
barplot(gdp.fre, xaxt = "n", ylim = c(0, 0.12), ylab = "逾期率")
text.x = c("甘肃", "贵州", "云南", "西藏", "广西", "安徽", "江西", "山西",
"四川", "河南", "海南", "黑龙江", "青海", "河北", "湖南",
"新疆", "宁夏", "陕西", "湖北", "重庆", "吉林", "山东", "福建",
"广东", "辽宁", "内蒙古", "浙江", "江苏", "上海", "北京", "天津")
num.x = seq(0.8, 36.963, 1.193)
axis(1, at = num.x, labels = text.x, las = 2, tick = FALSE)
par(new = T)
gdp.order <- df.gdp.prov[order(df.gdp.prov[, 2]),]
plot(gdp.order[, 2], ann = FALSE, type = "l", lwd = 2, col = "red",
axes = FALSE, sub = "省人均GDP")
gdp = seq(0, 120000, 30000)
axis(4, at = gdp, labels = gdp, col = "red", lwd = 2)
legend(2, 105000, lty = c(1, NA), pch = c(NA, 15), lwd = c(2, 1),
col = c("red", "gray"), legend = c("省人均GDP", "各省逾期率"))
分析用户借款月份与逾期率关系
# 获取借款成交的月份
df.master[, "listing.month"] <- as.numeric(format(as.Date(df.master$ListingInfo,
format = "%Y/%m/%d"), "%m"))
df.master$ListingInfo <- NULL
write.csv(df.master, "./tmp/df_master.csv", row.names = FALSE) # 写出数据
# 绘制用户借款月份和逾期率的关系图
mon.fre <- table(df.master$target, df.master$listing.month)
mon.fre <- mon.fre[2, ] / (mon.fre[1, ] + mon.fre[2, ])
barplot(mon.fre, xlab = "用户借款月份", ylab = "逾期率")
分析用户信息完善程度与逾期率关系
# 设置工作目录
setwd()
df.tr.master <- read.csv("./data/Training_Master.csv") # 读取训练集
df.ts.master <- read.csv("./data/Test_Master.csv") # 读取测试集
df.ts.master["target"] <- NA # 测试集没有target,添加一列,并用NA赋值,方便后续合表
df.tr.master[df.tr.master == "不详"] <- NA
df.ts.master[df.ts.master == "不详"] <- NA
df.master <- rbind(df.tr.master, df.ts.master)
df.master[, "na.num"] <- apply(is.na(df.master), 1, sum) # 缺失值个数
# 绘制用户信息完整度和逾期率的关系图
# 绘制主表中用户信息缺失的情况,以缺失个数为纵坐标
plot(df.master[order(df.master[, "na.num"]), "na.num"],
ylab = "用户缺失信息的个数")
lines(x = c(0:50000), y = rep(2, 50001), type = "l", col = "red", lwd = 2)
lines(x = c(0:50000), y = rep(10, 50001), type = "l", col = "red", lwd = 2)
# 剔除离群点,离群点样本数较少,存在偶然性
rid.out <- which((df.master[, "na.num"] <= 10 & df.master[, "na.num"] >= 2))
rid.out.tg <- df.master[rid.out, "target"]
na.num.fre <- table(rid.out.tg, df.master[rid.out, "na.num"])
tg.fre <- na.num.fre[2,] / (na.num.fre[1,] + na.num.fre[2,]) # 计算逾期率
row.names(na.num.fre)
barplot(tg.fre)
group <- c()
for (i in 1:7) {
if (i %% 3 == 1) {
tg.fre.group <- tg.fre[i] + tg.fre[i + 1] + tg.fre[i + 2]
group <- c(group, tg.fre.group)
}
}
# 分组
barplot(group, xaxt = "n", ylab = "逾期率",
xlab = "用户信息缺失的个数", ylim = c(0, 0.25))
text.group <- c("2-4", "5-7", "8-10")
axis(1, at = c(0.7, 1.9, 3.1), labels = text.group, tick = FALSE)
(对P2P数据进行预处理)
使用第三方平台信息构建新特征
# 设置工作目录
setwd()
library(stringr)
# 代码 6-5
df.master <- read.csv("./tmp/df_master.csv", stringsAsFactors = FALSE)
library(stringr)
# 第三方信息处理
colnames(df.master)
third.Info1 <- df.master[, 91:209] # 提取第三方数据
third.Info_var <- apply(third.Info1, 2, var) # 按列求方差
# 按降序排序
third.Info_var1 <- third.Info_var[order((third.Info_var), decreasing = TRUE)]
barplot(third.Info_var1[1:20], las = 2, col = rainbow(20),
width = 3, legend.text = FALSE) # 绘制方差前20的柱状图
third.info <- matrix(nrow = 17, ncol = 7)
for (i in (1:17)) {
for (j in (1:7)) {
third.info[i, j] <- paste("ThirdParty_Info_Period", j, "_", i, sep = "")
}
}
# 对ThirdParty_Infoi_j分别进行简单统计,如求最大值,最小值中位数和标准差
for (i in (1:17)) {
colns <- paste("ThirdParty_Info_", i, "_max", sep = "")
df.master[, colns] <- apply(df.master[, third.info[i, ]], 1, max)
}
for (i in (1:17)) {
colns <- paste("ThirdParty_Info_", i, "_min", sep = "")
df.master[, colns] <- apply(df.master[, third.info[i, ]], 1, min)
}
for (i in (1:17)) {
colns <- paste("ThirdParty_Info_", i, "_median", sep = "")
df.master[, colns] <- apply(df.master[, third.info[i, ]], 1, median)
}
for (i in (1:17)) {
colns <- paste("ThirdParty_Info_", i, "_sd", sep = "")
df.master[, colns] <- apply(df.master[, third.info[i, ]], 1, sd)
}
for (i in (1:7)) {
colns <- paste("ThirdParty_Info_period", i, "_max", sep = "")
df.master[, colns] <- apply(df.master[, third.info[, i]], 1, max)
}
for (i in (1:7)) {
colns <- paste("ThirdParty_Info_period", i, "_min", sep = "")
df.master[, colns] <- apply(df.master[, third.info[, i]], 1, min)
}
for (i in (1:7)) {
colns <- paste("ThirdParty_Info_period", i, "_median", sep = "")
df.master[, colns] <- apply(df.master[, third.info[, i]], 1, median)
}
for (i in (1:7)) {
colns <- paste("ThirdParty_Info_period", i, "_sd", sep = "")
df.master[, colns] <- apply(df.master[, third.info[, i]], 1, sd)
}
write.csv(df.master, "./tmp/df_master_clear.csv", row.names = FALSE) # 写出数据
对登陆信息表与更新信息表进行长宽表转换
# 读取数据
df.tr.update <- read.csv("./data/Training_Userupdate.csv")
df.ts.update <- read.csv("./data/Test_Userupdate.csv")
df.tr.log <- read.csv("./data/Training_LogInfo.csv")
df.ts.log <- read.csv("./data/Test_LogInfo.csv")
# 合并数据
df.update <- rbind(df.tr.update, df.ts.update)
df.log <- rbind(df.tr.log, df.ts.log)
# 求出最早修改日期,最晚修改日期和放款日期之差。
# 将字段变成日期形式
df.log1 <- data.frame(df.log[, 1], as.Date(df.log[, 2]), as.Date(df.log[, 5]))
df.log1 <- unique(df.log1) # 去掉重复字段
# 求出借款成交时间和登录时间的时间差
df.log2 <- data.frame(df.log1[, 1], df.log1[, 2] - df.log1[, 3])
colnames(df.log2) <- c("Idx", "date.diff") # 对字段进行重命名,方便后期合表
df.fir.log <- aggregate(df.log2[, 2], list(df.log2[, 1]), max) # 最早修改时间
colnames(df.fir.log) <- c("Idx", "first.log") # 对字段进行重命名,方便后期合表
df.fir.log[, "first.log"] <- sapply(df.fir.log[, "first.log"], as.numeric)
df.last.log <- aggregate(df.log2[, 2], list(df.log2[, 1]), min) # 最晚修改时间
colnames(df.last.log) <- c("Idx", "last.log") # 对字段进行重命名,方便后期合表
df.last.log[, "last.log"] <- sapply(df.last.log[, "last.log"], as.numeric)
# 将操作类型和操作代码,以“,”分隔,保存在新增的log.full列
df.log[, "log.full"] <- as.data.frame(paste(df.log[, 3], df.log[, 4], sep = ","))
# 计算用户总的登录类型数目
df.log.cats <- as.data.frame(table(unique(df.log[, c(1, 6)])["Idx"]))
colnames(df.log.cats) <- c("Idx", "log.cats") # 对表中字段进行重命名
# 对每种登录类型按Idx计数
df.log.type <- as.data.frame.matrix(xtabs( ~ Idx + log.full, df.log, sparse = TRUE))
df.log.type[, "Idx"] <- rownames(df.log.type) # 增加Idx字段,方便后续合表
# 计算用户第一次登录之后的每一天登录平台的频率
df.log.1tab <- as.data.frame(table(df.log[, 1]))
df.log.fre <- data.frame(df.log.1tab[, 1], df.log.1tab[, 2] /
as.data.frame(as.numeric(df.fir.log[, 2]) + 1))
colnames(df.log.fre) <- c("Idx", "log.frequency")
# 计算用户登录平台的天数
df.log.num <- table(unique(df.log[c("Idx", "LogInfo3")])$Idx)
df.log.num <- data.frame(df.log.num)
colnames(df.log.num) <- c("Idx", "lognum")
# 根据Idx字段合并表格
df.log.final <- merge(df.fir.log, df.last.log, by = "Idx")
df.log.final <- merge(df.log.final, df.log.cats, by = "Idx")
df.log.final <- merge(df.log.final, df.log.fre, by = "Idx")
df.log.final <- merge(df.log.final, df.log.type, by = "Idx")
df.log.final <- merge(df.log.final, df.log.num, by = "Idx")
write.csv(df.log.final, "./tmp/df_log_final.csv", row.names = FALSE) # 写出数据
# Process UserUpdate
# 字符转换
# 将update数据框中的‘_’替换成为""
df.update[, 3] <- str_replace_all(df.update[, 3], "[_]", "")
df.update[, 3] <- str_to_lower(df.update[, 3]) # 将所有字母变成小写
# 对每种修改的特征按Idx计数
df.update1 <- as.data.frame.matrix(xtabs( ~ Idx + UserupdateInfo1,
df.update, sparse = TRUE))
# 计算平均每个特征用户会修改的频率
df.update1[, "updatecompl"] <- apply(df.update1, 1, sum) / length(df.update1)
Idx <- rownames(df.update1)
df.update1 <- data.frame(Idx, df.update1) # 增加Idx字段,方便后续合表
df.update2 <- data.frame(df.update[, 1], as.Date(df.update[, 2]),
as.Date(df.update[, 4])) # 将字段变成日期形式
df.update2 <- unique(df.update2)
# 求出时间差
df.update3 <- data.frame(df.update2[, 1], df.update2[, 2] - df.update2[, 3])
colnames(df.update3) <- c("Idx", "date.diff") # 对表中字段进行重命名
# 最早修改时间
df.fir.update <- aggregate(df.update3[, 2], list(df.update3[, 1]), max)
colnames(df.fir.update) <- c("Idx", "first.update") # 对表中字段进行重命名
df.fir.update[, "first.update"] <- sapply(df.fir.update[, "first.update"],
as.numeric)
# 最晚修改时间
df.last.update <- aggregate(df.update3[, 2], list(df.update3[, 1]), min)
colnames(df.last.update) <- c("Idx", "last.update") # 对表中字段进行重命名
df.last.update[, "last.update"] <- sapply(df.last.update[, "last.update"],
as.numeric)
# 计算用户更改特征数目
df.update.cats <- as.data.frame(table(unique(df.update[, c(1, 3)])["Idx"]))
colnames(df.update.cats) <- c("Idx", "update.cats") # 对表中字段进行重命名
# 计算用户第一次更新信息之后的每一天更新信息的频率
df.update.1tab <- as.data.frame(table(df.update1[, 1]))
df.update.fre <- data.frame(df.update.1tab[, 1], df.update.1tab[, 2] /
as.data.frame(as.numeric(df.fir.update[, 2]) + 1))
colnames(df.update.fre) <- c("Idx", "update.frequency")
# 计算用户更新信息的天数
df.update.num <- table(unique(df.update[c("Idx", "UserupdateInfo2")])$Idx)
df.update.num <- data.frame(df.update.num)
colnames(df.update.num) <- c("Idx", "update.num")
# 合并数据框
df.update.final <- merge(df.fir.update, df.last.update, by = "Idx")
df.update.final <- merge(df.update.final, df.update.cats, by = "Idx")
df.update.final <- merge(df.update.final, df.update.fre, by = "Idx")
df.update.final <- merge(df.update.final, df.update1, by = "Idx")
df.update.final <- merge(df.update.final, df.update.num, by = "Idx")
write.csv(df.update.final, "./tmp/df_update_final.csv", row.names = FALSE)
转换与清洗P2P信贷数据
# 第一部分:针对类别型特征的处理
df.master <- read.csv("./tmp/df_master_clear.csv", stringsAsFactors = TRUE)
# 将列UserInfo_9的空格符去掉
df.master[, "UserInfo_9"] <- str_replace_all(df.master[, "UserInfo_9"], " ", "")
df.master[, "UserInfo_9"] <- as.factor(df.master[, "UserInfo_9"]) # 字符转换为因子
# 省份是否相同,相同为1,不同为0
df.master[, "diffprov"] <- as.integer(df.master["UserInfo_7"] ==
df.master["UserInfo_19"])
# 当diffprov列为NA时,为-1
df.master[is.na(df.master["diffprov"]), "diffprov"] <- as.integer(-1)
# 字符处理
city <- c("UserInfo_2", "UserInfo_4", "UserInfo_8", "UserInfo_20")
for (i in (1:length(city))) {
# 去掉市字
df.master[, city[i]] <- str_replace_all(df.master[, city[i]], "市", "")
}
# 市是否相同
diff.cols <- c()
for (i in (1:(length(city) - 1))) {
for (j in (length(city):(i + 1))) {
tmp <- paste("UserInfodiff_", strsplit(city[i], "_")[[1]][2], "_",
strsplit(city[j], "_")[[1]][2], sep = "")
# 用UserInfodiff_1_2表示第1列和第2列的城市是否相同
diff.cols <- c(diff.cols, tmp)
df.master[, tmp] = as.integer(df.master[, city[i]] == df.master[, city[j]])
}
}
# 为空则赋值-1
df.master[, diff.cols][is.na(df.master[, diff.cols])] <- as.integer(-1)
prov <- c("UserInfo_7", "UserInfo_19")
# 用prov_i代替省名
for (i in (1:length(prov))) {
df.master[, prov[i]] <- as.factor(df.master[, prov[i]])
df.master[, prov[i]] <- as.integer(df.master[, prov[i]])
df.master[, prov[i]] <- paste("prov", df.master[, prov[i]], sep = "_")
df.master[, prov[i]] <- as.factor(df.master[, prov[i]])
}
# 用city_i代替市名
for (i in (1:length(city))) {
df.master[, city[i]] <- as.factor(df.master[, city[i]])
df.master[, city[i]] <- as.integer(df.master[, city[i]])
df.master[, city[i]] <- paste("city", df.master[, city[i]], sep = "_")
df.master[, city[i]] <- as.factor(df.master[, city[i]])
}
# 数据转换
# factor型特征
# 哑变量化函数
DummyFact <- function(x, a = 0.002) {
# x为各特征的名称
# a为默认值,指样本占总样本的百分比为0.002
x <- as.character(x)
tmp <- table(x) / length(x) # 样本占总样本的百分比
# 小数量样本占总数比率超过a,则合并为other
if (sum(tmp[tmp < a]) > a) {
tmp1 <- tmp[tmp < a]
rowlist <- rep(F, length(x))
for (i in (1:length(tmp1))) {
rowlist <- (rowlist | (names(tmp1[i]) == x))
}
x[rowlist] <- "other"
}
# 缺失值占总数比率大于a,则为unknown
rowlist <- is.na(x) | is.infinite(x)
if (1 - sum(tmp) >= a) {
x[rowlist] <- "unknow"
}
else {
# 小于a,则不归为一类,赋值为众数
x[rowlist] <- names(tmp[tmp == max(tmp)])
}
x <- as.factor(x)
}
# 哑变量化
library(caret)
isf <- as.logical(lapply(df.master[1, ], is.factor))
df.master.f <- df.master[, isf]
df.master.nf <- df.master[, !isf]
df.master.f <- apply(df.master.f, 2, DummyFact)
mainEffects <- dummyVars(~., data = df.master.f, sep = "_")
df.master.f <- predict(mainEffects, df.master.f)
df.master <- cbind(df.master.nf, df.master.f)
第二部分:数值型数据的缺失值处理
df.update.final <- read.csv("./tmp/df_update_final.csv",
stringsAsFactors = FALSE)
df.log.final <- read.csv("./tmp/df_log_final.csv", stringsAsFactors = FALSE)
# 缺失值统计
MissSum <- function(x) {
a <- sum(is.na(x)) # x:向量
return(a)
}
miss.sum.num <- sapply(df.master, MissSum) # 统计各列缺失值个数
miss.rate <- miss.sum.num / length(df.master[, 1])
# 绘制特征缺失比率图
barplot(miss.rate[order(miss.rate, decreasing = TRUE)[1:5]], ylim = c(0, 1),
main = "特征缺失比率图", xlab = "特征名称", ylab = "缺失比率")
df.master <- df.master[miss.rate < 0.965] # 将缺失值大于0.965的列剔除
# 多重共线性处理,先处理缺失值,数值型缺失值填充为中位数
library(Hmisc)
num.miss.col <- sapply(df.master, anyNA)
num.miss.col <- as.data.frame(num.miss.col[num.miss.col == TRUE])
num.miss.col <- setdiff(row.names(num.miss.col), "target")
df.master[num.miss.col] <- impute(df.master[num.miss.col], median) # 插补中位数
第三部分:筛选冗余特征
inval.cols <- c("Idx", "target")
line.cols <- names(df.master)
line.cols <- setdiff(line.cols, inval.cols)
df.master[, line.cols][is.na(df.master[, line.cols])] <- as.integer(-1)
df.cor <- cor(df.master[,line.cols])
df.len <- length(df.cor[1, ])
cor.col <- c(1:df.len)
# 将相关系数大于0.99的列剔除
for (i in (1:(df.len - 1))) {
for (j in c((i + 1):df.len)) {
if (!is.na(df.cor[i, j])) {
if (abs(df.cor[i, j]) > 0.99) {
tmp <- which(cor.col == j)
if (length(tmp) > 0) {
cor.col <- cor.col[-tmp]
}
}
}
}
}
df.master <- df.master[c("Idx", line.cols[cor.col], "target")]
# 合并数据
df.final <- merge(df.update.final, df.log.final, by = "Idx", all = TRUE)
df.final <- merge(df.final, df.master, by = "Idx", all = TRUE)
# log和update两张表数据缺失值填充为0
inval.cols <- c("Idx", "target")
log.up.col <- c(colnames(df.update.final)[-1], colnames(df.log.final)[-1])
mat <- is.na(df.final[, log.up.col])
int0 <- as.integer(0)
for (i in (1:length(mat[1, ]))) {
df.final[mat[, i], log.up.col[i]] = int0
}
# 同一值去字段
SmVal <- function(x) {
# x:vector
valtlb <- table(x) / length(x)
if (sum(valtlb[valtlb > 0.99]) != 0)
return(F)
else
return(T)
}
cols <- apply(df.final, 2, SmVal)
df.final <- df.final[, cols]
# 变异系数去字段
Csd <- function(x) {
# x:vector
a <- sd(x, na.rm = TRUE) / mean(x, na.rm = TRUE)
return(a)
}
cols <- sapply(df.final, is.numeric)
cols[inval.cols] <- F
# 求出数值型字段标准差,要求不包含缺失值
cov.num.cols <- lapply(df.final[cols], Csd)
df.final <- df.final[cov.num.cols >= 0.1] # 剔除变异系数小于0.1的字段
write.csv(df.final, "./tmp/df_final.csv", row.names = FALSE)
(建模部分)
关于GBM(gradient boosting machine)梯度提升机(分类、回归、排序问题):这是一种提高模型精度的建模方法,原理与Boosting相似,但是其对于正确和错误样本的加权与Boosting不同。GBM在梯度方向上建立新的模型,因为是以残差减少作为梯度提升的方向,所以新模型的建立都是为了减少残差而存在的。关于模型的评价可以用ROC曲线评定,TPR/FPR。
优点:鲁棒性强(适用能力强)、相对少参数时可以有较好的预测效果,和SVM一样是泛化能力较强的算法
缺点:弱学习器之间存在依赖关系,难以并行训练数据,调参与训练时间长
# 设置工作目录并读取数据
setwd()
df.final <- read.csv("./tmp/df_final.csv", stringsAsFactors = FALSE)
# 建模
# 对预处理后的数据分训练集和测试集
line.cols <- names(df.final)
line.cols <- setdiff(line.cols, "Idx")
train.data <- df.final[ ! is.na(df.final$target), line.cols]
test.data <- df.final[is.na(df.final$target), line.cols]
test.idx <- df.final[is.na(df.final$target), "Idx"]
test.idx <- data.frame(Idx = test.idx)
test.data[, "target"] <- NULL
library(gbm)
library(pROC)
gbm.model <- gbm(target ~ ., data = train.data, distribution = "adaboost",
n.trees = 1500, shrinkage = 0.01,
interaction.depth = 4, bag.fraction = 0.5,
train.fraction = 0.5, n.minobsinnode = 10,
cv.folds = 3, keep.data = TRUE, verbose = FALSE, n.cores = 2)
best.iter <- gbm.perf(gbm.model, method = "cv") # 用交叉检验确定最佳迭代次数
best.iter
impval <- summary(gbm.model, best.iter) # 查看特征重要程度
# 画出特征重要性图
barplot(impval[c(1 : 20), 2], names = impval[c(1 : 20), 1],
col = rainbow(20), las = 2, cex.names = 0.5, ylim = c(0, 4),
ylab = "特征的重要程度")
legend(16, 4.1, legend = impval[c(1 : 20), 1],
fill = rainbow(20), bty = "o", ce)
# 评价模型
gbm.pred <- predict.gbm(gbm.model, test.data, type = "response" ) # 预测
final.test <- read.csv("./data/Test_Master_result.csv") # 导入真实结果
test.merge <- merge(test.idx, final.test, by = "Idx")
library(ROCR)
pred.both <- prediction(gbm.pred, test.merge$target)
perf.both <- performance(pred.both, "tpr", "fpr")
plot(perf.both, main = "ROC曲线", col = "blue", lwd = 5) # 画出ROC曲线图
roc(test.merge$target, gbm.pred) # 求出曲线下方面积