实际工作中需要县级以上的行政区域数据,方便做地址数据的清洗。
原数据地址:国家统计局(http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html)

原数据格式:

爬取中国行政区域数据_字符串

爬取后数据:

爬取中国行政区域数据_html_02

代码如下:

library(rvest)
## Page on stats.gov.cn that lists the administrative division codes
url <- "http://www.stats.gov.cn/tjsj/tjbz/xzqhdm/201703/t20170310_1471429.html"
web <- read_html(url)
## Scrape the raw text: walk from the article container down to each
## paragraph, then to the <span> elements, and keep their text
article_nodes <- html_nodes(web, 'div.TRS_PreAppend')
paragraph_nodes <- html_nodes(article_nodes, "p.MsoNormal")
span_nodes <- html_nodes(paragraph_nodes, "span")
areadata <- html_text(span_nodes)
## Helper function
### Measures the leading/trailing whitespace of a string, or the number of
### characters that remain once that whitespace has been trimmed.
#### x:    character vector to measure.
#### type: which count to return. "trim_num" (the default) is the number of
####       characters removed by trimming; "char_num" is the number of
####       characters left after trimming.
#### Returns an integer vector with one count per element of x.
#### Stops with an error when type is not one of the two supported values.
trim_compute <- function(x, type = "trim_num") {
  # Fail loudly on an unknown type instead of silently returning NULL.
  type <- match.arg(type, c("trim_num", "char_num"))
  # Both results need the trimmed length, so compute it once.
  trimmed_len <- nchar(stringr::str_trim(x))
  if (type == "trim_num") {
    nchar(x) - trimmed_len
  } else {
    trimmed_len
  }
}
## trim_compute() works element-wise on a whole vector, so call it directly.
## Unlike unlist(lapply(...)), this yields integer(0) rather than NULL when the
## scrape comes back empty, so rawdata always keeps all three columns.
trim_num <- trim_compute(areadata, type = "trim_num")
char_num <- trim_compute(areadata, type = "char_num")
rawdata <- data.frame(
  areadata = stringr::str_trim(areadata),
  trim_num,
  char_num,
  stringsAsFactors = FALSE
)
head(rawdata)
## char_num == 0 means the element held nothing but whitespace; drop those rows
rawdata1 <- subset(rawdata, char_num != 0)
head(rawdata1)
## Split the cleaned text into codes and names
## Each code is immediately followed by its name. Locate the codes by pattern
## instead of relying on the hard-coded length 7016, so the split keeps working
## when the page's row count changes and never pads the table with NA rows.
## NOTE(review): assumes every code is exactly six ASCII digits - confirm
## against the scraped page.
cleaned_text <- rawdata1$areadata
code_pos <- grep("^[0-9]{6}$", cleaned_text)
## A code in the very last position has no name after it; ignore it
code_pos <- code_pos[code_pos < length(cleaned_text)]
area_info <-
  data.frame(
    area_code = cleaned_text[code_pos],
    area_name = cleaned_text[code_pos + 1L],
    stringsAsFactors = FALSE
  )
head(area_info)

## Flag province rows and prefecture-level city rows, and derive the join keys
#### A code whose last four characters are 0000 is treated as a province
#### A code whose last two characters are 00, but whose last four are not
#### 0000, is treated as a prefecture-level city
code_tail4 <- stringr::str_sub(area_info$area_code, -4, -1)
code_tail2 <- stringr::str_sub(area_info$area_code, -2, -1)
is_province <- as.numeric(code_tail4 == "0000")
## Multiplying the two logical tests gives 1 only when both hold
is_city <- (code_tail4 != "0000") * (code_tail2 == "00")
## The first two characters are the province key, the first four the city key
province_index <- stringr::str_sub(area_info$area_code, 1, 2)
city_index <- stringr::str_sub(area_info$area_code, 1, 4)

## Append the flags and keys to the table as new columns
area_info$is_province <- is_province
area_info$is_city <- is_city
area_info$province_index <- province_index
area_info$city_index <- city_index
head(area_info, n = 6)

## Province-level rows first (provinces, municipalities, autonomous regions)
key_cols <- c("area_code", "area_name", "province_index", "city_index")
## which() drops rows whose flag is NA, exactly as subset() does
province_data <- area_info[which(area_info$is_province == 1), key_cols]
names(province_data) <- c("province_code", "province_name", "province_index", "city_index")
## Prefecture-level city rows
city_data <- area_info[which(area_info$is_city == 1), key_cols]
head(city_data)
names(city_data) <- c("city_code", "city_name", "province_index", "city_index")

## County/district rows: every row that is neither a province nor a city
is_county <- !(area_info$is_city == 1 | area_info$is_province == 1)
county_data <- area_info[which(is_county), c("area_code", "area_name", "city_index")]
head(county_data)
names(county_data) <- c("county_code", "county_name", "city_index")
## Attach each city to its province with an outer join on the province key
assistdata <- merge(province_data, city_data, by = "province_index", all = TRUE)
head(assistdata)
## Both inputs carry a city_index column, so merge() adds suffixes; the ".y"
## copy is the one that came from city_data
keep_cols <- c("province_code", "province_name", "city_code", "city_name", "city_index.y")
assistdata <- assistdata[, keep_cols]
## all = TRUE keeps cities that have no county rows (Dongguan is one such case)
finish_data <- merge(
  assistdata,
  county_data,
  by.x = "city_index.y",
  by.y = "city_index",
  all = TRUE
)
head(finish_data)
## Drop the join key, which merge() places in the first column
finish_data <- finish_data[, setdiff(names(finish_data), "city_index.y")]
## Export to Excel
library(openxlsx)
write.xlsx(finish_data, file = "china_areadata.xlsx")

数据文件

链接:https://pan.baidu.com/s/1nAmUmme7FSkSd6Oxd8CTkg 密码:3vkm
2017-10-15 于杭州