GitHub repository: https://github.com/fanorfan/CrawlChinesePlaceNames
Contents
- Preparation
  - Install the scrapy module
  - Create a scrapy project
- Python code
  - settings.py
  - items.py
  - pipelines.py
  - place_spiders.py
- Program output
Preparation
Install the scrapy module
Enter the following command at the cmd prompt:
pip install scrapy -i https://pypi.douban.com/simple/
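To confirm that the installation succeeded and that the command-line tool is on the PATH, you can run:
scrapy version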
Create a scrapy project
At the cmd prompt, switch to the directory where you want to create the project, for example:
Microsoft Windows [Version 10.0.18363.720]
(c) 2019 Microsoft Corporation. All rights reserved.
C:\Users\Administrator>D:
D:\>
Enter the command that creates the scrapy project:
scrapy startproject CrawlChinesePlaceNames
Open the newly created project in PyCharm; the directory layout is as follows:
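For reference, a project generated by scrapy startproject typically has roughly this layout (main.py and place_spiders.py are added by hand in the next steps; the exact files may vary slightly between Scrapy versions):
CrawlChinesePlaceNames/
├── scrapy.cfg
└── CrawlChinesePlaceNames/
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── __init__.py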
Python code
- Create a main.py file at the same level as the spiders folder; its code is as follows:
# Author : 不凡不弃
# Datetime : 2020/5/19 0019 20:53
# description : entry point that specifies and runs the crawl task
from scrapy.cmdline import execute
import os
import sys

if __name__ == '__main__':
    sys.path.append(os.path.dirname(os.path.abspath(__file__)))
    # CrawlPlaceSpiders: the name of the spider to run (defined in place_spiders.py)
    execute(['scrapy', 'crawl', 'CrawlPlaceSpiders'])
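main.py is only a convenience wrapper around the Scrapy command line; the same spider can also be started from a terminal in the project root directory:
scrapy crawl CrawlPlaceSpiders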
- Create a place_spiders.py file inside the spiders folder; this is where the actual crawling logic will live.
- The site to crawl: http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/
settings.py
The settings.py file holds the configuration parameters; the code is as follows:
# -*- coding: utf-8 -*-
BOT_NAME = 'CrawlChinesePlaceNames'
SPIDER_MODULES = ['CrawlChinesePlaceNames.spiders']
NEWSPIDER_MODULE = 'CrawlChinesePlaceNames.spiders'
# download delay of 1 second
DOWNLOAD_DELAY = 1
# randomize the download delay
RANDOMIZE_DOWNLOAD_DELAY = True
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
# COOKIES_ENABLED = True
# pipeline registration needed to save the JSON file
ITEM_PIPELINES = {
    'CrawlChinesePlaceNames.pipelines.CrawlchineseplacenamesPipeline': 300,
}
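As a side note, recent Scrapy releases (2.1 and later) can also export JSON Lines without a custom pipeline by using the FEEDS setting instead of ITEM_PIPELINES; a minimal sketch of that alternative, not used in this project:
FEEDS = {
    'data.json': {'format': 'jsonlines', 'encoding': 'utf8'},
}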
items.py
The items.py file declares the fields we want to crawl; the code is as follows:
# -*- coding: utf-8 -*-
# Author : 不凡不弃
# Datetime : 2020/5/19 0019 20:53
# description : fields to crawl
import scrapy


class ChineseItem(scrapy.Item):
    province_num = scrapy.Field()    # province code
    province_name = scrapy.Field()   # province name
    province_url = scrapy.Field()    # province URL
    city_num = scrapy.Field()        # city code
    city_name = scrapy.Field()       # city name
    city_url = scrapy.Field()        # city URL
    county_num = scrapy.Field()      # county code
    county_name = scrapy.Field()     # county name
    county_url = scrapy.Field()      # county URL
    town_num = scrapy.Field()        # town code
    town_name = scrapy.Field()       # town name
    town_url = scrapy.Field()        # town URL
    village_num = scrapy.Field()     # village code
    village_class = scrapy.Field()   # village classification code
    village_name = scrapy.Field()    # village name
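A scrapy.Item behaves much like a dict, which is what the pipeline relies on later when it calls dict(item); a small sketch, with values borrowed from the sample output further below:
from CrawlChinesePlaceNames.items import ChineseItem

item = ChineseItem(province_num='14', province_name='山西省')
item['city_name'] = '临汾市'
print(dict(item))  # {'province_num': '14', 'province_name': '山西省', 'city_name': '临汾市'}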
pipelines.py
The pipelines.py file contains the logic for saving the crawled data; here we write it out as a JSON file. The code is as follows:
# -*- coding: utf-8 -*-
# Author : 不凡不弃
# Datetime : 2020/5/19 0019 20:53
# description : save the crawled data
import json
import codecs


class CrawlchineseplacenamesPipeline:

    def __init__(self):
        # create the file the data will be saved to
        self.file = codecs.open('data.json', 'w', 'utf-8')
        # self.file.write("[")

    # the item parameter is one crawled record at a time
    def process_item(self, item, spider):
        # convert the item to a dict and write it as one line of JSON
        lines = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(lines)
        return item

    def close_spider(self, spider):
        # self.file.write("]")
        # release the file handle
        self.file.close()
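Since process_item writes one JSON object per line, data.json is effectively a JSON Lines file; a minimal sketch for reading it back, assuming the crawl has finished and data.json is in the working directory:
import json

with open('data.json', encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

print(len(records))                # number of village-level records
print(records[0]['village_name'])  # assumes at least one record was crawled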
place_spiders.py
The place_spiders.py file contains the spider's crawling logic; the full code follows the walkthrough below:
- The parse() function runs first.
  - It handles the initial URL in start_urls.
  - In response.xpath("//a[@href]"), "//a[@href]" is an XPath expression that selects the HTML elements we want; what it matches is illustrated in the Selector sketch after this list.
  - It loops over the extracted provinces and temporarily stores three fields: province code, province name, and province URL.
  - Each province URL is then crawled (step two).
- Step two runs the parse_city function.
  - It handles the province URL passed in from the previous step.
  - The XPath expression inside the common() helper works for the city, county, and town pages alike.
  - It loops over the extracted cities, retrieves the item stashed in the previous step, rebuilds the item, and temporarily stores three more fields: city code, city name, and city URL.
  - Each city URL is then crawled (step three).
- Step three runs the parse_county function.
  - Same idea as step two.
- Step four runs the parse_town function.
  - Same idea as step two.
- Step five runs the parse_village function.
  - The XPath expression changes to "//tr[@class='villagetr']".
  - "td[1]/text()" extracts the village code, "td[2]/text()" the village classification code, and "td[3]/text()" the village name.
  - Villages are the deepest level, so the stored item is yielded directly; there are no further URLs to crawl.
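To get a feel for what that "//a[@href]" expression matches, here is a small sketch using scrapy's Selector on a simplified stand-in for the province index page (the HTML below is made up for illustration; only its structure mirrors the real page):
from scrapy import Selector

html = ("<table><tr class='provincetr'>"
        "<td><a href='11.html'>北京市</a></td>"
        "<td><a href='14.html'>山西省</a></td>"
        "</tr></table>")

sel = Selector(text=html)
for a in sel.xpath("//a[@href]"):
    name = a.xpath("text()").extract()[0]   # province name, e.g. '山西省'
    href = a.xpath("@href").extract()[0]    # relative link, e.g. '14.html'
    print(href.split('.', 1)[0], name)      # province code and name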
# Author : 不凡不弃
# Datetime : 2020/5/19 0019 20:53
# description : spider module
import scrapy
from CrawlChinesePlaceNames.items import ChineseItem
import time

# root URL
base_url = 'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/'


# shared helper for extracting links: data links come in (code, name) pairs
def common(response):
    # all links except the last one
    common_list = response.xpath("//a")[:-1]
    # even positions hold the code links, odd positions hold the name links
    number_list = [number for n, number in enumerate(common_list) if n % 2 == 0]
    name_list = [name for n, name in enumerate(common_list) if n % 2 == 1]
    return zip(number_list, name_list)


class CrawlPlaceSpiders(scrapy.Spider):
    # spider name
    name = "CrawlPlaceSpiders"
    # allowed domain
    allowed_domains = ["www.stats.gov.cn"]
    # start page
    start_urls = [
        'http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/index.html'
    ]
    # request header settings
    custom_settings = {
        "DEFAULT_REQUEST_HEADERS": {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.100 Safari/537.36'
        }
    }
    # first layer of the crawl
    def parse(self, response):
        """
        Province
        :param response:
        :return:
        """
        # match the province links
        province_list = response.xpath("//a[@href]")
        # province_list = province_list[3:4]
        # handle each province
        for province in province_list:
            # province name
            province_name = province.xpath("text()").extract()[0]
            # province URL
            province_url = base_url + province.xpath("@href").extract()[0]
            # province code
            province_num = province.xpath("@href").extract()[0].split('.', 1)[0]
            print("Province:", province_name, " URL:", province_url)
            # store the three fields
            item = ChineseItem(province_num=province_num, province_name=province_name, province_url=province_url)
            # province_url: the URL to crawl next
            # callback: the callback that will handle that URL
            # dont_filter=True: do not filter out URLs that fall outside the allowed domains
            request = scrapy.Request(url=province_url, callback=self.parse_city, dont_filter=True)
            # stash the item for the next step
            request.meta['item'] = item
            yield request
    # second layer of the crawl
    def parse_city(self, response):
        """
        City
        :param response:
        :return:
        """
        # handle each city
        for number, name_tem in common(response):
            # city code
            num = number.xpath("text()").extract()[0]
            # city name
            name = name_tem.xpath("text()").extract()[0]
            # city URL
            url = base_url + number.xpath("@href").extract()[0]
            # print("number:", num, " name:", name, " URL:", url)
            # fetch the item stashed in the previous step and rebuild it
            item = response.meta['item']
            item = ChineseItem(province_num=item["province_num"], province_name=item["province_name"],
                               province_url=item["province_url"],
                               city_num=num, city_name=name, city_url=url)
            request = scrapy.Request(url=url, callback=self.parse_county, dont_filter=True)
            request.meta['item'] = item
            yield request
        print("yield all the links!")
    # third layer of the crawl
    def parse_county(self, response):
        """
        County
        :param response:
        :return:
        """
        # handle each county
        for number, name_tem in common(response):
            # county code
            num = number.xpath("text()").extract()[0]
            # county name
            name = name_tem.xpath("text()").extract()[0]
            # county URL
            url = base_url + num[:2] + "/" + number.xpath("@href").extract()[0]
            # print("number:", num, " name:", name, " URL:", url)
            # fetch the item stashed in the previous step and rebuild it
            item = response.meta['item']
            item = ChineseItem(province_num=item["province_num"], province_name=item["province_name"],
                               province_url=item["province_url"],
                               city_num=item["city_num"], city_name=item["city_name"], city_url=item["city_url"],
                               county_num=num, county_name=name, county_url=url)
            request = scrapy.Request(url=url, callback=self.parse_town, dont_filter=True)
            request.meta['item'] = item
            yield request
    # fourth layer of the crawl
    def parse_town(self, response):
        """
        Town
        :param response:
        :return:
        """
        # handle each town
        for number, name_tem in common(response):
            # town code
            num = number.xpath("text()").extract()[0]
            # town name
            name = name_tem.xpath("text()").extract()[0]
            # town URL
            url = base_url + num[0:2] + "/" + num[2:4] + "/" + number.xpath("@href").extract()[0]
            # print("number:", num, " name:", name, " URL:", url)
            # fetch the item stashed in the previous step and rebuild it
            item = response.meta['item']
            item = ChineseItem(province_num=item["province_num"], province_name=item["province_name"],
                               province_url=item["province_url"],
                               city_num=item["city_num"], city_name=item["city_name"], city_url=item["city_url"],
                               county_num=item["county_num"], county_name=item["county_name"],
                               county_url=item["county_url"],
                               town_num=num, town_name=name, town_url=url)
            request = scrapy.Request(url=url, callback=self.parse_village, dont_filter=True)
            request.meta['item'] = item
            yield request
    # fifth layer of the crawl
    def parse_village(self, response):
        """
        Village
        :param response:
        :return:
        """
        # match the village rows
        village_list = response.xpath("//tr[@class='villagetr']")
        # handle each village
        for village in village_list:
            # village code
            village_num = village.xpath("td[1]/text()").extract()[0]
            # village classification code
            village_class = village.xpath("td[2]/text()").extract()[0]
            # village name
            village_name = village.xpath("td[3]/text()").extract()[0]
            # print("number:", village_num, " name:", village_name, " classification:", village_class)
            # fetch the item stashed in the previous step and rebuild it
            item = response.meta['item']
            item = ChineseItem(province_num=item["province_num"], province_name=item["province_name"],
                               province_url=item["province_url"],
                               city_num=item["city_num"], city_name=item["city_name"], city_url=item["city_url"],
                               county_num=item["county_num"], county_name=item["county_name"],
                               county_url=item["county_url"],
                               town_num=item["town_num"], town_name=item["town_name"], town_url=item["town_url"],
                               village_num=village_num, village_name=village_name, village_class=village_class)
            # this is the deepest level, so yield the item directly; no further scrapy.Request() is needed
            yield item
Program output
Run main.py and a data.json file will be generated in the project root directory; a sample record looks like this:
{
"province_num": "14",
"province_name": "山西省",
"province_url": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/14.html",
"city_num": "141000000000",
"city_name": "临汾市",
"city_url": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/14/1410.html",
"county_num": "141032000000",
"county_name": "永和县",
"county_url": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/14/10/141032.html",
"town_num": "141032204000",
"town_name": "交口乡",
"town_url": "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2019/14/10/32/141032204.html",
"village_num": "141032204200",
"village_name": "交口村委会",
"village_class": "210"
}
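If you need the result as a table, the JSON Lines file can be flattened into a CSV with the standard library alone; a minimal sketch, assuming data.json sits in the working directory (the column list mirrors the fields defined in items.py, minus the URL fields):
import csv
import json

fields = ["province_num", "province_name", "city_num", "city_name",
          "county_num", "county_name", "town_num", "town_name",
          "village_num", "village_name", "village_class"]

with open('data.json', encoding='utf-8') as src, \
        open('data.csv', 'w', newline='', encoding='utf-8-sig') as dst:
    writer = csv.DictWriter(dst, fieldnames=fields, extrasaction='ignore')
    writer.writeheader()
    for line in src:
        if line.strip():
            writer.writerow(json.loads(line))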