1. First use of the Scrapy framework: crawling Dangdang books for a given keyword.
---- Create dangdang.py
1) Build the initial crawl URLs: joining the query string onto the base URL
- Using urlencode
# module-level imports in dangdang.py
from urllib.parse import urlencode
from scrapy import Request

# in the spider class
def start_requests(self):
    data = {'key': 'python', 'act': 'input'}
    base_url = 'http://search.dangdang.com/?'
    # MAX_PAGE (total number of pages to crawl) must be set in settings.py
    for page in range(1, self.settings.get('MAX_PAGE') + 1):
        data['page_index'] = page
        params = urlencode(data)        # build the query string
        url = base_url + params         # join it onto the base URL
        yield Request(url, self.parse)  # the parse() callback handles the response
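For reference, this is the query string urlencode builds from that dict (interactive sketch; page_index=1 is just an example value):

>>> from urllib.parse import urlencode
>>> urlencode({'key': 'python', 'act': 'input', 'page_index': 1})
'key=python&act=input&page_index=1'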
- Using urljoin
from urllib.parse import urljoin

BASE_URL = 'https://movie.douban.com/top250'

def scrape_index(page):
    index_url = f'{BASE_URL}/?start={page * 25}&filter='
    return scrape_api(index_url)
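The snippet above builds the index URL with an f-string; urljoin itself is for resolving a relative link against a base URL, for example (interactive sketch with a hypothetical relative path):

>>> from urllib.parse import urljoin
>>> urljoin('https://movie.douban.com/top250', '/subject/1234/')
'https://movie.douban.com/subject/1234/'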
- (to be updated later)
2) Using the parsing function
1. Analyse what needs to be extracted from the target page and define the fields in items.py.
import scrapy

class BooksItem(scrapy.Item):
    collection = table = 'dangdangbook'   # MongoDB collection / MySQL table name
    title = scrapy.Field()
    title_url = scrapy.Field()
    price = scrapy.Field()
    # comment = scrapy.Field()
    author = scrapy.Field()
    detail = scrapy.Field()
2. Write the parse method
# module-level imports in dangdang.py
import scrapy
from bs4 import UnicodeDammit
from books.items import BooksItem

def parse(self, response):
    # log the server status code
    self.logger.debug('Status Code: ' + str(response.status))
    # UnicodeDammit is an encoding detector; here it tries utf-8 and gbk
    dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
    data = dammit.unicode_markup
    selector = scrapy.Selector(text=data)
    # XPath syntax is covered in the Runoob (菜鸟) tutorial; needs more practice
    quotes = selector.xpath("//li[@ddt-pit][starts-with(@class, 'line')]")
    for quote in quotes:
        item = BooksItem()
        item['title'] = quote.xpath('./a[position()=1]/@title').extract_first()
        item['author'] = quote.xpath("./p[@class='search_book_author']/span[position()=1]/a/@title").extract_first()
        item['title_url'] = quote.xpath('./a[position()=1]/@href').extract_first()
        item['price'] = quote.xpath("./p[@class='price']/span[@class='search_now_price']/text()").extract_first()
        item['detail'] = quote.xpath("./p[@class='detail']/text()").extract_first()
        # item['comment'] = quote.css('.li > p.search_star_line > a::text').extract_first()
        yield item
3. Using random request headers: create a RandomUserAgentMiddleware class in middlewares.py, i.e. use the Downloader Middleware component. When the Scheduler takes a Request off the queue and hands it to the Downloader for download, that Request passes through the Downloader Middleware; and when the Downloader finishes the download and returns the Response to the Spider, it passes through the Downloader Middleware again.
import random

class RandomUserAgentMiddleware:
    def __init__(self):
        # pool of User-Agent strings
        self.user_agents = [
            'Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)',
            'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.2 (KHTML, like Gecko) Chrome/22.0.1216.0 Safari/537.2',
            'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:15.0) Gecko/20100101 Firefox/15.0.1',
        ]

    def process_request(self, request, spider):
        # random.choice picks a User-Agent for each outgoing request
        request.headers['User-Agent'] = random.choice(self.user_agents)

    def process_response(self, request, response, spider):
        # rewrite the response status code (just to demonstrate response handling)
        response.status = 201
        return response
The middleware also has to be enabled in settings.py:
# enable the random User-Agent middleware
DOWNLOADER_MIDDLEWARES = {
    'books.middlewares.RandomUserAgentMiddleware': 543,
}
------ Storing the data: Item Pipelines can clean the scraped data and save it to a database
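As an illustration of the cleaning part, a minimal sketch (the class name CleanPipeline and the strip() rule are assumptions, not part of the original project; it would also need its own entry in ITEM_PIPELINES with a smaller number than the storage pipelines so it runs first):

class CleanPipeline:
    # hypothetical cleaning step: strip surrounding whitespace/newlines from text fields
    def process_item(self, item, spider):
        for field in ('title', 'author', 'price', 'detail'):
            if item.get(field):
                item[field] = item[field].strip()
        return item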
1. Saving to MongoDB.

import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    # MONGO_URL and MONGO_DB need to be added to settings.py
    @classmethod
    def from_crawler(cls, crawler):
        # dependency injection: crawler exposes the global settings
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # called when the Spider is opened: set up the connection
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # item.collection is the collection name defined on the Item class
        name = item.collection
        self.db[name].insert_one(dict(item))   # insert the item as a document
        return item

    def close_spider(self, spider):
        # called when the Spider is closed: close the connection
        self.client.close()
2. Saving to MySQL: first you have to create the database and the table.
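The original notes do not show this setup step; below is a sketch using pymysql and the connection values from the settings further down, with column types that are assumptions chosen to match the BooksItem fields:

import pymysql

# one-off setup: create the database and a table matching the BooksItem fields
# (VARCHAR/TEXT sizes are assumptions)
db = pymysql.connect(host='localhost', user='root', password='123456', port=3306)
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS dangdangbook DEFAULT CHARACTER SET utf8")
cursor.execute("""
    CREATE TABLE IF NOT EXISTS dangdangbook.dangdangbook (
        title VARCHAR(255),
        title_url VARCHAR(255),
        price VARCHAR(32),
        author VARCHAR(255),
        detail TEXT
    )
""")
db.close()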
import pymysql

class MysqlPipeline:
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                                  database=self.database, charset='utf8', port=self.port)
        self.cursor = self.db.cursor()   # create a cursor

    def close_spider(self, spider):
        self.db.close()   # close the connection

    def process_item(self, item, spider):
        print(item['title'])                        # debug output
        data = dict(item)                           # item as a plain dict
        keys = ', '.join(data.keys())               # column names joined with ', '
        values = ', '.join(['%s'] * len(data))      # one %s placeholder per column
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))   # run the parameterised INSERT
        self.db.commit()                                 # commit the transaction
        return item
Then, in the global settings, register the pipelines and configure the database connections:
# MongoDB connection
MONGO_URL = 'localhost'
MONGO_DB = 'images360'

# MySQL connection
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'dangdangbook'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
ITEM_PIPELINES = {
    'books.pipelines.MongoPipeline': 300,   # the smaller the value, the higher the priority
    'books.pipelines.MysqlPipeline': 300,
    # 'books.pipelines.BooksPipeline': 300,
}
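With the spider, items, middleware and pipelines in place, the project can be started with "scrapy crawl <spider name>". A small launcher sketch, assuming the spider's name attribute is 'dangdang' and that settings.py also defines the MAX_PAGE value read by the spider above (e.g. MAX_PAGE = 100):

# run.py (assumption: the spider defines name = 'dangdang')
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'dangdang'])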
The Scrapy crawling process, for the Scrapy framework (a minimal spider sketch of this loop follows the list):
- The Engine opens a website, finds the Spider that handles that site, and asks the Spider for the first URL(s) to crawl.
- The Engine gets the first URL to crawl from the Spider and schedules it as a Request through the Scheduler.
- The Engine asks the Scheduler for the next URL to crawl.
- The Scheduler returns the next URL to the Engine, and the Engine forwards it through the Downloader Middlewares to the Downloader for download.
- Once the page is downloaded, the Downloader generates a Response for it and sends it back to the Engine through the Downloader Middlewares.
- The Engine receives the Response from the Downloader and sends it through the Spider Middlewares to the Spider for processing.
- The Spider processes the Response and returns scraped Items and new Requests to the Engine.
- The Engine passes the Items returned by the Spider to the Item Pipeline and the new Requests to the Scheduler.
- The cycle repeats until the Scheduler has no more URLs.
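A minimal sketch of this loop (hypothetical spider name and selectors, not the Dangdang spider above): parse() yields Items, which the Engine hands to the Item Pipeline, and new Requests, which go back to the Scheduler.

import scrapy

class LoopDemoSpider(scrapy.Spider):
    # hypothetical spider illustrating the Engine/Scheduler cycle
    name = 'loop_demo'
    start_urls = ['http://search.dangdang.com/?key=python&act=input&page_index=1']

    def parse(self, response):
        for li in response.xpath("//li[starts-with(@class, 'line')]"):
            # scraped item -> Engine -> Item Pipeline
            yield {'title': li.xpath('./a[1]/@title').get()}
        next_page = response.xpath("//li[@class='next']/a/@href").get()
        if next_page:
            # new Request -> Engine -> Scheduler, and the cycle repeats
            yield response.follow(next_page, callback=self.parse)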