中间件
import random

import redis
from scrapy.exceptions import NotConfigured
from twisted.internet.error import ConnectError, TimeoutError


class RandomProxyMiddleWare:
    """Scrapy downloader middleware that assigns a random proxy to requests.

    The proxy pool is a Redis list stored under the key named by the
    ``PROXY_REDIS_KEY`` setting. Per-proxy failure counters live in a Redis
    hash under ``<PROXY_REDIS_KEY>_stats``. A proxy is removed from the pool
    when it has produced ``max_failed`` 401/403 responses, or as soon as it
    causes a connection/timeout error.
    """

    def __init__(self, settings):
        """Step 2: set up the Redis client and the keys used by the pool.

        Args:
            settings: Scrapy settings object; ``PROXY_REDIS_KEY`` is read
                from it.
        """
        self.r = redis.Redis(host='127.0.0.1')
        self.proxy_key = settings.get('PROXY_REDIS_KEY')
        self.proxy_stats_key = self.proxy_key + '_stats'
        # Number of 401/403 responses after which a proxy is evicted.
        self.max_failed = 3

    @property
    def proxies(self):
        """Return the current proxy pool as a list of ``str``.

        Every access performs one ``LRANGE`` round trip to Redis, so callers
        should read it once and reuse the result.
        """
        return [i.decode('utf-8') for i in self.r.lrange(self.proxy_key, 0, -1)]

    @classmethod
    def from_crawler(cls, crawler):
        """Step 1: build the middleware from the crawler's settings.

        Raises:
            NotConfigured: if ``HTTPPROXY_ENABLED`` is false, which disables
                this middleware.
        """
        # Proxies are enabled by default.
        if not crawler.settings.getbool('HTTPPROXY_ENABLED'):
            raise NotConfigured
        return cls(crawler.settings)

    def process_request(self, request, spider):
        """Step 3: attach a random proxy to the request.

        Requests that already carry a proxy, and requests for one of the
        spider's ``start_urls``, are left untouched. Always returns ``None``
        so Scrapy continues processing the request.
        """
        # Cheap local checks first, so Redis is only queried when needed.
        if request.meta.get('proxy') or request.url in spider.start_urls:
            return None
        # Read the pool once: each access to ``self.proxies`` hits Redis.
        candidates = self.proxies
        if candidates:
            # random.choice returns a single element; random.choices would
            # return a one-element list, which is not a valid proxy value.
            request.meta['proxy'] = random.choice(candidates)
        return None

    def process_response(self, request, response, spider):
        """Step 4.0: the download succeeded; check whether the proxy is banned.

        A 401/403 response received through a proxy increments that proxy's
        failure counter. Once the counter reaches ``max_failed`` the proxy is
        removed from the pool and the request is returned for re-scheduling.
        In every other case the response is passed through unchanged.
        """
        cur_proxy = request.meta.get('proxy')
        # Only count failures for requests that actually used a proxy.
        if cur_proxy and response.status in (401, 403):
            # HINCRBY returns the updated counter, so no separate HGET is
            # needed.
            failed_times = int(
                self.r.hincrby(self.proxy_stats_key, cur_proxy, 1)
            )
            if failed_times >= self.max_failed:
                print('got wrong http code (%s) when use %s' % (response.status, cur_proxy))
                # Treat the proxy as banned by the target site and evict it.
                self.remove_proxy(cur_proxy)
                return self._retry_without_proxy(request)
        return response

    def process_exception(self, request, exception, spider):
        """Step 4.1: the download failed.

        If the request used a proxy and the failure is a connection or
        timeout error, the proxy is considered broken: it is removed from the
        pool and the request is returned for re-scheduling. Otherwise returns
        ``None`` so other middlewares can handle the exception.
        """
        cur_proxy = request.meta.get('proxy')
        # The type check must be on the exception, not on the proxy string.
        if cur_proxy and isinstance(exception, (ConnectError, TimeoutError)):
            print('error (%s) occur when use proxy %s' % (exception, cur_proxy))
            self.remove_proxy(cur_proxy)
            return self._retry_without_proxy(request)
        return None

    @staticmethod
    def _retry_without_proxy(request):
        """Strip the proxy from ``request`` and mark it for re-scheduling."""
        request.meta.pop('proxy', None)
        # Without dont_filter the scheduler's duplicate filter would silently
        # drop this already-seen request instead of retrying it.
        request.dont_filter = True
        return request

    def remove_proxy(self, proxy):
        """Remove ``proxy`` from the pool and delete its failure counter.

        Both Redis commands are no-ops when the proxy is not present, so no
        membership check is needed first.
        """
        # redis-py >= 3.0 signature is lrem(name, count, value); count=0
        # removes every occurrence of the value.
        self.r.lrem(self.proxy_key, 0, proxy)
        self.r.hdel(self.proxy_stats_key, proxy)
settings.py 配置文件
# Name of the Redis list that holds the proxy pool. The middleware reads it
# with settings.get('PROXY_REDIS_KEY') and appends '_stats' to it for the
# failure-counter hash. The empty string is a placeholder to be filled in.
PROXY_REDIS_KEY = ""
单独定义一个py文件,把代理IP写入Redis列表(键名与PROXY_REDIS_KEY保持一致)
import redis

# Proxy URLs used to seed the pool.
PROXY = [
    'http://192.169.1.1:8000',
]

# Push every proxy onto the Redis list in a single LPUSH call.
# NOTE(review): the key literal below looks like a placeholder ("the name of
# PROXY_REDIS_KEY") -- presumably it should be replaced with the value
# configured for PROXY_REDIS_KEY; confirm before running.
r = redis.Redis(host='127.0.0.1')
r.lpush('PROXY_REDIS_KEY的名称', *PROXY)