基于Scrapy-redis去重
1、安装scrapy-redis
pip3 install scrapy-redis
2、完全自定义redis去重原理
import redis
from scrapy.dupefilter import BaseDupeFilter
# 类似MD5值的一个数【如果url是一样的那么这个类似md5值的数也是一样的】
from scrapy.utils.request import request_fingerprint
class DupFilter(BaseDupeFilter):
    """Duplicate-request filter that records request fingerprints in a Redis set.

    Each request is reduced to a fingerprint with ``request_fingerprint`` and
    that fingerprint is added to a Redis set; whether the add reports a new
    member decides whether the request counts as already seen.
    """

    def __init__(self, host='127.0.0.1', port=6379, key='visited_urls'):
        """Create the Redis client used to store fingerprints.

        All parameters default to the values that were previously hard-coded,
        so constructing the filter with no arguments behaves as before.

        :param host: Redis server host
        :param port: Redis server port
        :param key: name of the Redis set that holds the seen fingerprints
        """
        self.key = key
        self.conn = redis.Redis(host=host, port=port)

    def request_seen(self, request):
        """
        Check whether the given request has already been visited.

        :param request: the request to check
        :return: True if it has already been visited; False otherwise
        """
        fid = request_fingerprint(request)
        # Add the fingerprint to the Redis set. A reply of 1 is treated as
        # "newly added" (first visit); any other reply means already seen.
        added = self.conn.sadd(self.key, fid)
        return added != 1
translate.py【去重】
# Override the default configuration: the commented-out line is the previous
# value, the active line selects the custom DupFilter class.
#DUPEFILTER_CLASS = 'scrapy.dupefilter.RFPDupeFilter'
DUPEFILTER_CLASS = 'xxd.translate.DupFilter'
3、使用scrapy-redis自带的去重规则
from scrapy_redis.dupefilter import RFPDupeFilter
from scrapy_redis.connection import get_redis_from_settings
from scrapy_redis import defaults
class RedisDupeFilter(RFPDupeFilter):
    """``RFPDupeFilter`` subclass whose Redis key is built from a fixed label."""

    @classmethod
    def from_settings(cls, settings):
        """Build a filter instance from the given settings.

        The key is produced by filling the ``timestamp`` placeholder of
        ``defaults.DUPEFILTER_KEY`` with the constant ``'xiaodongbei'``
        instead of a time value, so every call yields the same key.

        Parameters
        ----------
        settings : scrapy.settings.Settings

        Returns
        -------
        RedisDupeFilter
            An instance of ``cls`` bound to the Redis server obtained from
            ``settings``.
        """
        redis_server = get_redis_from_settings(settings)
        # Constant label in place of a timestamp: the key does not vary
        # between calls.
        dedupe_key = defaults.DUPEFILTER_KEY % dict(timestamp='xiaodongbei')
        return cls(
            redis_server,
            key=dedupe_key,
            debug=settings.getbool('DUPEFILTER_DEBUG'),
        )
duplicate_removal.py
# ############### scrapy-redis connection ####################
# NOTE(review): the host address and password below are hard-coded in source;
# consider loading them from the environment instead — confirm with the owner.
REDIS_HOST = '140.143.227.206' # host name
REDIS_PORT = 8888 # port
REDIS_PARAMS = {'password':'beta'} # Redis connection parameters; default: REDIS_PARAMS = {'socket_timeout': 30,'socket_connect_timeout': 30,'retry_on_timeout': True,'encoding': REDIS_ENCODING,}
REDIS_ENCODING = "utf-8" # Redis encoding; default: 'utf-8'
# REDIS_URL = 'redis://user:pass@hostname:9001' # connection URL (takes precedence over the settings above)
# NOTE(review): the RedisDupeFilter.from_settings shown in these notes formats
# defaults.DUPEFILTER_KEY rather than reading this setting — verify whether
# this value is actually consulted.
DUPEFILTER_KEY = 'dupefilter:%(timestamp)s'
# DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
DUPEFILTER_CLASS = 'dbd.duplicate_removal.RedisDupeFilter'
配置