1 Installation (Linux environment)
First, install Docker:
curl -sSL https://get.daocloud.io/docker | sh
2 Pull the image
sudo docker pull scrapinghub/splash
3 Start the container:
sudo docker run -p 8050:8050 -p 5023:5023 scrapinghub/splash
Splash is now listening on 0.0.0.0, bound to port 8050 (HTTP) and port 5023 (telnet).
With that, Splash is up and running. If you need to reach it remotely and the host is, for example, an Alibaba Cloud server, open port 8050 in the security group for both the inbound and outbound directions.
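A quick way to confirm the service is reachable is to hit the render.html endpoint from any machine that can see the server. A minimal sketch, assuming Splash is running locally on the default port 8050 (swap in your own host otherwise):

import requests

# Minimal reachability check against the Splash HTTP API.
resp = requests.get(
    "http://localhost:8050/render.html",
    params={"url": "https://example.com", "wait": 1},
)
print(resp.status_code)   # 200 means Splash rendered the page
print(resp.text[:200])    # first characters of the rendered HTML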
For details on working with Splash, see
splash-cn-doc.readthedocs.io/zh_CN/lates…
Extras
# List the Docker containers running on the Linux host
docker ps
# Kill a container (338****0d is the container ID)
docker kill 338****0d
If requests you expect to fire repeatedly (for example, inside a loop) only run once, check whether the dont_filter argument of scrapy.Request was left unset — Scrapy's duplicate filter may be silently dropping the repeats. A minimal example is sketched below.
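A minimal sketch (the spider name, URL, and callback names are placeholders):

import scrapy

class RetrySpider(scrapy.Spider):
    name = 'retry_demo'
    start_urls = ['https://example.com/']

    def parse(self, response):
        # Without dont_filter=True this second request to the same URL
        # would be dropped by Scrapy's duplicate filter.
        yield scrapy.Request(response.url, callback=self.parse_again,
                             dont_filter=True)

    def parse_again(self, response):
        self.logger.info('re-fetched %s', response.url)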
requests with Splash: the render.html URL
# Plain requests combined with Splash's render.html endpoint
import requests
from fake_useragent import UserAgent

splash_url = "http://192.168.59.103:8050/render.html?url={}&wait=1"
url = 'https://www.guazi.com/sh/buy/'
headers = {"User-Agent": UserAgent().random}

response = requests.get(splash_url.format(url), headers=headers)
response.encoding = 'utf-8'
print(response.text)
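One caveat with building the URL by string formatting is that the target URL is not percent-encoded, so any query string it carries would leak into Splash's own parameters. A sketch of a safer variant, letting requests encode the parameters itself (same hypothetical Splash host as above):

import requests
from fake_useragent import UserAgent

response = requests.get(
    "http://192.168.59.103:8050/render.html",
    # requests URL-encodes these query parameters for us
    params={"url": "https://www.guazi.com/sh/buy/", "wait": 1},
    headers={"User-Agent": UserAgent().random},
)
response.encoding = 'utf-8'
print(response.text)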
requests with Splash: Lua scripts
# Run a Lua script through Splash's execute endpoint
import requests
from fake_useragent import UserAgent
from urllib.parse import quote

url = "https://www.guazi.com/sh/buy/"
lua_script = '''
function main(splash, args)
    splash:go('{}')
    splash:wait(2)
    return splash:html()
end
'''.format(url)

splash_url = "http://192.168.59.103:8050/execute?lua_source={}".format(quote(lua_script))
headers = {"User-Agent": UserAgent().random}
print(splash_url)

response = requests.get(splash_url, headers=headers)
response.encoding = 'utf-8'
print(response.text)
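URL-encoding a whole script gets unwieldy. The Splash HTTP API also accepts the script in a JSON POST body, with any extra keys exposed to the script as args. A sketch, assuming the same Splash host as above:

import requests

lua_script = '''
function main(splash, args)
    -- the target URL comes in via args instead of being baked into the script
    assert(splash:go(args.url))
    assert(splash:wait(2))
    return splash:html()
end
'''

response = requests.post(
    "http://192.168.59.103:8050/execute",
    json={"lua_source": lua_script, "url": "https://www.guazi.com/sh/buy/"},
)
response.encoding = 'utf-8'
print(response.text)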
Scrapy with Splash
# Configure the address of the Splash service in the Scrapy project's settings, e.g.:
SPLASH_URL = 'http://192.168.59.103:8050'

# Add the Splash middlewares to DOWNLOADER_MIDDLEWARES in settings,
# and set the priority of HttpCompressionMiddleware:
DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

# Install Splash's SplashDeduplicateArgsMiddleware in SPIDER_MIDDLEWARES:
SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

# You can also set the corresponding duplicate filter, DUPEFILTER_CLASS:
DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'

# And a Splash-aware HTTP cache storage in place of
# scrapy.contrib.httpcache.FilesystemCacheStorage:
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'

# Then write the spider like this (method one)
import scrapy
from scrapy_splash import SplashRequest

class BaiduSpider(scrapy.Spider):
    name = 'guazi'
    allowed_domains = ['guazi.com']
    start_urls = ['https://www.guazi.com/sh/buy/']

    def start_requests(self):
        yield SplashRequest(self.start_urls[0], dont_filter=True, args={'wait': 1})

    def parse(self, response):
        print(response.text)

# Method two
import scrapy
from scrapy_splash import SplashRequest

class BaiduSpider(scrapy.Spider):
    name = 'guazi2'
    allowed_domains = ['guazi.com']
    start_urls = ['https://www.guazi.com/sh/buy/']

    def start_requests(self):
        lua_script = '''
        function main(splash, args)
            assert(splash:go(args.url))
            assert(splash:wait(0.5))
            return { html = splash:html() }
        end
        '''
        yield SplashRequest(url=self.start_urls[0], endpoint='execute',
                            args={'lua_source': lua_script})

    def parse(self, response):
        print(response.text)
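Both spiders are run the usual way (scrapy crawl guazi or scrapy crawl guazi2). Since response.text is the HTML already rendered by Splash, parse() can select dynamically loaded elements like any static page; a minimal sketch with placeholder CSS selectors (not Guazi's real markup):

    def parse(self, response):
        for car in response.css('ul.carlist li'):          # hypothetical selector
            yield {
                'title': car.css('h2::text').get(),
                'price': car.css('.price::text').get(),
            }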