一、爬虫

1、概述

网络爬虫,搜索引擎就是爬虫的应用者。

 

2、爬虫分类

(1)通用爬虫,常见就是搜索引擎,无差别的收集数据,存储,提取关键字,构建索引库,给用户提供搜索接口。

爬取一般流程:

 

初始化一批URL,将这些url放入到等待爬取队列。

从队列取出这些url,通过dns解析ip,对应ip站点下载HTML页面,保存到本地服务器中,爬取完的url放到已爬取队列。

分析这些网页内容,找出网页里面关心的url连接,继续执行第二步,直到爬取结束。

 

 

搜索引擎如何获取一个新网站的url。

 

新网站主动提交给搜索引擎。

通过其他网站页面中设置的外链。

搜索引擎和dns服务商合作,获取最新收录的网站。

 

 

 

(2)聚焦爬虫

有针对性的编写特定领域数据的爬取程序,针对某些类别数据的采集的爬虫,是面向主题的。

 

 

3、robots协议

指定一个robots.txt文件,告诉爬虫引擎什么可以爬取。

 

这个协议为了让搜索引擎更有效率搜索自己内容,提供了sitemap这样的文件。

这个文件中禁止抓取的往往又是我们可能感兴趣的内容,反而泄露了这些地址。

 

4、http请求和响应处理

 

爬取网页就是通过HTTP协议访问网页;通过浏览器访问往往是人的行为,爬虫要解决的是把程序伪装成人的行为的问题。

 

 

 

Urllib包

 

# Minimal urllib example: fetch a page and inspect the HTTPResponse object.
from urllib.request import urlopen


response = urlopen('http://www.bing.com')

# False here: the response stays open until the with-block exits.
print(response.closed)


# HTTPResponse is a context manager; it is closed automatically on exit.
with response:

    print(response.status)

    print(response._method)  # NOTE(review): private attribute — not a public API

    print(response.read())

    print(response.closed)

    print(response.info)  # NOTE(review): info is not called — this prints the bound method

print(response.closed)  # True: the with-block closed the response

 

使用等,urllib包,使用查询等。

 

 

解决useragent问题:

 

 

# Set a browser-like User-Agent so the server serves the normal page.
from urllib.request import urlopen, Request


url = 'http://www.bing.com'

# FIX: the UA string literal was broken across two physical lines by the
# notes export; rejoined via implicit string concatenation.
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

req = Request(url, headers={'User-agent': ua})
response = urlopen(req, timeout=10)
# print(req)
print(response.closed)

with response:
    print(response.status)
    print(response._method)
    # print(response.read())
    # print(response.closed)
    # # print(response.info)
    print(response.geturl())

print(req.get_header('User-agent'))
print(response.closed)

 

 

Chrome浏览器获取useragent

5、parse

 

from urllib import parse


# Sample query parameters to be form-encoded.
d = {'id': 1, 'name': 'tom', 'url': 'http://www.magedu.com'}

url = 'http://www.magedu.com'

u = parse.urlencode(d)  # percent-encode into "k=v&k=v" form
print(u)

print(parse.unquote(u))  # decode back for display

 

6、请求方法

# POST form data to httpbin and decode the JSON echo.
from urllib import parse
from urllib.request import urlopen, Request
import json  # FIX: stdlib json replaces the third-party simplejson


base_url = 'http://cn.bing.com/search'

d = {
    'q': '马哥教育'
}

u = parse.urlencode(d)  # percent-encode the query string

# url = '{}?{}'.format(base_url, u)
# print(parse.unquote(url))

url = 'http://httpbin.org/post'

# FIX: UA literal was split across physical lines in the notes export.
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

# urlencode also escapes characters that are special inside a query string.
data = parse.urlencode({'name': '张三,@=/&*', 'age': '6'})

req = Request(url, headers={
    'User-agent': ua
})

# Passing data= makes urlopen issue a POST; the body must be bytes.
with urlopen(req, data=data.encode()) as res:
    text = res.read()
    d = json.loads(text)
    print(d)
    # with open('c:/assets/bing.html', 'wb+') as f:
    #     f.write(res.read())
    #     f.flush()
# POST form data to httpbin and decode the JSON echo.
from urllib import parse
from urllib.request import urlopen, Request
import json  # FIX: stdlib json replaces the third-party simplejson


base_url = 'http://cn.bing.com/search'

d = {
    'q': '马哥教育'
}

u = parse.urlencode(d)  # percent-encode the query string

# url = '{}?{}'.format(base_url, u)
# print(parse.unquote(url))

url = 'http://httpbin.org/post'

# FIX: UA literal was split across physical lines in the notes export.
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

# urlencode also escapes characters that are special inside a query string.
data = parse.urlencode({'name': '张三,@=/&*', 'age': '6'})

req = Request(url, headers={
    'User-agent': ua
})

# Passing data= makes urlopen issue a POST; the body must be bytes.
with urlopen(req, data=data.encode()) as res:
    text = res.read()
    d = json.loads(text)
    print(d)
    # with open('c:/assets/bing.html', 'wb+') as f:
    #     f.write(res.read())
    #     f.flush()

 

 

 

7、爬取豆瓣网

 

# Query douban's movie-subjects JSON endpoint.
from urllib.request import Request, urlopen
from urllib import parse  # FIX: import was split mid-line in the notes export
import json  # FIX: stdlib json replaces the third-party simplejson


ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

jurl = 'https://movie.douban.com/j/search_subjects'

# Query parameters understood by the endpoint.
d = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 10,
    'page_start': 10
}

req = Request('{}?{}'.format(jurl, parse.urlencode(d)), headers={
    'User-agent': ua
})

with urlopen(req) as res:
    sub = json.loads(res.read())
    print(len(sub))
    print(sub)

 

8、解决https,ca证书的问题

 

忽略证书,ssl

 

# Ignore an invalid/unverifiable certificate when fetching over HTTPS.
from urllib.request import Request, urlopen
from urllib import parse  # FIX: import was split mid-line in the notes export
import ssl


# request = Request('http://www.12306.cn/mormhweb')
request = Request('http://www.baidu.com')
request.add_header(
    'User-agent',
    'Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
)

# Unverified context: skips certificate validation (testing only — insecure).
context = ssl._create_unverified_context()

with urlopen(request, context=context) as res:
    print(res._method)
    print(res.read())

 

 

9、urllib3

pip install urllib3
 
import urllib3


url = 'http://movie.douban.com'

# FIX: UA literal was split across several lines in the notes export.
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

# PoolManager is urllib3's connection-pool manager.
with urllib3.PoolManager() as http:
    response = http.request('GET', url, headers={'User-agent': ua})
    print(1, response)
    print(2, type(response))
    print(3, response.status, response.reason)
    print(4, response.headers)
    print(5, response.data)
 
import urllib3
from urllib.parse import urlencode
from urllib3 import HTTPResponse  # FIX: import was split mid-line in the export


url = 'http://movie.douban.com'

ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

jurl = 'https://movie.douban.com/j/search_subjects'

d = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 10,
    'page_start': 10
}

# GET with a query string built by urlencode.
with urllib3.PoolManager() as http:
    response = http.request('GET', '{}?{}'.format(jurl, urlencode(d)),
                            headers={'User-agent': ua})
    print(response)
    print(response.status)
    print(response.data)

 

 

10、requests库

Requests使用了urllib3.

pip install requests

 

import urllib3
from urllib.parse import urlencode
from urllib3 import HTTPResponse  # FIX: import was split mid-line in the export
import requests


# url = 'http://movie.douban.com'

ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

jurl = 'https://movie.douban.com/j/search_subjects'

d = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 10,
    'page_start': 10
}

url = '{}?{}'.format(jurl, urlencode(d))

# requests.request lets the HTTP method be chosen at call time.
response = requests.request('GET', url, headers={'User-agent': ua})

with response:
    print(response.text)
    print(response.status_code)
    print(response.url)
    print(response.headers)
    print(response.request)

 

 

 

带会话的方式  session。

会把请求头等信息自动管理。

 

 

import urllib3
from urllib.parse import urlencode
from urllib3 import HTTPResponse  # FIX: import was split mid-line in the export
import requests


# url = 'http://movie.douban.com'

ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

jurl = 'https://movie.douban.com/j/search_subjects'

d = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 10,
    'page_start': 10
}

urls = ['https://www.baidu.com/s?wd=magedu', 'https://www.baidu.com/s?wd=magedu']

# A Session carries cookies and default headers across requests automatically —
# the second GET below sends back the cookies received from the first.
session = requests.Session()
with session:
    for url in urls:
        response = session.get(url, headers={'User-agent': ua})
        with response:
            print(1, response.text)
            print(2, response.status_code)
            print(3, response.url)
            print(4, response.headers)
            print(5, response.request.headers)
            print('--------')
            print(response.cookies)
            print('--------------')
            print(response.cookies)

 

11、特别注意

个别网站登录的时候cookie,登录的时候要把原来的cookie带回去,然后登录成功后其给你返回一个新的,否则不能进行相关操作。有些时候只是带一些cookie相关的值即可。

反爬措施:对于用户发起的请求来检测上一次是否访问的是我的网站。

 

在network的referer里面显示上一次访问网站的哪个一页。

 

Files:上传的文件内容。

 

路由器的将用户名和密码加密放在请求头里面。

 

 

Cert证书。

 

Requests基本功能:

 

 

 

import requests


# NOTE(review): real-looking credentials are hard-coded below — rotate them
# and load secrets from the environment instead of committing them.
# FIX: the UA literal was split across physical lines in the notes export.
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
      'Chrome/63.0.3239.26 Safari/537.36 Core/1.63.5514.400 QQBrowser/10.1.1660.400')

url = 'https://dig.chouti.com/login'

data = {
    'phone': '8618804928235',
    'password': 'tana248654',
    'oneMonth': '1'
}

# Step 1: plain GET to obtain the anonymous cookie the site expects back.
r1_urls = 'https://dig.chouti.com'
r1 = requests.get(url=r1_urls, headers={'User-Agent': ua})
# print(r1.text)
r1_cookie = r1.cookies.get_dict()
print('r1', r1.cookies)

# Step 2: log in, sending the pre-login cookie back with the POST.
response = requests.post(url, data, headers={'User-Agent': ua}, cookies=r1_cookie)
print(response.text)
print(response.cookies.get_dict())

# Step 3: vote — only the 'gpsd' cookie value is actually required here.
r3 = requests.post(url='https://dig.chouti.com/link/vote?linksId=21718341',
                   cookies={'gpsd': r1_cookie.get('gpsd')}, headers={'User-Agent': ua})
print(r3.text)

 

二、HTML解析

通过上面的库,可以拿到HTML内容。

 

1、Xpath

http://www.qutoric.com/xmlquire/

站点。

路径的遍历,查找到需要的内容。

 

 

2、lxml库

解析HTML的库。

https://lxml.de/

 

安装:

pip install lxml

 

 

爬取豆瓣网top10

import urllib3
from urllib.parse import urlencode
from urllib3 import HTTPResponse  # FIX: import was split mid-line in the export
import requests
from lxml import etree


# url = 'http://movie.douban.com'

ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

jurl = 'https://movie.douban.com/j/search_subjects'

d = {
    'type': 'movie',
    'tag': '热门',
    'page_limit': 10,
    'page_start': 10
}

urls = ['https://movie.douban.com/']

session = requests.Session()
with session:
    for url in urls:
        response = session.get(url, headers={'User-agent': ua})
        with response:
            content = response.text

        # Parse the fetched HTML and walk the billboard (top-10) table rows.
        html = etree.HTML(content)
        rows = html.xpath("//div[@class='billboard-bd']//tr")
        for t in rows:
            # Collect every text fragment inside the row, strip, and join.
            txt = t.xpath('.//text()')
            print(''.join(map(lambda x: x.strip(), txt)))
            # print(t)

 

 

3、beautifulsoup4

 

 

 

4、可以导航的string(navigablestring)

深度优先遍历。

 

Soup.findall().
Soup.findall(id =’header’)

 

 

 

5、css选择器

Soup.select          正则表达式
 
Pip install jsonpath.

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

from concurrent.futures import ThreadPoolExecutor
import threading
import time
from queue import Queue  # FIX: import was split mid-line in the export
import logging
import requests
from bs4 import BeautifulSoup


event = threading.Event()
# FIX: 'news.enblogs.com' was a typo — the CSS selector below ('h2.news_entry')
# targets the cnblogs news site.
url = 'https://news.cnblogs.com'
path = '/n/page/'
ua = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

urls = Queue()    # page URLs waiting to be fetched
htmls = Queue()   # fetched HTML documents waiting to be parsed
outps = Queue()   # (title, link) tuples waiting to be written out


def create_urls(start, stop, step=1):
    # Producer: enqueue the paginated listing URLs.
    for i in range(start, stop + 1, step):
        url1 = '{}{}{}/'.format(url, path, i)
        urls.put(url1)


def crawler():
    # Worker: fetch queued pages until the shutdown event is set.
    while not event.is_set():
        try:
            url1 = urls.get(True, 1)
            # BUG FIX: the original fetched the site root (`url`) instead of
            # the queued page (`url1`), so every page was the same.
            response = requests.get(url1, headers={'User-agent': ua})
            with response:
                html = response.text
                htmls.put(html)
        except Exception as e:
            print(1, e)


def parse():
    # Worker: parse fetched HTML and enqueue (title, absolute URL) pairs.
    while not event.is_set():
        try:
            html = htmls.get(True, 1)
            soup = BeautifulSoup(html, 'lxml')
            news = soup.select('h2.news_entry a')
            for n in news:
                txt = n.text
                url1 = url + n.attrs.get('href')
                outps.put((txt, url1))
        except Exception as e:
            print(e)


def save(path):
    # Worker: append results to a text file as they arrive.
    with open(path, 'a+', encoding='utf-8') as f:
        while not event.is_set():
            try:
                title, url1 = outps.get(True, 1)
                f.write('{}{}\n'.format(title, url1))
                f.flush()
            except Exception as e:
                print(e)


executor = ThreadPoolExecutor(max_workers=10)
executor.submit(create_urls, 1, 10)
executor.submit(parse)
executor.submit(save, 'c:/new.txt')

for i in range(7):
    executor.submit(crawler)

# Interactive shutdown: type 'q' to stop all workers.
while True:
    cmd = input('>>>')
    if cmd.strip() == 'q':
        event.set()
        executor.shutdown()
        print('close')
        # BUG FIX: time.sleep() requires an argument.
        time.sleep(1)
        break

 

三、动态网页处理

很多网站采用的是ajax技术,spa技术。部分内容都是异步加载的,提高用户体验。

 

1、phantomjs无头浏览器

 

http://phantomjs.org/

 

 

 

 

Xml http 与后端服务器建立的连接。

 

2、selenium

 

(1)自动化测试工具等,可以直接截图。模仿浏览器的行为等。

 

from selenium import webdriver
import datetime
import time
import random


# Headless PhantomJS browser (path to the local executable).
driver = webdriver.PhantomJS('c:/assets/phantomjs-2.1.1-windows/bin/phantomjs.exe')

driver.set_window_size(1024, 1024)
url = 'https://cn.bing.com/search?q=%E9%A9%AC%E5%93%A5%E6%95%99%E8%82%B2'
driver.get(url)


def savedic():
    # Save a timestamped screenshot; report (don't raise) driver errors.
    try:
        base_dir = 'C:/assets/'
        filename = '{}{:%Y%m%d%H%M%S}{}.png'.format(
            base_dir, datetime.datetime.now(), random.randint(1, 100))
        driver.save_screenshot(filename)
    except Exception as e:
        print(1, e)


# time.sleep(6)
# print('-------')
# savedic()

# Poll for the async result container instead of sleeping a fixed time.
MAXRETRIES = 5
while MAXRETRIES:
    try:
        ele = driver.find_element_by_id('b_results')
        print(ele)
        print('===========')
        savedic()
        break
    except Exception as e:
        print(e)
        print(type(e))
    time.sleep(1)
    MAXRETRIES -= 1

 

 

查找数据等,异步的方式。

 

 

 

(2)下拉框子使用,使用Select。

 

 

 

3、模拟键盘输入

 

模仿浏览器登录,先找到登录框的id,然后,setkeys。

 

之后返回登录后的网页。

 

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import random
import datetime


driver = webdriver.PhantomJS('c:/assets/phantomjs-2.1.1-windows/bin/phantomjs.exe')

driver.set_window_size(1024, 1024)

# NOTE(review): the target login-page URL was left blank in the notes —
# fill it in before running.
url = ''


def savedic():
    # Save a timestamped screenshot for debugging each step.
    try:
        base_dir = 'C:/assets/'
        filename = '{}{:%Y%m%d%H%M%S}{}.png'.format(
            base_dir, datetime.datetime.now(), random.randint(1, 100))
        driver.save_screenshot(filename)
    except Exception as e:
        print(1, e)


driver.get(url)
print(driver.current_url, 111111111111)
savedic()

# Fill in the login form by element id, then submit with ENTER.
email = driver.find_element_by_id('userMail')
passwed = driver.find_element_by_id('userPassword')

email.send_keys('604603701@qq.com')
passwed.send_keys('tana248654')
savedic()
passwed.send_keys(Keys.ENTER)

# Give the page time to navigate, then inspect the logged-in state.
time.sleep(2)
print(driver.current_url, 2222222222)
userinfo = driver.find_element_by_class_name('user-info')
print(userinfo.text)
time.sleep(2)
cookie = driver.get_cookies()
print(cookie)
savedic()

 

4、页面等待

(1)time.sleep

数据js加载需要一定的时间内。

 

线程休眠。

设置尝试的次数等

 

(2)selenium里面的wait

显示等待

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Explicit wait: poll up to 10 seconds for the element to be present.
try:
    email = WebDriverWait(driver, 10).until(
        EC.presence_of_all_elements_located((By.ID, 'userMail'))
    )
    savedic()
finally:
    driver.quit()

 

 

隐式等待(implicit wait)

driver.implicitly_wait(10)

 

 

总结:

 

 

四、scrapy框架

 

 

1、安装

pip install scrapy    可能报错;常见原因是 Twisted 依赖编译失败,解决办法是先手动下载 Twisted(tw 开头)的 .whl 文件,再用 pip 安装该 .whl,然后重新安装 scrapy。

 

2、使用

 

 

scrapy startproject scrapyapp   开启一个项目

 

scrapy genspider donz_spider dnoz.org  进入spider文件下创建一个新的模块,把要爬取的网站加到url列表中。

scrapy genspider -t basic dbbook douban.com   继承自baseic模板。内容少。

 

scrapy genspider -t crawl book douban.com   继承自crawl模板,内容多。

 

-t 后面加的是模板。  然后名字和网站

scrapy crawl donz_spider   运行代码,运行时候报错的话pip install pypiwin32

 

 

from scrapy.http.response.html import HtmlResponse

 

response 继承于HTMLResponse。

 

在item设置中设置要爬取的信息的类例如标题。

在spiders下的文件里面写爬虫的xpath,爬取的队列及爬取内容的匹配。

Middlewares里面是中间件。

Pipelines里面处理函数。

 

 

 

五、scrapy-redis组件

1、scrapy-redis使用

 

 

 

Pip install
scrapy_redis

 

使用redis作为队列需要的配置文件

Setting.py
# Scrapy settings for the scrapyapp project (scrapy-redis variant).
BOT_NAME = 'scrapyapp'

SPIDER_MODULES = ['scrapyapp.spiders']
NEWSPIDER_MODULE = 'scrapyapp.spiders'

# FIX: the UA literal was split across physical lines in the notes export.
USER_AGENT = ('Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 '
              '(KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36')

ROBOTSTXT_OBEY = False   # ignore robots.txt while testing
DOWNLOAD_DELAY = 1       # be polite: one second between requests
COOKIES_ENABLED = False

# Use redis as the shared scheduler queue and duplicate filter.
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"

# FIX: the next two assignments were fused onto the lines above by the
# export; restored as separate statements.
# Pipelines: project pipeline first, then push items into redis.
ITEM_PIPELINES = {
    'scrapyapp.pipelines.ScrapyappPipeline': 300,
    'scrapy_redis.pipelines.RedisPipeline': 543,
}

# Redis connection.
REDIS_HOST = '192.168.118.130'
REDIS_PORT = 6379

# LOG_LEVEL = 'DEBUG'
 
Spiders 下面的爬虫文件.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from scrapy_redis.spiders import RedisCrawlSpider
from ..items import MovieItem  # FIX: import was split mid-line in the export


class MoviecommentSpider(RedisCrawlSpider):
    """Distributed spider: start URLs are popped from a redis list."""
    name = 'moviecomment'
    allowed_domains = ['douban.com']
    # start_urls = ['http://douban.com/']
    # Redis key that seeds the crawl (LPUSH the first URL onto it).
    redis_key = 'moviecomment1:start_urls'

    # Follow pagination links like ...start=20 and parse each page.
    rules = (
        Rule(LinkExtractor(allow=r'start=\d+'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        # Extract every short-review text on the page and yield one item each.
        comment = '//div[@class="comment-item"]//span[@class="short"]/text()'
        reviews = response.xpath(comment).extract()
        for review in reviews:
            item = MovieItem()
            item['comment'] = review.strip()
            yield item
 
 
Item.py
import scrapy


# Item carrying one short movie review scraped from douban.
class MovieItem(scrapy.Item):

    # define the fields for your item here like:

    # name = scrapy.Field()

    comment = scrapy.Field()  # the review text (stripped)

 

 

 

redis数据中要设置一个key值和movecomment.py 中的redis_key = 'moviecomment1:start_urls'  设置value及初始的url值。

 

完成后数据库会存储响应的值

 

 

可以在 redis-cli 后面加上 --raw 参数,以正确显示中文内容。

 

2、分析

(1)jieba分词

Pip install jieba

 

 

(2)stopword停用词

数据清洗:把脏数据洗掉,检测出并除去数据中无效或者无关的数据,例如空值,非法值的检测,重复数据检测等。

 

 

(3)词云

Pip install
wordcloud
 
 
from redis import Redis
import json
import jieba


redis = Redis()

# Load the stop-word list (one word per line, GBK-encoded file).
stopwords = set()
# NOTE(review): the stop-word file path was left blank in the notes —
# fill it in before running.
with open('', encoding='gbk') as f:
    for line in f:
        print(line.rstrip('\r\n').encode())
        stopwords.add(line.rstrip('\r\n'))
print(len(stopwords))
print(stopwords)

# Pull all scraped items out of the redis list.
items = redis.lrange('dbreview:items', 0, -1)
print(type(items))

# Word-frequency count over the jieba-segmented review texts.
words = {}
for item in items:
    val = json.loads(item)['review']
    for word in jieba.cut(val):
        words[word] = words.get(word, 0) + 1
print(len(words))
print(sorted(words.items(), key=lambda x: x[1], reverse=True))

 

 

分词代码测试

 

六、scrapy项目

1、知识回顾

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

2、爬取技术网站

 

# Extract vote / bookmark counts; contains() copes with multi-valued class attrs.
# FIX: the xpath string literals were split across lines by the notes export.
praise_nums = response.xpath("//span[contains(@class, 'vote-post-up')]/text()").extract()
fav_nums = response.xpath("//span[contains(@class, 'bookmark-btn')]/text()").extract()
# match_re = re.match(".*(\d+).*", fav_nums)

 

class的值有多个的时候,使用container进行选取。

from scrapy.http import Request  #找到的url传递给下一级

from urllib import parse 
#提取下一页并交给scrapy下载

next_url =
response.xpath('//div[@class="navigation
margin-20"]/a[4]/@href').extract()

if next_url:

    yield Request(url=parse.urljoin(response.url, next_url), callback=self.parse)

 

 

(1)图片处理及存储:

pip install pillow
 
IMAGES_URLS_FIELD =
"front_image_url"

project_dir =
os.path.abspath(os.path.dirname(__file__))

IMAGES_STORE = os.path.join(project_dir, 'images')

 

 

 

 

(2)写入到本地文件:

# FIX: this class was pasted twice in the notes and several statements were
# split mid-line by the export; restored as a single valid definition.
class JsonWithEncodingPipeline(object):
    """Write each item as one JSON line to article.json (UTF-8)."""

    def __init__(self):
        # The handle stays open for the spider's lifetime.
        self.file = codecs.open('article.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # ensure_ascii=False keeps non-ASCII text (e.g. Chinese) readable.
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def spider_closed(self, spider):
        self.file.close()

 

 

 

scrapy自带的JsonItemExporter

 

(3)导出功能,还有csv文件等

 

class JsonItemExporterPipeline(object):
    '''
    Export items via scrapy's built-in JsonItemExporter.
    '''
    def __init__(self):
        # Binary mode: the exporter handles encoding itself.
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        # Close the JSON array and release the file handle.
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

 

 

(4)数据库插入操作

 

class MysqlPipeline(object):
    '''
    Synchronous MySQL pipeline: one INSERT + commit per item.
    '''
    def __init__(self):
        self.conn = MySQLdb.connect('192.168.118.131', 'wang', 'wang',
                                    'scrapy_jobbole', charset='utf8',
                                    use_unicode=True)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # Parameterized query: the driver escapes the values (no SQL injection).
        insert_sql = """
        insert into jobbole_article(title, url, create_date, fav_nums)
        values (%s, %s, %s, %s)
        """
        self.cursor.execute(insert_sql, (item['title'], item['url'],
                                         item['create_date'], item['fav_nums']))
        self.conn.commit()
        # BUG FIX: pipelines must return the item so later stages still see it.
        return item

 

 

 

(5)scrapy提供的异步方法

 

import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi  # FIX: import was split mid-line


class MysqlTwistedPipeline(object):
    '''
    Asynchronous MySQL pipeline using twisted's adbapi connection pool.
    '''
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # Read connection parameters from the scrapy settings object.
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        '''
        Schedule the insert asynchronously on the connection pool.
        '''
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        # BUG FIX: return the item so later pipeline stages still receive it.
        return item

    def handle_error(self, failure):
        '''
        Log insert failures.
        '''
        print(failure)

    def do_insert(self, cursor, item):
        '''
        Execute the actual INSERT (runs in the pool's worker thread).
        '''
        insert_sql = """
        insert into jobbole_article(title, url, create_date, fav_nums)
        values (%s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item['title'], item['url'],
                                    item['create_date'], item['fav_nums']))

 

 

 

(5)将django的model集成到scrapy   

Scrapy-djangoitem

 

 

 

(6)改变超多的xpath和css,使用itemloader

# Load the item through an ItemLoader instead of many xpath/css calls.
item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
# item_loader.add_css()
item_loader.add_xpath('title', '//div[@class="entry-header"]/h1/text()')

 

可以在item里面的field里面选择,

# Item whose fields run input processors at load time (via ItemLoader).
class ArticleItem(scrapy.Item):

    title = scrapy.Field(
        # applied to each extracted value before it is stored
        input_processor=MapCompose(add_jobbole)
    )

    create_date = scrapy.Field(
        input_processor=MapCompose(add_time)
    )

自定义输出:

class ArticleItemLoader(ItemLoader):
    # Custom item loader: keep only the first extracted value by default.
    # (FIX: the original comment was garbled into a stray `loader` token.)
    default_output_processor = TakeFirst()

 

 

pipeline后面的数值是优先级的问题

 

 

七、反爬虫策略

1、修改settings和middlewares文件

Setting里面设置一个user-agent-list的列表。

Middlewares里面设置

class RandomUserAgentMiddlware(object):
    '''
    Downloader middleware that sets a random User-Agent on every request,
    drawn from the user_agent_list setting.
    '''
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.user_agent_list = crawler.settings.get("user_agent_list", [])

    @classmethod
    def from_crawler(cls, crawler):
        # Scrapy entry point: build the middleware from the running crawler.
        return cls(crawler)

    def process_request(self, request, spider):
        from random import choice  # local import keeps the snippet self-contained
        # BUG FIX: the original called random() — undefined here and not a UA
        # string anyway; pick an actual agent from the configured pool.
        request.headers.setdefault('User-Agent', choice(self.user_agent_list))

 

2、随意更换user-agent 的库

>pip install fake-useragent
from fake_useragent import UserAgent


class RandomUserAgentMiddlware(object):
    '''
    Set a random User-Agent per request via the fake-useragent database.
    '''
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        # self.user_agent_list = crawler.settings.get("user_agent_list", [])
        self.ua = UserAgent()

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        # .random yields a fresh real-world agent string each access.
        request.headers.setdefault('User-Agent', self.ua.random)
class RandomUserAgentMiddlware(object):
    '''
    Random User-Agent middleware; the agent family is chosen by the
    RANDOM_UA_TYPE setting ('random', 'chrome', 'firefox', ...).
    '''
    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        # self.user_agent_list = crawler.settings.get("user_agent_list", [])
        self.ua = UserAgent()
        # Configuration entry: which UA family to draw from.
        # (FIX: stray prose was fused onto this line by the notes export.)
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        return cls(crawler)

    def process_request(self, request, spider):
        def get_ua():
            # getattr(self.ua, 'random'/'chrome'/...) returns an agent string.
            return getattr(self.ua, self.ua_type)
        request.headers.setdefault('User-Agent', get_ua())

随机选取一个user-agent

3、代理ip

普通ip代理

request.meta['proxy'] = "http://61.135.217.7:80"  #ip
代理

 

(1)直接设置普通ip

 

(2)首先爬取某代理网站的代理ip存入到数据库中,然后从数据库中找到数据,放到middlewares里面进行ip代理。

import requests
from scrapy.selector import Selector
import MySQLdb
import threading
from fake_useragent import UserAgent


conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='centos',
                       db='test', charset='utf8')
cour = conn.cursor()

ua = UserAgent()


def crawl_ips():
    # Scrape the xici free-proxy listing pages and store rows in MySQL.
    headers = {
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.2; Win64; x64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/69.0.3497.100 Safari/537.36'),
    }
    for i in range(3):
        re = requests.get('http://www.xicidaili.com/wt/{0}'.format(i),
                          headers=headers)

    # NOTE(review): this parsing block sits OUTSIDE the fetch loop above, so
    # only the last fetched page is parsed — confirm whether that is intended.
    seletor = Selector(text=re.text)
    all_trs = seletor.css('#ip_list tr')
    ip_list = []
    for tr in all_trs:
        speed_strs = tr.css(".bar::attr(title)").extract()
        if speed_strs:
            speed_str = speed_strs[0]

        all_texts = tr.css('td::text').extract()
        if all_texts:
            ip = all_texts[0]
            port = all_texts[1]
            proxy_type = all_texts[5]
            ip_list.append((ip, port, proxy_type, speed_str.split('秒')[0]))

        # NOTE(review): inserting inside the row loop re-inserts earlier rows
        # on every iteration — this likely belongs after the loop.
        for ip_info in ip_list:
            cour.execute(
                "insert xici_ip_list(ip, port, speed, proxy_type) "
                "VALUES('{0}', '{1}', '{2}', '{3}')".format(
                    ip_info[0], ip_info[1], ip_info[3], ip_info[2])
            )
            conn.commit()
            print('数据库写入完成')


# crawl_ips()



class GetIP(object):
    """Pick a random working proxy from the xici_ip_list table."""

    def delete_ip(self, ip):
        # Remove a proxy that failed validation.
        delete_sql = """
        delete from xici_ip_list where ip='{0}'
        """.format(ip)
        cour.execute(delete_sql)
        conn.commit()
        return True

    def judge_ip(self, ip, port):
        # Probe the proxy with a cheap GET; drop it from the DB on failure.
        # BUG FIX: 'ww.baidu.com' was a typo for 'www.baidu.com'.
        http_url = 'http://www.baidu.com'
        proxy_url = 'http://{}:{}'.format(ip, port)
        try:
            proxy_dict = {
                'http': proxy_url
            }
            response = requests.get(http_url, proxies=proxy_dict)
        except Exception as e:
            print('invalid ip and port')
            self.delete_ip(ip)
            return False
        else:
            code = response.status_code
            if code >= 200 and code < 300:
                # BUG FIX: message typo 'eddective' corrected.
                print('effective ip')
                return True
            else:
                print('invalid ip and port')
                self.delete_ip(ip)
                return False

    def get_random_ip(self):
        # Fetch one random row from the DB and validate it; retry recursively
        # until a working proxy is found.
        sql = """
        SELECT ip, port FROM xici_ip_list
        ORDER BY RAND()
        LIMIT 1
        """
        result = cour.execute(sql)
        for ip_info in cour.fetchall():
            ip = ip_info[0]
            port = ip_info[1]
            judge_ip = self.judge_ip(ip, port)
            if judge_ip:
                return "http://{0}:{1}".format(ip, port)
            else:
                # NOTE(review): unbounded recursion if no proxy ever validates.
                return self.get_random_ip()


# t = threading.Thread(target=crawl_ips)
# t.start()

get_ip = GetIP()

get_ip.get_random_ip()
class RandomProxyMiddleware(object):
    # Dynamically assign a proxy to every outgoing request.
    def process_request(self, request, spider):
        get_ip = GetIP()
        # (FIX: a stray token split off the trailing comment in the export.)
        request.meta['proxy'] = get_ip.get_random_ip()  # proxy for this request

 

(3)插件化scrapy-proxies

https://github.com/aivarsk/scrapy-proxies/blob/master/scrapy_proxies

 

(4)scrapy-crawlera

收费版本

(5)tor洋葱网络

https://github.com/aivarsk/scrapy-proxies/blob/master/scrapy_proxies

稳定版本

 

八、验证码识别

1、验证码识别方法

编码实现:tesseract-ocr(开源 OCR 引擎)

 

在线打码

http://www.yundama.com/

 

人工打码