前几天爬取豆瓣的时候,以为豆瓣是没有反爬的,直到碰到了豆瓣阅读...

这里是官网:https://read.douban.com/ebooks/?dcs=original-featured&dcm=normal-nav

需求:爬取里面13个图书类别,每个类别500本,最后保存到excel表中

Java selenium取指定异步request的response selenium异步爬取_xml

这是包含书本链接url的负载 ,如果有大佬可以逆向出来,就可以不用selenium

 用到的工具:asyncio,aiohttp,time,openpyxl,lxml,selenium

import asyncio
import aiohttp
import time
import openpyxl
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By

先导入包

def chu(q):
    url = []
    option = webdriver.ChromeOptions()
    option.add_experimental_option('detach', True)
    driver = webdriver.Chrome(chrome_options=option)#初始化一个Chrome对象
    driver.get('https://read.douban.com/ebooks/?dcs=original-featured&dcm=normal-nav')
    #请求url
    time.sleep(2)
    #等待页面加载

    page = driver.page_source
    tree = etree.HTML(page)#解析页面源代码
    #豆瓣会对频繁访问的ip进行登陆验证,所以这里使用try防止报错
    try:
        #如果没有登陆验证的话
        lei_url = tree.xpath('//ul[@class="list kinds-list tab-panel"]//a/@href')
        #获取每个类别的url

        lei = driver.find_element(By.XPATH,f'//ul[@class="list kinds-list tab-panel"]/li/a[@href="{lei_url[q]}"]')

        lei.click()
        #模拟点击,进入类别相应的页面

    except:
        #如果有登录验证的话
        login = input('请在网页上进行登录,登录完成后输入1:')
        #手动登录,因为需要滑块验证码,本人技术层次没那么高
        time.sleep(2)
        #等待页面加载
        lei_url = tree.xpath('//ul[@class="list kinds-list tab-panel"]//a/@href')
        

        lei = driver.find_element(By.XPATH, f'//ul[@class="list kinds-list tab-panel"]/li/a[@href="{lei_url[q]}"]')
        lei.click()
        #和上面同样的做法
    count = 1
    while count <31:
        time.sleep(2)
           #等待页面加载
        page = driver.page_source
        tree = etree.HTML(page)
        #获取每本书的url
        print('正在抓取url')
        url.append(tree.xpath('//ul[@class="works-list"]/li/@to'))
        time.sleep(1)
        driver.find_element(By.XPATH,'//div[@class="paginator-full"]//a[@class="page-next"]').click()
        #模拟点击,进入下一页,循环,直到满30页
        count += 1

    driver.quit()
    return url

selenium模拟采集url,返回的数据是一个列表,每20个url为一个元素,方便后面进行异步操作。

接下来进行对书本信息进行采集

book_page = []
#用来存储书本信息
async def pa_book(url#传入url,q#判断书本类别):
#使用异步函数,大大节省运行时间
    count = 0
    b = []
    header = {
        'User-Agent': '你的ua',
        'Cookie': '浏览器里自行复制'
    }#豆瓣cookie是固定的,我们复制下来一段,保持登陆状态
    async with aiohttp.ClientSession() as session:
        #初始化一个session对象
        async with await session.get(url=url,headers=header) as resp:
            #对页面发起请求
            tree = etree.HTML(await resp.text())
            #用xpath进行解析
            try:
                #因为有些书本信息不全,防止报错,我们直接使用try
                o = 0
                zuozhe = tree.xpath('//div[@class="article-meta"]//text()')
                prise = tree.xpath('//s[@class="original-price-count"]/text()')[0]
                mingzi = tree.xpath('//span[@itemprop="itemListElement"]//span[@itemprop="name"]/text()')[0]
                pingjia = tree.xpath('//span[@class="amount"]/text()')[0]
                #存入信息
                b.append(mingzi)
                b.append(mane[q])
                b.append(zuozhe[zuozhe.index('类别') + 1])
                b.append(zuozhe[zuozhe.index('作者') + 1])
                b.append(zuozhe[zuozhe.index('出版社') + 1])
                b.append(zuozhe[zuozhe.index('提供方') + 1])
                b.append(zuozhe[zuozhe.index('字数') + 1])
                b.append(zuozhe[zuozhe.index('ISBN') + 1])

                b.append(pingjia)
                b.append(prise)
                book_page.append(b)
            except:
                #跳过不全的书本
                count += 1

这样书本信息就采集好了,接下来存入excel表中

我们需要准备一个名称为书籍.xlsx.的文件不然会报错

def cun():
    w = 2
    wb = openpyxl.load_workbook('书籍.xlsx')
    ws = wb['Sheet1']
    top = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    for o in book_page:


        count = 0
        for j,i in zip(o,top):
            ws[f'{i}{w}'] = j
            count +=1
        w = w+1
    #这个很简单写写循环就可以了
    wb.save('书籍.xlsx')

最后一步代码运行

for q in range(14):
    p = 1
    for url in chu(q):
        task = []

        for i in url:
            task.append(pa_book(f'https://read.douban.com{i}',q))
            #注册进循环里
        print(f'正在爬取第{p}页------------')
        p += 1
        loop = asyncio.get_event_loop()

        loop.run_until_complete(asyncio.wait(task))
        #运行
        time.sleep(2)
        #等待2秒防止访问被拒
        print('success')
cun()
#存入数据

完整代码实现

import asyncio
import aiohttp
import time
import openpyxl
from lxml import etree
from selenium import webdriver
from selenium.webdriver.common.by import By
start_time = time.time()
mane = ['小说', '文学', '人文社科', '经济管理', '科技科普', '计算机与互联网', '成功励志', '生活', '少儿', '艺术设计', '漫画绘本', '教育考试', '杂志']
def chu(q):
    url = []
    option = webdriver.ChromeOptions()
    option.add_experimental_option('detach', True)
    driver = webdriver.Chrome(chrome_options=option)
    driver.get('https://read.douban.com/ebooks/?dcs=original-featured&dcm=normal-nav')

    time.sleep(2)


    page = driver.page_source
    tree = etree.HTML(page)
    try:
        lei_url = tree.xpath('//ul[@class="list kinds-list tab-panel"]//a/@href')
        lei_name = tree.xpath('//ul[@class="list kinds-list tab-panel"]//a/text()')

        lei = driver.find_element(By.XPATH,f'//ul[@class="list kinds-list tab-panel"]/li/a[@href="{lei_url[q]}"]')
        lei.click()

    except:
        login = input('请在网页上进行登录,登录完成后输入1:')
        time.sleep(2)
        lei_url = tree.xpath('//ul[@class="list kinds-list tab-panel"]//a/@href')
        lei_name = tree.xpath('//ul[@class="list kinds-list tab-panel"]//a/text()')

        lei = driver.find_element(By.XPATH, f'//ul[@class="list kinds-list tab-panel"]/li/a[@href="{lei_url[q]}"]')
        lei.click()
    count = 1
    while count <9:
        time.sleep(2)
        page = driver.page_source
        tree = etree.HTML(page)
        print('正在抓取url')
        url.append(tree.xpath('//ul[@class="works-list"]/li/@to'))
        time.sleep(1)
        driver.find_element(By.XPATH,'//div[@class="paginator-full"]//a[@class="page-next"]').click()
        count += 1

    driver.quit()
    return url
book_page = []
async def pa_book(url,q):
    count = 0
    b = []
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36',
        'Cookie': 'bid=eLRBZmbY55Y; dbcl2="267875740:kPdFiwjWprY"; ck=ijB_; _ga=GA1.3.133779158.1679229659; _gid=GA1.3.10458227.1679229659; _pk_ref.100001.a7dd=%5B%22%22%2C%22%22%2C1679229659%2C%22https%3A%2F%2Faccounts.douban.com%2F%22%5D; _pk_ses.100001.a7dd=*; _ga=GA1.1.133779158.1679229659; __gads=ID=de972e154c2fef7a-22a0163bd5de0064:T=1679229662:RT=1679229662:S=ALNI_MaXNgyOFMsVTnhbnDSgMLFd_tnQ5w; __gpi=UID=00000bdc08bc9484:T=1679229662:RT=1679229662:S=ALNI_MY0Iz9JEbO8VuYeFJ3xJQOu75jnQw; _ga_RXNMP372GL=GS1.1.1679229658.1.1.1679229769.56.0.0; _pk_id.100001.a7dd=d883665af68561be.1679229659.1.1679229771.1679229659.'
    }
    async with aiohttp.ClientSession() as session:
        async with await session.get(url=url,headers=header) as resp:

            tree = etree.HTML(await resp.text())
            try:
                o = 0
                zuozhe = tree.xpath('//div[@class="article-meta"]//text()')
                prise = tree.xpath('//s[@class="original-price-count"]/text()')[0]
                mingzi = tree.xpath('//span[@itemprop="itemListElement"]//span[@itemprop="name"]/text()')[0]
                pingjia = tree.xpath('//span[@class="amount"]/text()')[0]
                b.append(mingzi)
                b.append(mane[q])
                b.append(zuozhe[zuozhe.index('类别') + 1])
                b.append(zuozhe[zuozhe.index('作者') + 1])
                b.append(zuozhe[zuozhe.index('出版社') + 1])
                b.append(zuozhe[zuozhe.index('提供方') + 1])
                b.append(zuozhe[zuozhe.index('字数') + 1])
                b.append(zuozhe[zuozhe.index('ISBN') + 1])

                b.append(pingjia)
                b.append(prise)
                book_page.append(b)
            except:
                count += 1

def cun():
    w = 2
    wb = openpyxl.load_workbook('书籍.xlsx')
    ws = wb['Sheet1']
    top = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J']
    for o in book_page:


        count = 0
        for j,i in zip(o,top):
            ws[f'{i}{w}'] = j
            count +=1
        w = w+1
    wb.save('书籍.xlsx')

for q in range(14):
    p = 1
    for url in chu(q):
        task = []

        for i in url:
            task.append(pa_book(f'https://read.douban.com{i}',q))
        print(f'正在爬取第{p}页------------')
        p += 1
        loop = asyncio.get_event_loop()

        loop.run_until_complete(asyncio.wait(task))
        time.sleep(2)
        print('success')
cun()
end_time = time.time()
run_time = -start_time+end_time
print(f'本次运行时间为{run_time}s')

我加了一个运行时间显示

注意:1.代码运行过程中如果停止那么信息会全部丢失,前功尽弃,可以通过修改循环次数来控制爬取量

2.还是会有封ip的风险,如果没有爬过豆瓣的可以一次性爬完,可以加代理ip

本人爬虫萌新,纯粹练手,如有侵权请告知,希望大佬们多提建议