Crawling Target

The target is a book website at https://spa5.scrape.center/. The site contains information on several thousand books; the pages are rendered with JavaScript and the data is retrieved through an Ajax API, and the API has no anti-crawling measures or encrypted parameters. After loading, the page looks like the figure below:

[Figure: the book list page after it finishes loading]

Also, since the site holds a fairly large amount of data, an asynchronous approach is used to crawl it.
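
Before writing the full crawler, it is worth confirming what the list API returns. The sketch below is a minimal probe, assuming the response is JSON with a results list (which the crawler further down relies on); the count field name is an assumption and may need adjusting if the actual response differs.

import asyncio
import aiohttp


async def probe():
    # first page of the list API: 18 books starting at offset 0
    url = 'https://spa5.scrape.center/api/book/?limit=18&offset=0'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.json()
    # 'results' is what the crawler below iterates over; 'count' is assumed to hold the total number of books
    results = data.get('results') or []
    print('total (assumed field):', data.get('count'))
    print('books on this page:', len(results))
    print('first record:', results[0] if results else None)


asyncio.run(probe())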

Crawling Steps

  • Analyze how the page data is loaded
  • Use aiohttp to crawl the data from the Ajax API
  • Save each book's information into a MySQL database (a one-time database setup sketch follows this list)
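
One prerequisite for the last step: the script below connects to a MySQL database named spiders and creates its own table, but it assumes that database already exists. Here is a minimal one-time setup sketch, assuming a local MySQL server and the same credentials used in the script:

import pymysql

# one-time setup: create the 'spiders' database that the crawler connects to
# (host/user/password mirror the values used in the crawler below; change them to match your server)
db = pymysql.Connect(host="localhost", user="root", password="zhy123", port=3306)
cursor = db.cursor()
cursor.execute("create database if not exists spiders default character set utf8mb4")
db.close()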

Code Example

# -*- coding: utf-8 -*-
"""
@File:aiohttp_p207.py
@Description:
@Author:echohye
@Date:2022/02/01 15:09
"""
import asyncio
import json
import aiohttp
import logging
import pymysql

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
INDEX_URL = 'https://spa5.scrape.center/api/book/?limit=18&offset={offset}'
DETAIL_URL = 'https://spa5.scrape.center/api/book/{ide}'
PAGE_SIZE = 18
PAGE_NUMBER = 60
CONCURRENCY = 20

semaphore = asyncio.Semaphore(CONCURRENCY)  # limit the number of concurrent requests
session = None


# fetch and parse JSON from an API URL, limited by the semaphore
async def scrape_api(url):
    async with semaphore:
        try:
            logging.info("scraping %s", url)
            async with session.get(url) as response:
                return await response.json()
        except aiohttp.ClientError:
            logging.error("error occurred while scraping %s", url, exc_info=True)


# build the index API URL for a given page and scrape it
async def scrape_index(page):
    url = INDEX_URL.format(offset=PAGE_SIZE * (page - 1))
    return await scrape_api(url)


# build the detail API URL for a given book id and scrape it
async def scrape_detail(ide):
    url = DETAIL_URL.format(ide=ide)
    return await scrape_api(url)


# main crawl coroutine: scrape all index pages first, then every book's detail page
async def scrape_main():
    global session
    session = aiohttp.ClientSession()
    scrape_index_tasks = [asyncio.ensure_future(scrape_index(page)) for page in range(1, PAGE_NUMBER + 1)]
    results = await asyncio.gather(*scrape_index_tasks)
    logging.info("result %s", json.dumps(results, ensure_ascii=False, indent=2))

    # collect every book id from the index pages, skipping pages that failed to load
    ids = []
    for index_data in results:
        if not index_data:
            continue
        for item in index_data.get('results') or []:
            ids.append(item.get('id'))
    scrape_detail_tasks = [asyncio.ensure_future(scrape_detail(ide)) for ide in ids]
    books = await asyncio.gather(*scrape_detail_tasks)
    logging.info("book %s", json.dumps(books, ensure_ascii=False, indent=2))
    await session.close()
    return books


# save the crawled book records into MySQL
def save_data(books: list):
    db = pymysql.Connect(host="localhost", user="root", password="zhy123", port=3306, db="spiders")
    cursor = db.cursor()
    table_sql = 'create table if not exists aiohttp_P207(id varchar(255) not null,name varchar(255) not null,authors varchar(255),' \
                'translators varchar(255),publisher varchar(255),tags varchar(255),url varchar(255),isbn varchar(255),' \
                'cover varchar(255),page_number varchar(255),price varchar(255),score varchar(255),' \
                'published_at varchar(255),updated_at varchar(255))'
    cursor.execute(table_sql)
    fields = ['id', 'name', 'authors', 'translators', 'publisher', 'tags', 'url', 'isbn',
              'cover', 'page_number', 'price', 'score', 'published_at', 'updated_at']
    insert_sql = 'insert into aiohttp_P207(id,name,authors,translators,publisher,tags,url,isbn,cover,page_number,price,score,' \
                 'published_at,updated_at) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    for book in books:
        if not book:
            continue
        try:
            # parameterized insert; list fields such as authors, translators and tags are stored as their string form
            values = [str(book.get(field)) if book.get(field) is not None else None for field in fields]
            cursor.execute(insert_sql, values)
            db.commit()
            logging.info("DataBase book %s", book.get('id'))
        except Exception as e:
            db.rollback()
            print(e.args)
    db.close()


def main():
    contents = asyncio.get_event_loop().run_until_complete(scrape_main())
    save_data(contents)


if __name__ == '__main__':
    main()

[Figure: console log of the crawled results in JSON form]
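
To confirm that the data actually landed in MySQL, a quick read-back sketch can be run after the crawl; it reuses the connection parameters and table name from the script above.

import pymysql

# quick check: count the saved rows and print one record
db = pymysql.Connect(host="localhost", user="root", password="zhy123", port=3306, db="spiders")
cursor = db.cursor()
cursor.execute("select count(*) from aiohttp_P207")
print("rows saved:", cursor.fetchone()[0])
cursor.execute("select id, name, score from aiohttp_P207 limit 1")
print(cursor.fetchone())
db.close()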

Motto: these things really are quite simple. Don't complain that you can't learn them; it's because you haven't put in enough care.