Crawling Target

The target is a book website at https://spa5.scrape.center/. The site contains information on several thousand books; the pages are rendered with JavaScript and the data is retrieved through an Ajax API, and the API has no anti-crawling measures or encrypted parameters. After loading, the page looks like the figure below:

[Figure: the book list page after it finishes loading]

Also, since the site holds a fairly large amount of data, an asynchronous approach is used to crawl it.
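
Before writing the full crawler, it is worth confirming what the list API returns. The sketch below is a minimal probe, assuming the response is JSON with a results list (which the crawler further down relies on); the count field name is an assumption and may need adjusting if the actual response differs.

import asyncio
import aiohttp


async def probe():
    # first page of the list API: 18 books starting at offset 0
    url = 'https://spa5.scrape.center/api/book/?limit=18&offset=0'
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as response:
            data = await response.json()
    # 'results' is what the crawler below iterates over; 'count' is assumed to hold the total number of books
    results = data.get('results') or []
    print('total (assumed field):', data.get('count'))
    print('books on this page:', len(results))
    print('first record:', results[0] if results else None)


asyncio.run(probe())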

Crawling Steps

  • Analyze how the page data is loaded
  • Use aiohttp to crawl the data from the Ajax API
  • Save each book's information into a MySQL database (a one-time database setup sketch follows this list)
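
One prerequisite for the last step: the script below connects to a MySQL database named spiders and creates its own table, but it assumes that database already exists. Here is a minimal one-time setup sketch, assuming a local MySQL server and the same credentials used in the script:

import pymysql

# one-time setup: create the 'spiders' database that the crawler connects to
# (host/user/password mirror the values used in the crawler below; change them to match your server)
db = pymysql.Connect(host="localhost", user="root", password="zhy123", port=3306)
cursor = db.cursor()
cursor.execute("create database if not exists spiders default character set utf8mb4")
db.close()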

Code Example

# -*- coding: utf-8 -*-
"""
@File:aiohttp_p207.py
@Description:
@Author:echohye
@Date:2022/02/01 15:09
"""
import asyncio
import json
import aiohttp
import logging
import pymysql

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')
INDEX_URL = 'https://spa5.scrape.center/api/book/?limit=18&offset={offset}'
DETAIL_URL = 'https://spa5.scrape.center/api/book/{ide}'
PAGE_SIZE = 18
PAGE_NUMBER = 60
CONCURRENCY = 20

semaphore = asyncio.Semaphore(CONCURRENCY)  # limit the number of concurrent requests
session = None


# fetch and parse JSON from an API URL, limited by the semaphore
async def scrape_api(url):
    async with semaphore:
        try:
            logging.info("scraping %s", url)
            async with session.get(url) as response:
                return await response.json()
        except aiohttp.ClientError:
            logging.error("error occurred while scraping %s", url, exc_info=True)


# build the index API URL for a given page and scrape it
async def scrape_index(page):
    url = INDEX_URL.format(offset=PAGE_SIZE * (page - 1))
    return await scrape_api(url)


# build the detail API URL for a given book id and scrape it
async def scrape_detail(ide):
    url = DETAIL_URL.format(ide=ide)
    return await scrape_api(url)


# main crawl coroutine: scrape all index pages first, then every book's detail page
async def scrape_main():
    global session
    session = aiohttp.ClientSession()
    scrape_index_tasks = [asyncio.ensure_future(scrape_index(page)) for page in range(1, PAGE_NUMBER + 1)]
    results = await asyncio.gather(*scrape_index_tasks)
    logging.info("result %s", json.dumps(results, ensure_ascii=False, indent=2))

    # collect every book id from the index pages, skipping pages that failed to load
    ids = []
    for index_data in results:
        if not index_data:
            continue
        for item in index_data.get('results') or []:
            ids.append(item.get('id'))
    scrape_detail_tasks = [asyncio.ensure_future(scrape_detail(ide)) for ide in ids]
    books = await asyncio.gather(*scrape_detail_tasks)
    logging.info("book %s", json.dumps(books, ensure_ascii=False, indent=2))
    await session.close()
    return books


# save the crawled book records into MySQL
def save_data(books: list):
    db = pymysql.Connect(host="localhost", user="root", password="zhy123", port=3306, db="spiders")
    cursor = db.cursor()
    table_sql = 'create table if not exists aiohttp_P207(id varchar(255) not null,name varchar(255) not null,authors varchar(255),' \
                'translators varchar(255),publisher varchar(255),tags varchar(255),url varchar(255),isbn varchar(255),' \
                'cover varchar(255),page_number varchar(255),price varchar(255),score varchar(255),' \
                'published_at varchar(255),updated_at varchar(255))'
    cursor.execute(table_sql)
    fields = ['id', 'name', 'authors', 'translators', 'publisher', 'tags', 'url', 'isbn',
              'cover', 'page_number', 'price', 'score', 'published_at', 'updated_at']
    insert_sql = 'insert into aiohttp_P207(id,name,authors,translators,publisher,tags,url,isbn,cover,page_number,price,score,' \
                 'published_at,updated_at) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
    for book in books:
        if not book:
            continue
        try:
            # parameterized insert; list fields such as authors, translators and tags are stored as their string form
            values = [str(book.get(field)) if book.get(field) is not None else None for field in fields]
            cursor.execute(insert_sql, values)
            db.commit()
            logging.info("DataBase book %s", book.get('id'))
        except Exception as e:
            db.rollback()
            print(e.args)
    db.close()


def main():
    contents = asyncio.get_event_loop().run_until_complete(scrape_main())
    save_data(contents)


if __name__ == '__main__':
    main()

[Figure: console log of the crawled results in JSON form]
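
To confirm that the data actually landed in MySQL, a quick read-back sketch can be run after the crawl; it reuses the connection parameters and table name from the script above.

import pymysql

# quick check: count the saved rows and print one record
db = pymysql.Connect(host="localhost", user="root", password="zhy123", port=3306, db="spiders")
cursor = db.cursor()
cursor.execute("select count(*) from aiohttp_P207")
print("rows saved:", cursor.fetchone()[0])
cursor.execute("select id, name, score from aiohttp_P207 limit 1")
print(cursor.fetchone())
db.close()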

Motto: these things really are quite simple. Don't complain that you can't learn them; it's because you haven't put in enough care.