笔趣阁有很多站点,因为本人最近在看一世之尊,因此想尝试在笔趣阁上爬取全本。
从该页面中可以找到各个章节对应的url,需要将其保存在一个列表中,通过遍历爬取全部章节。
通过F12调试界面可以看到,其html代码,url均在<div id=list> 中, 或者在<a href="xxx"> 中,抛弃首尾的冗余,可以得到目标章节url。
通过观察内容页面,发现爬取很简单,文字都在<div id=content> 中,奈何遇到了一个困难。
问题一:爬取文字过程中发现print出来仅有最后一行,其余文字均消失。猜测如下:1.网页反爬程序,返回假页面。2.文字由js代码生成,并不是静态页面。 3.编解码格式问题
猜测一:构建user-agent、cookies、referer,更换ip等操作均无效,只好暂时放弃
猜测二:通过F12观察Network面板中的XHR请求数据,并没有发现目标数据。再通过禁用网页JavaScript,发现文字仍然显示。显示网页源码,也能直接观测到文字。种种迹象表明,猜测二错误
猜测三:通过.find("div", id="content").text 获取内容,发现文字成功显示
再就是格式调整,爬取的内容中含有一些广告、多余换行。通过split()可以快速去除首尾空格、换行符
通过多线程加快爬取速度,但是爬取速度过快的话会导致重连超时...
import time
import requests
from bs4 import BeautifulSoup
import random
import threading
import os
# Index page of the target novel ("一世之尊") on vipxs.la; all chapter links
# are scraped from this page.
url = "https://www.vipxs.la/0_740/"
# Pool of User-Agent strings; one is chosen at random per request so the
# traffic looks less like a single bot.
headers = [ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
"Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
"Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
"Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
]
# Referer header for the index request — mimics in-site navigation.
referer = "https://www.vipxs.la/"
# Raw Cookie header captured from a real browser session (F12 -> Network);
# sent with every chapter request as part of the anti-anti-crawler measures
# described in the notes above. NOTE(review): these values are session-bound
# and will expire — refresh from the browser if requests start failing.
cookies = {'Cookie':"UM_distinctid=17f533be31115db-00017a4b760e46-977173c-144000-17f533be312c6c; Hm_lvt_8744b58bc1913cae0d8c4dc68f187d61=1646368908,1646368915,1646450962; CNZZDATA1280571925=488337212-1646363111-https%3A%2F%2Fwww.baidu.com%2F|1646449511; coupletAlllength=5; CNZZDATA1280571999=226664088-1646363299-https%3A%2F%2Fwww.baidu.com%2F|1646449699; Hm_lvt_b48494e860b198c9c71009978cfc755e=1646368908,1646368915,1646450962; fixedalllength=9; Hm_lvt_2d2ceac9af7f7f1a8dbdd51db6dbf36c=1646368908,1646368915,1646450962; 5531_2603_27.38.254.113=1; CNZZDATA1280572003=1348579129-1646364283-https%3A%2F%2Fwww.baidu.com%2F|1646450689; fixedall1length=8; CNZZDATA1280572006=1040752295-1646363481-https%3A%2F%2Fwww.baidu.com%2F|1646449881; Hm_lvt_dd3a5d36b1adfd567e4b8290c0760ba3=1646368908,1646368915,1646450963; clickbids=740; Hm_lvt_4d0a92fe9eb4da3973f356b734b334b6=1646368908,1646368915,1646450963; img3002500length=6; 5531_2570_27.38.254.113=1; Hm_lvt_4ad6b1a6d9755b262a181c469db16477=1646368913,1646450973; 5531_2444_27.38.254.113=1; 5531_2409_27.38.254.113=1; 5531_2403_27.38.254.113=1; CNZZDATA1280572013=42055633-1646365418-https%3A%2F%2Fwww.vipxs.la%2F|1646451818; 5531_2334_27.38.254.113=1; Hm_lpvt_4d0a92fe9eb4da3973f356b734b334b6=1646452229; 5531_2578_27.38.254.113=1; 5531_2563_27.38.254.113=1; coupletAll=1_0_4_3; fixedall=8_3_0_1_7; img3002500=1; fixedall1=6_2_5_7; richviews_5531=cRE9U1a3frz1iDNhc0K7SiahoOmhFed824EmDGllfAcca2YveADIUUZ4RaxXDzxli%2FHutkjPerP9wyrRHpug%2Fk%2B%2FXdViyzcaXEypaCEzuSyrbR9rvqKz9%2B81xBsynM6omYQw9eI3x0PEJ%2FmAv2AsKOY21ere%2Bf4rafFzUUOPSOxxXLwHf95U1sXNnYeOhr9bO8C3j36sy1MkcP77Qh9gspMwrZ4H0%2BfU6rnQPrHZ6CK1hXCb3tiIf6xo6FBRjO%2FgqIO%2FHDGk%2B1CM818cVCaBZ9Fs2LSVVUS7O%2Fa2SrNL7cJPFab2Bk%2FdLithl3nVy4MBs%2B4zlOoKCBlJgo7%2FgZ81Jo%2Bm9L%2BXWpWErQB%2FSEXRAoUVYIQ6TruK8dqMZPqQCUVJHqUtXDu0NCqW2r0KinusY8Rc5tlzdayjPWF%2F7yNEwsGb0LVYWk4Q9Atf4lHmt14iY9b4O0MLPZwckbtZ4IIY7SbW5yOn%2FHtyaJS0EvjOpW%2B7KS%2FVZ4LfxkwzbquJANRA7nHhVOMkUt9ldFOqcIaZB67%2BPHDwub0o4cfyKyi%2BaU2jOkmnnKxpRwFAjQEVF0Dd5m6T0xUCN9SL04vmT%2FQEHg47z0NyL9txUFInfFU7qhlGzFUKpoTbqzAogzKRVn1N%2BItSh1Atqcme8eLqzr%2BTw1grq7Dkbn9f52e47o%2FEl38%3D; Hm_lpvt_8744b58bc1913cae0d8c4dc68f187d61=1646452466; Hm_lpvt_b48494e860b198c9c71009978cfc755e=1646452466; Hm_lpvt_2d2ceac9af7f7f1a8dbdd51db6dbf36c=1646452466; Hm_lpvt_dd3a5d36b1adfd567e4b8290c0760ba3=1646452466; Hm_lpvt_4ad6b1a6d9755b262a181c469db16477=1646452466"}
# Fetch the index page once and extract every chapter URL from it.
res = requests.get(url, headers={"User-Agent": random.choice(headers), "Referer": referer})
bsobj = BeautifulSoup(res.content, 'lxml')  # parse the raw HTML for easy traversal
# The first 35 and last 10 <a> tags are navigation/ad links (see notes above);
# everything in between is a relative chapter URL.
chapter_list = [anchor.get('href') for anchor in bsobj.find_all("a")[35:-10]]
def pachong(chapter_list, name):
    """Crawl every chapter in chapter_list and append the text to "{name}.txt".

    Parameters
    ----------
    chapter_list : list[str]
        Relative chapter paths scraped from the index page (e.g. "/0_740/xxx.html").
    name : str
        Output file stem; chapters are appended to "{name}.txt" in list order.

    Fixes vs. original: the bare ``except:`` is narrowed to
    ``requests.RequestException``; the ``exit()`` call (which in a worker
    thread only raises SystemExit in that thread, it never "ends the
    program") is replaced by an explicit ``return``; and a response that
    never reached status 200 no longer falls through to parsing, which
    previously crashed with AttributeError when ``find`` returned None on
    an error page.
    """
    for path in chapter_list:
        url = "https://www.vipxs.la" + path
        max_retry = 15
        res2 = None
        for n in range(max_retry):
            # Random delay: the site drops connections when hit too fast
            # (see the notes at the top of the file).
            time.sleep(random.randint(1, 10))
            try:
                res2 = requests.get(url, headers={'User-Agent': random.choice(headers)}, cookies=cookies, timeout=15)
                if res2.status_code == 200:
                    break
            except requests.RequestException:  # only network errors are retryable
                print(f"connect error, retry times : {n+1}")
        else:
            # Every retry failed (exception or non-200 response) — give up
            # on this worker instead of parsing a bad/absent page.
            print("重连次数过多,自动结束程序")
            return
        bsobj2 = BeautifulSoup(res2.content, 'lxml')  # parse the chapter page
        content_div = bsobj2.find("div", id="content")
        title_tag = bsobj2.find("h1")
        if content_div is None or title_tag is None:
            # Likely an anti-crawler placeholder page; skip this chapter
            # rather than crash the whole thread.
            print(f"connect error, retry times : {max_retry}")
            continue
        title = title_tag.text
        # split() both removes surplus blank lines and strips whitespace
        # around each paragraph chunk.
        paragraphs = content_div.text.split()
        with open(f"{name}.txt", "a", encoding="utf-8") as f:
            f.write("-------" + title + "-------" + "\r\n")
            # The last two chunks are site ads appended to every chapter.
            for j in paragraphs[:-2]:
                f.write(" " + j + "\r\n")
        print(f"完成<<{title}>>的爬取!")
    print("完成!!")
# Fan the chapter list out over worker threads; worker i appends its share
# of chapters to "一世之尊{i+1}.txt".
t_list = []
threading_num = 11
# Chunk size uses threading_num - 1 divisions so the final thread picks up
# whatever remainder integer division leaves over.
chunk = int(len(chapter_list) / (threading_num - 1))
for i in range(threading_num):
    end = chunk * (i + 1)
    if end <= len(chapter_list):
        part = chapter_list[end - chunk:end]
    else:
        part = chapter_list[end - chunk:]
    t = threading.Thread(target=pachong, args=(part, f"一世之尊{i+1}"))
    t_list.append(t)
    t.start()
# Wait for every worker before verifying and merging the part files below.
for t in t_list:
    t.join()
def check(threading_num, true_length):
    """Return True when the part files hold exactly true_length chapters.

    Each chapter was written with a "-------title-------" separator line, so
    counting lines containing "-----" across all threading_num part files
    gives the number of chapters actually downloaded.
    """
    found = 0
    for part in range(1, threading_num + 1):
        with open(f"一世之尊{part}.txt", "r", encoding="utf-8") as fh:
            found += sum(1 for line in fh if "-----" in line)
    return true_length == found
def combine(threading_num):
    """Merge part files 一世之尊1.txt .. 一世之尊{threading_num}.txt into 一世之尊.txt.

    Each part file is appended in thread order and then deleted.

    Fixes vs. original: the output file was reopened with mode "w" on every
    loop iteration, so each part overwrote the previous one and only the
    last merged part survived; and the loop ran range(threading_num - 1),
    leaving the final part file both unmerged and undeleted.
    """
    with open("一世之尊.txt", "w", encoding="utf-8") as out:
        for part in range(1, threading_num + 1):
            part_name = f"一世之尊{part}.txt"
            with open(part_name, "r", encoding="utf-8") as src:
                out.write(src.read())
            os.remove(part_name)  # part file is no longer needed once merged
# Verify the chapter count across part files, report, then merge the parts
# into the final book file.
download_ok = check(threading_num, len(chapter_list))
if download_ok:
    print("校验成功,下载无错误!")
combine(threading_num)