Biquge (笔趣阁) has a lot of mirror sites. Since I have been reading 一世之尊 lately, I wanted to try scraping the whole book from one of them.

From the book's index page you can find the URL of every chapter. These need to be saved in a list so that all chapters can be scraped by iterating over it.

Looking at the HTML through the F12 developer tools, all chapter URLs sit inside <div id="list">, each one in an <a href="xxx"> tag. After discarding the redundant links at the head and tail, we are left with the target chapter URLs.
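Here is a minimal sketch of that extraction step (the full script at the end takes the cruder route of slicing find_all("a")[35:-10], since the exact positions of the redundant links depend on the page):

import requests
from bs4 import BeautifulSoup

index = requests.get("https://www.vipxs.la/0_740/")
soup = BeautifulSoup(index.content, "lxml")
links = soup.find("div", id="list").find_all("a")   # every <a> tag inside the chapter list
chapter_list = [a.get("href") for a in links]        # relative URLs, joined with the site root later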

Looking at a chapter page, the scraping itself seemed simple: all the text sits inside <div id="content">. And yet I ran into a problem.

Problem 1: when printing the scraped text, only the last line showed up and the rest of the text disappeared. My guesses: 1. an anti-scraping mechanism is returning a fake page; 2. the text is generated by JavaScript rather than being part of the static page; 3. an encoding/decoding problem.

Guess 1: building a User-Agent, cookies and Referer, and even switching IPs, all had no effect, so I set this guess aside for the moment.

Guess 2: inspecting the XHR/XML requests in the Network tab (F12), I found no trace of the target data. After disabling JavaScript in the browser the text was still displayed, and it also shows up directly in the page source. Everything points to guess 2 being wrong.

Guess 3: fetching the content with .find("div", id="content").text, the text finally displayed correctly.
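As a minimal sketch of that working extraction (the chapter URL below is just a placeholder):

import requests
from bs4 import BeautifulSoup

res = requests.get("https://www.vipxs.la/0_740/xxxxxxx.html")   # placeholder chapter URL for illustration
soup = BeautifulSoup(res.content, "lxml")
content = soup.find("div", id="content").text                   # .text returns the whole chapter, not just the last line
print(content)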

 

What remains is formatting: the scraped content contains some ads and extra blank lines. split() is a quick way to strip the leading/trailing spaces and newline characters.
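A quick illustration of that cleanup (the sample text and ad line are made up for the example):

raw = "  正文第一段\r\n  正文第二段\r\n\r\n记住本站地址 xxx.com\r\n"   # made-up sample: body text plus a trailing ad
parts = raw.split()   # splitting on any whitespace strips the spaces, blank lines and \r\n
print(parts)          # ['正文第一段', '正文第二段', '记住本站地址', 'xxx.com']
# the full script below then throws away the trailing ad tokens with parts[:-2]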

Multithreading speeds the scraping up, but if it runs too fast the connections start timing out and have to be retried...

import time
import requests
from bs4 import BeautifulSoup
import random
import threading
import os
url = "https://www.vipxs.la/0_740/"

headers = [  # pool of User-Agent strings; one is picked at random for each request
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36",
            "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
            "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; en) Presto/2.8.131 Version/11.11",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
            "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)"
            ]
referer = "https://www.vipxs.la/"
cookies = {'Cookie':"UM_distinctid=17f533be31115db-00017a4b760e46-977173c-144000-17f533be312c6c; Hm_lvt_8744b58bc1913cae0d8c4dc68f187d61=1646368908,1646368915,1646450962; CNZZDATA1280571925=488337212-1646363111-https%3A%2F%2Fwww.baidu.com%2F|1646449511; coupletAlllength=5; CNZZDATA1280571999=226664088-1646363299-https%3A%2F%2Fwww.baidu.com%2F|1646449699; Hm_lvt_b48494e860b198c9c71009978cfc755e=1646368908,1646368915,1646450962; fixedalllength=9; Hm_lvt_2d2ceac9af7f7f1a8dbdd51db6dbf36c=1646368908,1646368915,1646450962; 5531_2603_27.38.254.113=1; CNZZDATA1280572003=1348579129-1646364283-https%3A%2F%2Fwww.baidu.com%2F|1646450689; fixedall1length=8; CNZZDATA1280572006=1040752295-1646363481-https%3A%2F%2Fwww.baidu.com%2F|1646449881; Hm_lvt_dd3a5d36b1adfd567e4b8290c0760ba3=1646368908,1646368915,1646450963; clickbids=740; Hm_lvt_4d0a92fe9eb4da3973f356b734b334b6=1646368908,1646368915,1646450963; img3002500length=6; 5531_2570_27.38.254.113=1; Hm_lvt_4ad6b1a6d9755b262a181c469db16477=1646368913,1646450973; 5531_2444_27.38.254.113=1; 5531_2409_27.38.254.113=1; 5531_2403_27.38.254.113=1; CNZZDATA1280572013=42055633-1646365418-https%3A%2F%2Fwww.vipxs.la%2F|1646451818; 5531_2334_27.38.254.113=1; Hm_lpvt_4d0a92fe9eb4da3973f356b734b334b6=1646452229; 5531_2578_27.38.254.113=1; 5531_2563_27.38.254.113=1; coupletAll=1_0_4_3; fixedall=8_3_0_1_7; img3002500=1; fixedall1=6_2_5_7; richviews_5531=cRE9U1a3frz1iDNhc0K7SiahoOmhFed824EmDGllfAcca2YveADIUUZ4RaxXDzxli%2FHutkjPerP9wyrRHpug%2Fk%2B%2FXdViyzcaXEypaCEzuSyrbR9rvqKz9%2B81xBsynM6omYQw9eI3x0PEJ%2FmAv2AsKOY21ere%2Bf4rafFzUUOPSOxxXLwHf95U1sXNnYeOhr9bO8C3j36sy1MkcP77Qh9gspMwrZ4H0%2BfU6rnQPrHZ6CK1hXCb3tiIf6xo6FBRjO%2FgqIO%2FHDGk%2B1CM818cVCaBZ9Fs2LSVVUS7O%2Fa2SrNL7cJPFab2Bk%2FdLithl3nVy4MBs%2B4zlOoKCBlJgo7%2FgZ81Jo%2Bm9L%2BXWpWErQB%2FSEXRAoUVYIQ6TruK8dqMZPqQCUVJHqUtXDu0NCqW2r0KinusY8Rc5tlzdayjPWF%2F7yNEwsGb0LVYWk4Q9Atf4lHmt14iY9b4O0MLPZwckbtZ4IIY7SbW5yOn%2FHtyaJS0EvjOpW%2B7KS%2FVZ4LfxkwzbquJANRA7nHhVOMkUt9ldFOqcIaZB67%2BPHDwub0o4cfyKyi%2BaU2jOkmnnKxpRwFAjQEVF0Dd5m6T0xUCN9SL04vmT%2FQEHg47z0NyL9txUFInfFU7qhlGzFUKpoTbqzAogzKRVn1N%2BItSh1Atqcme8eLqzr%2BTw1grq7Dkbn9f52e47o%2FEl38%3D; Hm_lpvt_8744b58bc1913cae0d8c4dc68f187d61=1646452466; Hm_lpvt_b48494e860b198c9c71009978cfc755e=1646452466; Hm_lpvt_2d2ceac9af7f7f1a8dbdd51db6dbf36c=1646452466; Hm_lpvt_dd3a5d36b1adfd567e4b8290c0760ba3=1646452466; Hm_lpvt_4ad6b1a6d9755b262a181c469db16477=1646452466"}
chapter_list = []

res = requests.get(url, headers={"User-Agent": random.choice(headers), "Referer": referer})
bsobj = BeautifulSoup(res.content, 'lxml')  # build a BeautifulSoup object from the page source for easy querying
temp = bsobj.find_all("a")
for i in temp[35:-10]:  # slice off the redundant links at the head and tail of the index page
    chapter_list.append(i.get('href'))



def pachong(chapter_list, name):
    # each thread appends to its own part file; create it up front so it exists even if the slice is empty
    open(f"{name}.txt", "a", encoding="utf-8").close()
    for i in chapter_list:
        url = "https://www.vipxs.la" + i
        max_retry = 15
        for n in range(max_retry):
            try:
                time.sleep(random.randint(1, 10))  # random delay so the site does not start dropping connections
                res2 = requests.get(url, headers={'User-Agent': random.choice(headers)}, cookies=cookies, timeout=15)
                if res2.status_code == 200:
                    break
            except requests.RequestException:  # catch network errors only, not a bare except
                print(f"connect error, retry times : {n+1}")
                if n+1 == max_retry:
                    print("重连次数过多,自动结束程序")
                    return  # exit() inside a worker thread only stops that thread anyway, so return is clearer

        bsobj2 = BeautifulSoup(res2.content, 'lxml')  # build a BeautifulSoup object from the page source for easy querying
        content = bsobj2.find("div", id="content").text
        title = bsobj2.find("h1").text
        temp = content.split()  # split() on whitespace strips the spaces, blank lines and \r\n in one pass
        with open(f"{name}.txt", "a", encoding="utf-8") as f:
            f.write("-------" + title + "-------" + "\r\n")
            for j in temp[:-2]:  # drop the last two tokens (the trailing ad text)
                f.write("  " + j + "\r\n")
        print(f"完成<<{title}>>的爬取!")
    print("完成!!")




t_list = []
threading_num = 11
chunk = len(chapter_list) // (threading_num - 1)  # slice size for each thread; the last thread picks up the remainder
for i in range(threading_num):
    start = chunk * i
    end = chunk * (i + 1) if i < threading_num - 1 else len(chapter_list)
    t = threading.Thread(target=pachong, args=(chapter_list[start:end], f"一世之尊{i+1}"))
    t_list.append(t)
    t.start()

for t in t_list:
    t.join()

def check(threading_num, true_length):
    # count the chapter headers written across all the part files and compare with the expected chapter count
    count = 0
    for i in range(threading_num):
        with open(f"一世之尊{i + 1}.txt", "r", encoding="utf-8") as f:
            for j in f:
                if "-----" in j:
                    count += 1
    return true_length == count

def combine(threading_num):
    # merge the per-thread part files, in order, into a single book file and delete them afterwards;
    # the merged file is opened once in "w" mode so that earlier parts are not overwritten
    with open("一世之尊.txt", "w", encoding="utf-8") as f:
        for i in range(threading_num):
            with open(f"一世之尊{i+1}.txt", "r", encoding="utf-8") as f2:
                f.write(f2.read())
            os.remove(f"一世之尊{i+1}.txt")



if check(threading_num,len(chapter_list)):
    print("校验成功,下载无错误!")
    combine(threading_num)