使用PYTHON语言,用到的外部包有parsel, requests。
逻辑:首先得到该小说所有章节地址,再使用多线程访问链接,得到的内容放入object列表中,最后写入本地文件。
功能:设置菜单,由此可以选择不同的小说站点;写入本地时会在命令行打印所有章节名称;高速爬取小说。
注意:1,爬取时没有提示,但写入本地时会显示章节名称;2,测试用的小说站点已清除,在具体使用时,需要自己修改部分代码,其中用注释(需要手动修改)标明。
1 import requests
2 import parsel
3 import threading
4 import re
5
# Constants
## CSS selectors for page elements, plus the site hosts
N_NAME = '.read h3::text'    # novel-name selector; NOTE(review): unused by the visible code — handle_pagination hard-codes the same string
N_TITLE = '.title::text'     # chapter-title selector; NOTE(review): pull_content1 hard-codes the same string instead of using this
N_CONTENT = '.text::text'    # chapter-body selector; NOTE(review): pull_content1 hard-codes the same string instead of using this
# Site hosts, must be edited by hand for the target sites
HOST1 = 'https://m.xxxx.com'
HOST2 = 'http://www.xxxx.com'

threadLock = threading.RLock()  # lock used by single_job to serialize fetch+write
chapters = []                   # shared list of Chapter objects, filled by worker threads
17
# Write a single chapter to the local file.
## f: an open, writable text file object
## chapter: object with .title and .content string attributes
def harddisk_write(f, chapter):
    """Append one chapter (title, then content) to *f* and echo the title.

    Flushes after each chapter so progress is visible on disk immediately.
    """
    f.write(chapter.title)
    f.write('\n')
    f.write(chapter.content)
    f.write('\n')
    f.flush()
    # Progress feedback: chapter names are printed only at write time.
    print(chapter.title)
32
# Fetch and parse one chapter page; returns the chapter title and body.
# Same role as the next function; each one handles a different HOST layout.
## Needs manual editing: the CSS selectors (N_TITLE / N_CONTENT constants)
def pull_content1(url):
    """Return (chapter_title, chapter_content) scraped from *url* (HOST1 layout)."""
    res = requests.get(url)
    sel = parsel.Selector(res.text)
    # Consistency fix: use the module-level selector constants instead of
    # duplicating the same literal strings here.
    chapter_title = sel.css(N_TITLE).get()       # chapter title
    chapter_list = sel.css(N_CONTENT).getall()   # chapter body lines
    # Drop the first three lines: site boilerplate, not story text.
    chapter_content = "\n".join(chapter_list[3:])
    return chapter_title, chapter_content
43
# Fetch and parse one chapter page; returns the chapter title and body.
## Needs manual editing: the CSS selectors below (HOST2 layout)
def pull_content2(url):
    """Return (chapter_title, chapter_content) scraped from *url* (HOST2 layout)."""
    page = requests.get(url)
    selector = parsel.Selector(page.text)
    title = selector.css('.bookname h1::text').get()      # chapter title
    body_lines = selector.css('#content::text').getall()  # chapter body lines
    return title, "".join(body_lines)
53
# Walk a paginated chapter index and collect every chapter URL (HOST1 layout).
# Same role as the next function; each one handles a different HOST.
## Needs manual editing: the CSS selectors and the pagination logic
def handle_pagination(first_page_url):
    """Return (novel_name, chapter_urls) for the novel at *first_page_url*.

    Fetches every index page in turn, collecting the relative chapter
    links and joining them onto HOST1.
    """
    response = requests.get(first_page_url)
    sel = parsel.Selector(response.text)
    novel_name = sel.css('.read h3::text').get().strip()  # novel title
    # Pagination text; the second number found in it is taken as the page
    # count — site-specific, verify against the target site's markup.
    page_str = sel.css('.page::text').getall()[2]
    pages = int(re.findall(r'\d+', page_str)[1])
    # Numeric path segment of the URL (e.g. "a_b_1"); its third part is
    # assumed to be the page number.
    second_part = re.search(r'(\d+_?)+', first_page_url).group()
    second_part_sp = second_part.split('_')
    ## Build the per-page index URLs and gather chapter links from each.
    urls = []
    for page_no in range(1, pages + 1):
        second_part_sp[2] = str(page_no)
        url = HOST1 + '/' + '_'.join(second_part_sp)
        response = requests.get(url)
        sel = parsel.Selector(response.text)
        urls.extend(sel.css('.chapter li a::attr(href)').getall())

    return novel_name, [HOST1 + '/' + href for href in urls]
78
# Get the novel name and the full list of chapter URLs (HOST2 layout).
## Needs manual editing: the CSS selectors below
def handle_mainpage(first_url):
    """Return (novel_name, chapter_urls) from the novel's main page."""
    page = requests.get(first_url)
    sel = parsel.Selector(page.text)
    name = sel.css('#info h1::text').get().strip()
    hrefs = sel.css('#list dd a::attr(href)').getall()
    return name, [HOST2 + '/' + href for href in hrefs]
87
# Data holder for one novel chapter.
class Chapter:
    """One chapter of a novel; *index* preserves the original ordering."""

    def __init__(self, url, title, content, index):
        self.url = url          # chapter page address
        self.title = title      # chapter heading
        self.content = content  # chapter body text
        self.index = index      # position used by sorted()

    def set(self, title, content):
        """Fill in the scraped title and body."""
        self.title, self.content = title, content

    def get_title(self):
        return self.title

    def get_content(self):
        return self.content

    def __repr__(self):
        return repr((self.index, self.title))
109
# Fetch and write one chapter sequentially. Deprecated: the lock serializes
# everything, so this cannot reach multi-threaded speed.
def single_job(url, f):
    """Fetch the chapter at *url* and write it straight to *f* (serialized)."""
    threadLock.acquire()
    try:
        title, content = pull_content1(url)
        # BUG FIX: harddisk_write takes (f, chapter), not (f, title, content);
        # wrap the scraped values in a Chapter object.
        harddisk_write(f, Chapter(url, title, content, 0))
    finally:
        # Release even if the fetch/parse raises, or the lock would leak.
        threadLock.release()
117
# Worker body: fetch one chapter and store the result on the Chapter object.
## Needs manual editing: keep the branch conditions in sync with menu()
def spider_job(chapter, number):
    """Fill *chapter* in place using the parser selected by *number* (1 or 2)."""
    url = chapter.url
    if number == 1:
        title, content = pull_content1(url)
    elif number == 2:
        title, content = pull_content2(url)
    else:
        # BUG FIX: an unknown number previously left title/content unbound
        # and raised NameError below; fail explicitly instead.
        raise ValueError("unknown site number: %r" % (number,))

    chapter.title, chapter.content = title, content
129
# Menu: choose which novel site to crawl.
## Needs manual editing: the printed entries
def menu():
    """Print the numbered list of supported sites."""
    for entry in ("1, xxx1", "2, xxx2"):
        print(entry)
135
# Main program.
## Needs manual editing: keep the if-branches in sync with menu()
def main():
    """Drive the whole crawl: pick a site, fetch all chapters, write the file."""
    menu()
    number = int(input("输入选择项目的序号:"))
    first_url = input("请输入小说地址: ")
    ## novel name, list of chapter URLs
    if number == 1:
        novel_name, urls = handle_pagination(first_url)
    elif number == 2:
        novel_name, urls = handle_mainpage(first_url)
    else:
        return  # unknown menu entry: nothing to do

    # One thread per chapter; each worker fills its Chapter object in place.
    threads = []
    for i, url in enumerate(urls, start=1):
        chapter = Chapter(url, "", "", i)
        chapters.append(chapter)
        worker = threading.Thread(target=spider_job, args=(chapter, number))
        worker.start()
        threads.append(worker)

    # Wait for every fetch to finish before writing.
    for t in threads:
        t.join()

    # BUG FIX: sorted() returns a new list which was discarded, so chapters
    # were written in arbitrary completion order; sort in place instead.
    chapters.sort(key=lambda ch: ch.index)
    # Write the assembled novel. mode='w' (was 'a') so a re-run does not
    # append a duplicate copy of the whole book.
    with open("E:/online-novel/" + novel_name + ".txt", mode='w', encoding='utf-8') as f:
        for chapter in chapters:
            harddisk_write(f, chapter)
169
# Entry-point guard so importing this module does not start a crawl.
if __name__ == "__main__":
    main()