近期搜电子书的时候发现一个有趣的网站,上面有很多精校版的电子书。由于好奇,就想做一个爬虫把书名汇总一下(具体原因在于该网站canvas的页面背景效果在Chrome浏览器里面特别消耗资源),之后自己按书名去搜索,然后找下载地址。十几分钟,脚本基本写完,一晚上时间也差不多能够跑完了。

分享代码,仅供参考(比较粗糙)。

package com.fun
2
3import com.fun.db.mysql.MySqlTest
4import com.fun.frame.httpclient.FanLibrary
5import com.fun.utils.Regex
6import org.slf4j.Logger
7import org.slf4j.LoggerFactory
8
/**
 * Crawls the book detail pages of an e-book site and stores each book's
 * name, author, introduction and download links in the MySQL table "books".
 *
 * The site address is redacted as asterisks in the URL and regex literals
 * below; substitute the real host before running.
 */
class T extends FanLibrary {

    static Logger logger = LoggerFactory.getLogger(T.class)

    /** Attribute prefix that precedes the book name and the author text. */
    private static final String TITLE_MARKER = "title=\""

    /** Attribute prefix that precedes the introduction text. */
    private static final String CONTENT_MARKER = "content=\""

    /** Query parameter that precedes the real download address in a redirect link. */
    private static final String REDIRECT_MARKER = "url="

    /**
     * Visits book ids 1 to 1000 in order, pausing 2000 ms after each one.
     * A failure on one page is logged and does not stop the run.
     */
    public static void main(String[] args) {
        def list = 1..1000 as List

        list.each { x ->
            try {
                test(x)
            } catch (Exception e) {
                logger.error(x.toString())
                output(e)
            }
            // Progress marker: written for every id, whether it succeeded or failed.
            logger.warn(x.toString())
            sleep(2000)
        }

        testOver()
    }

    /**
     * Fetches the detail page of one book, extracts its fields and inserts a row
     * into the "books" table. Pages reporting that the book does not exist are skipped.
     *
     * "****" in the URL stands for the redacted site address.
     *
     * @param id numeric book id used to build the page URL and stored as "bookid"
     * @throws IllegalStateException when the name, introduction or author cannot be
     *         found in the page
     */
    static def test(int id) {
        def get = getHttpGet("https://****/books/" + id + ".html")
        def response = getHttpResponse(get)
        def string = response.getString("content")
        if (string.contains("您需求的文件不存在") || string.contains("页面未找到")) return
        output(string)

        // Name, introduction and author are mandatory; the two download links are optional.
        def nameTag = requiredMatch(string, "class=\"bookpic\"> <img title=\".*?\"", "name", id)
        def introTag = requiredMatch(string, "content=\"内容简介.*?\"", "intro", id)
        def authorTag = requiredMatch(string, "title=\"作者:.*?\"", "author", id)
        def ctfileLink = optionalMatch(string, "https://*******\\.cc/go\\.html\\?url=https{0,1}://.*?\\.ctfile\\.com/.*?\"")
        def baiduLink = optionalMatch(string, "https://******\\.cc/go\\.html\\?url=https{0,1}://pan\\.baidu\\.com/.*?\"")
        output(nameTag)
        output(introTag)
        output(authorTag)
        output(ctfileLink)
        output(baiduLink)

        // Cut after the FIRST fixed marker instead of the last "=": values that
        // themselves contain "=" (for example a link ending in "?surl=...") were
        // previously truncated to the part after their final "=".
        def name = valueAfter(nameTag, TITLE_MARKER)
        def author = valueAfter(authorTag, TITLE_MARKER)
        def intro = valueAfter(introTag, CONTENT_MARKER)
        def url1 = valueAfter(ctfileLink, REDIRECT_MARKER)
        def url2 = valueAfter(baiduLink, REDIRECT_MARKER)
        output(name, author, intro, url1, url2)

        // NOTE(review): the statement is still assembled as text because sendWork
        // is only seen here taking a SQL string. The values are escaped below;
        // prefer a parameterized statement if MySqlTest offers one.
        def sql = String.format("INSERT INTO books (name,author,intro,urlc,urlb,bookid) VALUES (\"%s\",\"%s\",\"%s\",\"%s\",\"%s\",%d)",
                escapeSql(name), escapeSql(author), escapeSql(intro), escapeSql(url1), escapeSql(url2), id)
        MySqlTest.sendWork(sql)
    }

    /** Returns the first regex match in the page, or fails with a message naming the missing field. */
    private static String requiredMatch(String html, String regex, String field, int id) {
        def matches = Regex.regexAll(html, regex)
        if (matches.size() == 0) {
            throw new IllegalStateException("book " + id + ": no match for required field '" + field + "'")
        }
        return matches.get(0)
    }

    /** Returns the first regex match in the page, or an empty string when there is none. */
    private static String optionalMatch(String html, String regex) {
        def matches = Regex.regexAll(html, regex)
        return matches.size() == 0 ? "" : matches.get(0)
    }

    /** Returns the text between the first occurrence of marker and the match's trailing quote. */
    private static String valueAfter(String match, String marker) {
        if (match == "") return ""
        int start = match.indexOf(marker) + marker.length()
        return match.substring(start, match.length() - 1)
    }

    /** Escapes backslashes and double quotes so the value stays inside its double-quoted SQL literal. */
    private static String escapeSql(String value) {
        return value.replace("\\", "\\\\").replace("\"", "\\\"")
    }
}

个人感觉还是比较满意的。

电子书网站爬虫实践_mysql数据库截图

公众号后台回复“电子书”可得网站地址和CSV文件下载地址。