Downloading manga for a buddy

Code:

#!/usr/bin/python3.4
# -*- coding:utf-8 -*-

import requests
import os
import time
import re
from lxml import etree
import random


def geturl(url, postdata):
    header = {'User-Agent':
              'Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5',
              'Referer': 'http://m.1kkk.com/vol1-6871/',
              'Host': 'manhua1023.61-174-50-131.cdndm5.com',
              'Accept': 'image/png,image/*;q=0.8,*/*;q=0.5',
              'Accept-Encoding': 'gzip, deflate',
              'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
              'Connection': 'keep-alive',
              }

    # POST the faked reading stats first so the session picks up the
    # cookies, then request the image with those cookies attached.
    s = requests.Session()
    r = s.post('http://m.1kkk.com/userdata.ashx', data=postdata)
    _cookies = r.cookies
    # print(r.content)
    rs = s.get(url, headers=header, cookies=_cookies)
    return rs


def get(url):
    header = {'User-Agent':
              'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
              'Referer': 'http://www.1kkk.com/manhua589/',
              'Host': 'www.1kkk.com'}

    # Fetch a page from the desktop site
    html_bytes = requests.get(url, headers=header)

    return html_bytes


def mget(url):
    header = {'User-Agent':
              'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0',
              'Referer': 'http://m.1kkk.com/manhua589/',
              'Host': 'm.1kkk.com'}

    # Fetch a page from the mobile site
    html_bytes = requests.get(url, headers=header)

    return html_bytes


# Strip characters that are illegal in Windows file names
def validateTitle(title):
    # '/\:*?"<>|'
    rstr = r"[\/\\\:\*\?\"\<\>\|]"
    new_title = re.sub(rstr, "", title)
    return new_title


def prints(timesleep):
    print('Pausing ' + str(timesleep) + ' seconds before the batch download; keep the network up...')
    time.sleep(timesleep)


# Pull the "var ... .split" fragment out of the obfuscated JS response
def regnext(js):
    reg = r'(var.+?.split)'
    pattern = re.compile(reg)
    alllist = re.findall(pattern, js)
    return alllist


# Create a folder, including any missing parents
def createjia(path):
    try:
        os.makedirs(path)
    except OSError:
        print('Directory already exists: ' + path)


if __name__ == '__main__':

    html = get('http://www.1kkk.com/manhua589/').content.decode('utf-8', 'ignore')

    page = etree.HTML(html.lower())
    # Chapter URL suffixes
    hrefs = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/a/@href')
    # Chapter names
    hrefnames = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/a/text()')
    # Page counts
    hrefpages = page.xpath('//ul[@class="sy_nr1 cplist_ullg"][2]/li/text()')

    href = []
    hrefname = []
    hrefpage = []
    number = 1

    # No idea which of those volumes actually belong to the manga;
    # grab everything first and sort it out later.
    # Chapter URL suffixes
    for temp in hrefs:
        towurl = temp
        href.append(towurl)
    # Chapter names
    for temp in hrefnames:
        hrefname.append(temp)
    # Page counts: strip the trailing "页" ("pages")
    for temp in hrefpages:
        hrefpage.append(temp.replace("页", ""))

    for i in range(0, len(href)):
        # One folder per chapter, so chapters don't overwrite each other
        filenamep = '../data/' + validateTitle(str(hrefname[i])) + "/"
        createjia(filenamep)

        # Iterate over the pages of this chapter
        # (assumes the scraped page count is a bare number)
        for j in range(int(hrefpage[i])):

            # 6871, 6872, ...
            hrefnumber = str(href[i]).replace("ch54-", "").replace("/", "").replace("vol1-", "")
            # print(hrefnumber)
            # Build the js URL, e.g.
            # http://www.1kkk.com/vol1-6871/imagefun.ashx?cid=6871&page=1&key=65abd421f4aed565&maxcount=10
            jsurl = "http://www.1kkk.com" + str(href[i]) + "/imagefun.ashx?cid=" + str(hrefnumber) + "&page=" + str(j + 1) + "&key=65abd421f4aed565&maxcount=10"
            print(jsurl)

            # Decode the pipe-separated token list hidden in the JS response
            html = get(jsurl).content.decode('utf-8', 'ignore')
            html1 = regnext(html)
            html1 = html1[0].replace("'.split", "").split('|')

            # Build the image URL, e.g.
            # http://manhua1023.61-174-50-131.cdndm5.com/1/589/6871/102_9224.jpg?cid=6871&key=d8ce90e0b3f013f292ef77e84da88990&type=1
            image_1url = "http://manhua1023." + str(html1[19]) + "-" + str(html1[18]) + "-" + str(html1[9]) + "-" + str(
                html1[10]) + ".cdndm5.com/1/589/" + str(hrefnumber) + "/" + str(html1[20]) + "?cid=" + str(hrefnumber) + "&key=" + str(
                html1[8]) + "&type=1"
            print(image_1url)

            # Open the output file for this page
            filess = open(filenamep + str(j + 1) + '.jpg', 'wb')

            # Fake the postdata the reader page would normally send
            postdata = {
                'cid': hrefnumber,
                'language': 1,
                'mid': 589,
                'page': j + 1,
                'tp': 8,
                'uid': 0
            }

            # Even with the correct URL, the download still fails
            pic = geturl(image_1url, postdata)
            filess.write(pic.content)
            filess.close()
            print('Wrote image ' + str(j + 1))

            # Pause 1-3 seconds between downloads
            loadimg = random.randint(1, 3)
            print('Pausing ' + str(loadimg) + ' seconds')
            time.sleep(loadimg)
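The fragile part of this script is regnext() plus the fixed-index lookups into html1: imagefun.ashx returns obfuscated JS whose payload is a single pipe-separated string, and the image URL is stitched together from tokens at hard-coded positions. Here is a minimal sketch of just that decoding step, using a made-up response string; the real token order is specific to this site version, so the indices (8, 9, 10, 18, 19, 20) are only illustrative:

import re

# Hypothetical imagefun.ashx response; the real one is much longer
js = ("var data='x0|x1|x2|x3|x4|x5|x6|x7|d8ce90e0|50|131|x11|x12|x13|x14"
      "|x15|x16|x17|174|61|102_9224.jpg'.split('|');")

# Same idea as regnext(): grab the quoted payload in front of .split
tokens = re.search(r"var.+?'(.+?)'\.split", js).group(1).split('|')

# Reassemble the image URL from tokens at fixed positions,
# as the main script does with html1[...]
image_url = ("http://manhua1023." + tokens[19] + "-" + tokens[18] + "-" +
             tokens[9] + "-" + tokens[10] + ".cdndm5.com/1/589/6871/" +
             tokens[20] + "?cid=6871&key=" + tokens[8] + "&type=1")
print(image_url)
# -> http://manhua1023.61-174-50-131.cdndm5.com/1/589/6871/102_9224.jpg?cid=6871&key=d8ce90e0&type=1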


Selenium scraping:

#!/usr/bin/python3.4
# -*- coding: utf-8 -*-

from selenium import webdriver
import time
from selenium.webdriver.common.keys import Keys
import re


# Strip characters that are illegal in Windows file names
def validateTitle(title):
    rstr = r"[\/\\\:\*\?\"\<\>\|]"  # '/\:*?"<>|'
    new_title = re.sub(rstr, "", title)
    return new_title


def getimg():

    # Open Firefox
    # browser = webdriver.Chrome()
    browser = webdriver.Firefox()

    # Set the browser window size
    browser.set_window_size(1200, 900)
    # Load the page
    browser.get("http://m.1kkk.com/vol1-6871/")
    # Adjust this wait to your own network speed
    time.sleep(10)

    for i in range(10000):

        # Close the ad overlay
        browser.find_element_by_class_name("ad_cross").click()

        # Scroll to the very bottom so every image gets lazy-loaded;
        # the script flags completion by appending to document.title
        browser.execute_script("""
            (function () {
                var y = 0;
                var step = 100;
                window.scroll(0, 0);

                function f() {
                    if (y < document.body.scrollHeight) {
                        y += step;
                        window.scroll(0, y);
                        setTimeout(f, 100);
                    } else {
                        window.scroll(0, 0);
                        document.title += "scroll-done";
                    }
                }

                setTimeout(f, 1000);
            })();
        """)
        print("Scrolling down...")
        # time.sleep(180)
        while True:
            if "scroll-done" in browser.title:
                break
            else:
                print("Not at the bottom yet...")
                time.sleep(10)

        # while True:
        #     # Check whether the "next chapter" link exists yet
        #     select = browser.find_element_by_xpath('//a[@class="readTipForm"]')
        #     if select:
        #         break
        #     else:
        #         print("Not at the bottom yet...")
        #         time.sleep(60)

        print("Downloading images...")
        # File name for the screenshot
        name = validateTitle(browser.current_url)
        print("Taking a screenshot...")
        time.sleep(5)

        # Screenshot the current window
        browser.save_screenshot("../jpg/cartoon/" + str(i + 1) + str(name) + ".png")
        time.sleep(5)

        # Click through to the next chapter
        browser.find_element_by_class_name("readTipForm").click()
        print("Moving on to the next chapter...")
        time.sleep(5)

    browser.quit()


if __name__ == '__main__':
    getimg()
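The hand-rolled while True loop that polls browser.title works, but Selenium's explicit waits do the same polling with a timeout built in. A minimal sketch of the same "signal completion through document.title" trick using WebDriverWait; the setTimeout here is just a stand-in for the scroll routine above, and the path and timeouts are illustrative:

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Firefox()
browser.get("http://m.1kkk.com/vol1-6871/")

# Stand-in for the scroll script: flag completion via the title after 2 s
browser.execute_script(
    'setTimeout(function () { document.title += "scroll-done"; }, 2000);')

# Block for up to 300 seconds, checking the title every 10 seconds,
# instead of the manual while/sleep loop
WebDriverWait(browser, 300, poll_frequency=10).until(
    EC.title_contains("scroll-done"))

browser.save_screenshot("../jpg/cartoon/test.png")
browser.quit()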