Note: this only scrapes the text of each post; anything else is doable too, but implement it yourself.
import time

import requests
from lxml import etree
class blog():

    def __init__(self, url):
        self.base_url = url
        # Pretend to be a browser so the server does not block the crawler
        self.headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36"}

    def get_html(self, url):
        # The pages are served over plain GET, so a GET request is used;
        # headers must be passed by keyword (the second positional argument is params)
        response = requests.get(url, headers=self.headers, timeout=5)
        if response.status_code == 200:  # a 200 status code means the request succeeded
            response.encoding = response.apparent_encoding
            return response.text
        return None

    def get_url1(self, html):
        # Collect the title and URL of every post on one listing page
        x_html = etree.HTML(html)
        url_1 = x_html.xpath('//a[@class="postTitle2"]/@href')
        name = x_html.xpath('//a[@class="postTitle2"]/text()')
        names = [i.strip() for i in name]  # strip surrounding whitespace
        if names:
            return names, url_1
        return [], []  # keep the return shape so the caller can always unpack two values

    def get_url2(self, s):
        # Listing pages follow the pattern <base_url>default.html?page=<n>
        url = self.base_url + "default.html?page=" + s
        return url

    def get_text(self, html):
        x_html = etree.HTML(html)
        # The div's attribute was garbled in the original; [@id="cnblogs_post_body"]
        # is the post-body container on cnblogs, which the selectors above target
        txt = x_html.xpath('//div[@id="cnblogs_post_body"]//p//text()')
        return txt

    def save_text(self, name, txt):
        # Save the post body to a .txt file; txt is the list of extracted fragments
        print(name + " loading...")
        # The with-block closes the file automatically, so no explicit close() is needed
        with open("C:/Users/25766/Desktop/sa/" + name + '.txt', 'w', encoding='utf-8') as f:  # output path
            for i in txt:
                f.write(i)
                f.write('\n')
        print("finish")

surl = input("Enter the home-page URL of the blog to crawl: ")
c = blog(surl)
for i in range(1, 200):  # walk the listing pages, whose URLs follow a regular pattern
    url = c.get_url2(str(i))
    ls = c.get_html(url)
    time.sleep(2)  # pause between requests to go easy on the server
    if ls is None:
        continue
    names, urls = c.get_url1(ls)
    if not names:
        break  # an empty listing page means the last page has been passed
    for name, url in zip(names, urls):
        html = c.get_html(url)
        if html is None:
            continue
        txt = c.get_text(html)
        c.save_text(name[:5], txt)  # name the file after the first 5 characters of the title
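One caveat worth noting: post titles often contain characters that Windows forbids in file names (such as ? or :), so save_text can fail at open(). Below is a minimal sketch of a sanitizer, assuming the standard Windows character rules; safe_name is a hypothetical helper that is not part of the original script:

import re

def safe_name(title, max_len=5):
    # Hypothetical helper (not in the original): strip characters that are
    # illegal in Windows file names, then truncate like name[:5] above
    cleaned = re.sub(r'[\\/:*?"<>|]', '', title).strip()
    return cleaned[:max_len] or "untitled"

# Usage: replace c.save_text(name[:5], txt) with c.save_text(safe_name(name), txt)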
Pursue what I love.