from lxml import etree
import requests
import re

class QiuShiBai(object):

    def __init__(self):
        self.start_save = True
        # Request headers
        self.headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",'Cookie': """_xsrf=2|9eddf422|645cfaf9eb5a803c30cc80e1d1615c5d|1601094123; BAIDU_SSP_lcr=https://www.baidu.com/link?url=jq06Qc3HHKSLv7Hwapz8ijVO_TwbO0x1RwK6HbLb-b6NU5F7uwvxLeMCmI3rpUWC&wd=&eqid=e41515e3000d2fbd000000035f6ec1e7; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1601094125; gr_user_id=afa35f5a-04c9-4b03-ae07-3ac3f807b811; _qqq_uuid_="2|1:0|10:1601094124|10:_qqq_uuid_|56:NmUwMjVmNWYzMjVkNjNjNGE0M2RlYjIzOWEyMmZlMjAxMjVmZTQ1OQ==|4347d1f32342ae3ef17c9eb338787ce49f582101342c1f5c82e729518208e2ac'; _ga=GA1.2.1225827497.1601094125; _gid=GA1.2.1087809495.1601094125; grwng_uid=5b71ba19-07ac-4dd6-8a86-05f77385daa4; __cur_art_index=4001; Hm_lvt_743362d4b71e22775786fbc54283175c=1601094221; Hm_lpvt_743362d4b71e22775786fbc54283175c=1601094890; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1601098839; ff2672c245bd193c6261e9ab2cd35865_gr_session_id=5020a836-ce51-4d9f-8cf0-8ec57b966aa1; _gat=1; ff2672c245bd193c6261e9ab2cd35865_gr_session_id_5020a836-ce51-4d9f-8cf0-8ec57b966aa1=true"""}
        # Starting URL
        self.start_url = "https://www.qiushibaike.com/text/?page=1"

    # Fetch the page content
    def parse_url(self, url):
        resp = requests.get(url, headers=self.headers)
        resp = resp.content.decode()
        # print(resp)
        # Parse the response into an XPath-queryable object for easy extraction
        # (see the XPath sketch after this program)
        return etree.HTML(resp)

    # Extract the fields and return the next page's URL
    def filter_argu(self, res):
        # li_list = res.xpath("//div[@class='col1 old-style-col1']//div[@class='content']/span[1]/text()")
        # Extract the URL of the next page (None when there is no next page)
        next_href = res.xpath("//span[@class='next']/../@href")
        next_url = 'https://www.qiushibaike.com' + next_href[0] if next_href else None
        # Extract the list of post nodes
        li_list = res.xpath("//div[@class='col1 old-style-col1']/div")
        # Walk the list and pull out the fields we need
        for li in li_list:
            # Build a fresh dict for every post
            item = {}
            item['content'] = li.xpath(".//div[@class='content']/span[1]/text()")[0]
            item['content'] = re.sub(r'<br>|\n', '', item['content'])
            item['user_id'] = li.xpath(".//div[contains(@class,'author')]/a[2]/h2/text()")[0]
            item['user_id'] = re.sub(r'\n', '', item['user_id'])

            item['user_img'] = "https:" + li.xpath(".//div[contains(@class,'author')]/a[1]/img/@src")[0]

            item['hot_comment'] = li.xpath("./a[contains(@class,'indexGodCmt')]//div[@class='main-text']/text()")
            if item['hot_comment']:
                item['hot_comment'] = re.sub(r'\n', '', item['hot_comment'][0])
            if len(item['hot_comment']) == 0:
                item['hot_comment'] = 'not found'
            self.save_csv(item)
        print("success")
        return next_url

    # Save one record to a CSV file
    def save_csv(self, item):
        # On the first save, write a header row with the field names
        if self.start_save:
            self.start_save = False
            with open('./duanzi.csv', 'a', encoding='utf-8') as f:
                f.write("content,user_id,user_img,hot_comment")
                f.write('\n')
        print(item)
        # Join the values into a comma-separated line
        # (see the csv.writer sketch after this program for a quoting-safe variant)
        list_com = [i for i in item.values()]
        content = ','.join(list_com)
        # Append the line to the file
        with open('./duanzi.csv', 'a', encoding='utf-8') as f:
            f.write(content)
            f.write('\n')

    def run(self):

        next_url = self.start_url
        while next_url:
            # Request the URL and get the parsed response
            resp = self.parse_url(next_url)
            # Extract the fields, get the next page's URL,
            # and loop around to request the next page
            next_url = self.filter_argu(resp)




if __name__ == '__main__':
    spider = QiuShiBai()
    spider.run()
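A quick note on the extraction step: etree.HTML() parses the raw HTML string into an element tree whose nodes can be queried with XPath. Queries starting with // search from the document root, while the ./ and .// prefixes used in filter_argu are relative to the node they are called on. A minimal, self-contained illustration (the HTML snippet below is made up, not taken from the real page):

from lxml import etree

html = etree.HTML("""
<div class="col1 old-style-col1">
  <div><div class="content"><span>first post</span></div></div>
  <div><div class="content"><span>second post</span></div></div>
</div>""")

# absolute query: every post node under the listing div
posts = html.xpath("//div[@class='col1 old-style-col1']/div")
for post in posts:
    # relative query: search only inside the current post node
    print(post.xpath(".//div[@class='content']/span[1]/text()")[0])
# -> first post
# -> second post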

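One caveat about save_csv: joining the values with bare commas only works as long as no field itself contains a comma, and the post text often does, which shifts the columns. A minimal sketch of a quoting-safe replacement using the standard-library csv module (same fields and file name as above; how it is wired into the class is an assumption):

import csv

def save_csv(self, item):
    # newline='' avoids blank lines on Windows; csv.writer quotes any field
    # that itself contains a comma or a line break
    with open('./duanzi.csv', 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if self.start_save:
            self.start_save = False
            writer.writerow(["content", "user_id", "user_img", "hot_comment"])
        writer.writerow([item['content'], item['user_id'],
                         item['user_img'], item['hot_comment']])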
Multi-threaded version:

from lxml import etree
import requests
import re
from threading import Thread
from queue import Queue  # queues are used for communication between threads

class QiuShiBai(object):

    def __init__(self):
        self.start_save = True

        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
            'Cookie': """_xsrf=2|9eddf422|645cfaf9eb5a803c30cc80e1d1615c5d|1601094123; BAIDU_SSP_lcr=https://www.baidu.com/link?url=jq06Qc3HHKSLv7Hwapz8ijVO_TwbO0x1RwK6HbLb-b6NU5F7uwvxLeMCmI3rpUWC&wd=&eqid=e41515e3000d2fbd000000035f6ec1e7; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1601094125; gr_user_id=afa35f5a-04c9-4b03-ae07-3ac3f807b811; _qqq_uuid_="2|1:0|10:1601094124|10:_qqq_uuid_|56:NmUwMjVmNWYzMjVkNjNjNGE0M2RlYjIzOWEyMmZlMjAxMjVmZTQ1OQ==|4347d1f32342ae3ef17c9eb338787ce49f582101342c1f5c82e729518208e2ac'; _ga=GA1.2.1225827497.1601094125; _gid=GA1.2.1087809495.1601094125; grwng_uid=5b71ba19-07ac-4dd6-8a86-05f77385daa4; __cur_art_index=4001; Hm_lvt_743362d4b71e22775786fbc54283175c=1601094221; Hm_lpvt_743362d4b71e22775786fbc54283175c=1601094890; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1601098839; ff2672c245bd193c6261e9ab2cd35865_gr_session_id=5020a836-ce51-4d9f-8cf0-8ec57b966aa1; _gat=1; ff2672c245bd193c6261e9ab2cd35865_gr_session_id_5020a836-ce51-4d9f-8cf0-8ec57b966aa1=true"""}

        self.temp_url = "https://www.qiushibaike.com/text/?page="
        self.item_que = Queue()        # holds every extracted item
        self.next_url_que = Queue()    # holds every URL that still needs to be requested
        self.content_to_que = Queue()  # holds each response parsed into an XPath object

    # Fetch pages taken from the URL queue
    def parse_url(self):
        while True:
            # Take a URL off the queue
            url = self.next_url_que.get()
            resp = requests.get(url, headers=self.headers)
            resp = resp.content.decode()
            # Put the parsed response on the content queue
            self.content_to_que.put(etree.HTML(resp))
            # task_done() signals the queue that one fetched item has been fully handled,
            # so that next_url_que.join() can eventually unblock
            self.next_url_que.task_done()

    # Extract the fields from each parsed page
    def filter_argu(self):
        while True:
            res = self.content_to_que.get()

            # Extract the list of post nodes
            li_list = res.xpath("//div[@class='col1 old-style-col1']/div")
            # Walk the list and pull out the fields we need
            for li in li_list:
                # Build a fresh dict per post; reusing one dict would let the saving
                # thread read values that a later iteration has already overwritten
                item = {}
                item['content'] = li.xpath(".//div[@class='content']/span[1]/text()")[0]
                item['content'] = re.sub(r'<br>|\n', '', item['content'])
                item['user_id'] = li.xpath(".//div[contains(@class,'author')]/a[2]/h2/text()")[0]
                item['user_id'] = re.sub(r'\n', '', item['user_id'])

                item['user_img'] = "https:" + li.xpath(".//div[contains(@class,'author')]/a[1]/img/@src")[0]

                item['hot_comment'] = li.xpath("./a[contains(@class,'indexGodCmt')]//div[@class='main-text']/text()")
                if item['hot_comment']:
                    item['hot_comment'] = re.sub(r'\n', '', item['hot_comment'][0])
                if len(item['hot_comment']) == 0:
                    item['hot_comment'] = 'not found'

                self.item_que.put(item)
            self.content_to_que.task_done()

    # Save records to a CSV file
    def save_csv(self):
        while True:
            item = self.item_que.get()
            # On the first save, write a header row with the field names
            if self.start_save:
                self.start_save = False
                with open('./duanzi2.csv', 'a', encoding='utf-8') as f:
                    f.write("content,user_id,user_img,hot_comment")
                    f.write('\n')
            print(item)
            # Join the values into a comma-separated line
            list_com = [i for i in item.values()]
            content = ','.join(list_com)
            # Append the line to the file
            with open('./duanzi2.csv', 'a', encoding='utf-8') as f:
                f.write(content)
                f.write('\n')
            self.item_que.task_done()

    def get_list_url(self):
        # Put every URL that needs to be requested on the queue
        for i in range(1, 14):
            self.next_url_que.put(self.temp_url + str(i))

    def run(self):
        # List of worker threads
        thread_list = []
        # Create the threads and add them to the list
        t_url_list = Thread(target=self.get_list_url)
        thread_list.append(t_url_list)
        # Ten threads for fetching pages
        for i in range(10):
            t_parse = Thread(target=self.parse_url)
            thread_list.append(t_parse)

        t_filter = Thread(target=self.filter_argu)
        thread_list.append(t_filter)

        t_save = Thread(target=self.save_csv)
        thread_list.append(t_save)

        # Start every thread in the list.
        # join(): when the main thread A creates a child thread B and calls B.join(),
        # A blocks at that point until B has finished before continuing.
        # Daemon threads: when B is marked as a daemon, the process exits as soon as the
        # main thread A ends, without waiting for B to finish.
        # Note: the daemon flag must be set before start() is called.
        # (See the standalone sketch after this program for the pattern in isolation.)
        for t in thread_list:
            t.daemon = True  # t.setDaemon(True) is the older, now deprecated spelling
            t.start()

        for q in [self.item_que, self.next_url_que, self.content_to_que]:
            # Block the main thread until every task put on each queue has been marked done
            q.join()


if __name__ == '__main__':
    spider = QiuShiBai()
    spider.run()
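The shutdown pattern in run() is worth seeing in isolation: daemon worker threads loop forever on queue.get()/task_done(), the main thread blocks on queue.join() until every item has been processed, and when the main thread then exits, the daemon loops die with it. A minimal, self-contained sketch of that pattern (the names work_que and worker are illustrative, not part of the spider):

from queue import Queue
from threading import Thread

work_que = Queue()

def worker():
    while True:
        n = work_que.get()        # block until an item is available
        print(f"processed {n}")
        work_que.task_done()      # one get() balanced by one task_done()

# Daemon threads die automatically when the main thread exits,
# so the endless `while True` loop above never blocks shutdown.
for _ in range(3):
    t = Thread(target=worker)
    t.daemon = True               # must be set before start()
    t.start()

for n in range(10):
    work_que.put(n)

# join() blocks the main thread until every put() item has been task_done()
work_que.join()
print("all work finished")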

Result:

[Screenshot: the scraped records saved as a formatted CSV file]