from lxml import etree
import requests
import re
class QiuShiBai(object):
    def __init__(self):
        self.start_save = True
        # Request headers
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
            'Cookie': """_xsrf=2|9eddf422|645cfaf9eb5a803c30cc80e1d1615c5d|1601094123; BAIDU_SSP_lcr=https://www.baidu.com/link?url=jq06Qc3HHKSLv7Hwapz8ijVO_TwbO0x1RwK6HbLb-b6NU5F7uwvxLeMCmI3rpUWC&wd=&eqid=e41515e3000d2fbd000000035f6ec1e7; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1601094125; gr_user_id=afa35f5a-04c9-4b03-ae07-3ac3f807b811; _qqq_uuid_="2|1:0|10:1601094124|10:_qqq_uuid_|56:NmUwMjVmNWYzMjVkNjNjNGE0M2RlYjIzOWEyMmZlMjAxMjVmZTQ1OQ==|4347d1f32342ae3ef17c9eb338787ce49f582101342c1f5c82e729518208e2ac'; _ga=GA1.2.1225827497.1601094125; _gid=GA1.2.1087809495.1601094125; grwng_uid=5b71ba19-07ac-4dd6-8a86-05f77385daa4; __cur_art_index=4001; Hm_lvt_743362d4b71e22775786fbc54283175c=1601094221; Hm_lpvt_743362d4b71e22775786fbc54283175c=1601094890; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1601098839; ff2672c245bd193c6261e9ab2cd35865_gr_session_id=5020a836-ce51-4d9f-8cf0-8ec57b966aa1; _gat=1; ff2672c245bd193c6261e9ab2cd35865_gr_session_id_5020a836-ce51-4d9f-8cf0-8ec57b966aa1=true"""}
        # Starting URL
        self.start_url = "https://www.qiushibaike.com/text/?page=1"
    # Fetch a page
    def parse_url(self, url):
        resp = requests.get(url, headers=self.headers)
        resp = resp.content.decode()
        # print(resp)
        # Parse the response into an lxml element tree so fields can be extracted with XPath
        return etree.HTML(resp)
    # Extract the data and the next-page URL
    def filter_argu(self, res):
        # li_list = res.xpath("//div[@class='col1 old-style-col1']//div[@class='content']/span[1]/text()")
        # Extract the next-page URL
        next_url = 'https://www.qiushibaike.com' + res.xpath("//span[@class='next']/../@href")[0] if res.xpath("//span[@class='next']/../@href") else None
        # Extract the list of posts
        li_list = res.xpath("//div[@class='col1 old-style-col1']/div")
        item = {}
        # Walk the list and pull out the fields we need
        for li in li_list:
            item['content'] = li.xpath(".//div[@class='content']/span[1]/text()")[0]
            item['content'] = re.sub(r'<br>|\n', '', item['content'])
            item['user_id'] = li.xpath(".//div[contains(@class,'author')]/a[2]/h2/text()")[0]
            item['user_id'] = re.sub(r'\n', '', item['user_id'])
            item['user_img'] = "https:" + li.xpath(".//div[contains(@class,'author')]/a[1]/img/@src")[0]
            item['hot_comment'] = li.xpath("./a[contains(@class,'indexGodCmt')]//div[@class='main-text']/text()")
            if item['hot_comment']:
                item['hot_comment'] = re.sub(r'\n', '', item['hot_comment'][0])
            if len(item['hot_comment']) == 0:
                item['hot_comment'] = 'not found'
            self.save_csv(item)
            print("success")
        return next_url
    # Save one item to the CSV file
    def save_csv(self, item):
        # On the first save, write the header row first
        if self.start_save:
            self.start_save = False
            with open('./duanzi.csv', 'a', encoding='utf-8') as f:
                f.write("content,user_id,user_img,hot_comment")
                f.write('\n')
        print(item)
        # Join the field values into a single comma-separated line
        list_com = [i for i in item.values()]
        content = ','.join(list_com)
        # Append the row
        with open('./duanzi.csv', 'a', encoding='utf-8') as f:
            f.write(content)
            f.write('\n')
    def run(self):
        next_url = self.start_url
        while next_url:
            # Request the page and get the parsed response
            resp = self.parse_url(next_url)
            # Extract the data and the next-page URL, then move on to the next page
            next_url = self.filter_argu(resp)

if __name__ == '__main__':
    spider = QiuShiBai()
    spider.run()
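A note on the CSV output: save_csv joins the field values with plain commas, so a joke whose text itself contains a comma (or a stray newline) will produce a misaligned row. Below is a minimal alternative sketch using the standard csv module, which quotes such fields automatically; the function name save_csv_quoted and the write_header flag are illustrative, not part of the original code.

import csv

def save_csv_quoted(item, path='./duanzi.csv', write_header=False):
    # csv.writer quotes fields that contain commas or newlines,
    # so the row layout stays intact no matter what the joke text contains
    with open(path, 'a', encoding='utf-8', newline='') as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow(['content', 'user_id', 'user_img', 'hot_comment'])
        writer.writerow([item['content'], item['user_id'], item['user_img'], item['hot_comment']])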
Multithreaded version:
from lxml import etree
import requests
import re
from threading import Thread
from queue import Queue  # queues are used for communication between threads
class QiuShiBai(object):
    def __init__(self):
        self.start_save = True
        self.headers = {
            "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.135 Safari/537.36",
            'Cookie': """_xsrf=2|9eddf422|645cfaf9eb5a803c30cc80e1d1615c5d|1601094123; BAIDU_SSP_lcr=https://www.baidu.com/link?url=jq06Qc3HHKSLv7Hwapz8ijVO_TwbO0x1RwK6HbLb-b6NU5F7uwvxLeMCmI3rpUWC&wd=&eqid=e41515e3000d2fbd000000035f6ec1e7; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1601094125; gr_user_id=afa35f5a-04c9-4b03-ae07-3ac3f807b811; _qqq_uuid_="2|1:0|10:1601094124|10:_qqq_uuid_|56:NmUwMjVmNWYzMjVkNjNjNGE0M2RlYjIzOWEyMmZlMjAxMjVmZTQ1OQ==|4347d1f32342ae3ef17c9eb338787ce49f582101342c1f5c82e729518208e2ac'; _ga=GA1.2.1225827497.1601094125; _gid=GA1.2.1087809495.1601094125; grwng_uid=5b71ba19-07ac-4dd6-8a86-05f77385daa4; __cur_art_index=4001; Hm_lvt_743362d4b71e22775786fbc54283175c=1601094221; Hm_lpvt_743362d4b71e22775786fbc54283175c=1601094890; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1601098839; ff2672c245bd193c6261e9ab2cd35865_gr_session_id=5020a836-ce51-4d9f-8cf0-8ec57b966aa1; _gat=1; ff2672c245bd193c6261e9ab2cd35865_gr_session_id_5020a836-ce51-4d9f-8cf0-8ec57b966aa1=true"""}
        self.temp_url = "https://www.qiushibaike.com/text/?page="
        self.item_que = Queue()  # holds every extracted item
        self.next_url_que = Queue()  # holds every URL that still needs to be requested
        self.content_to_que = Queue()  # holds every parsed (etree.HTML) response
    # Fetch the pages
    def parse_url(self):
        while True:
            # Take a URL from the queue
            url = self.next_url_que.get()
            resp = requests.get(url, headers=self.headers)
            resp = resp.content.decode()
            # Put the parsed response on the next queue
            self.content_to_que.put(etree.HTML(resp))
            # Queue.task_done() tells the queue that one piece of work taken with get() has been completed
            self.next_url_que.task_done()
    # Extract the data from each parsed response
    def filter_argu(self):
        while True:
            res = self.content_to_que.get()
            # Extract the list of posts
            li_list = res.xpath("//div[@class='col1 old-style-col1']/div")
            # Walk the list and pull out the fields we need
            for li in li_list:
                # Build a fresh dict per post so items already queued are not overwritten later
                item = {}
                item['content'] = li.xpath(".//div[@class='content']/span[1]/text()")[0]
                item['content'] = re.sub(r'<br>|\n', '', item['content'])
                item['user_id'] = li.xpath(".//div[contains(@class,'author')]/a[2]/h2/text()")[0]
                item['user_id'] = re.sub(r'\n', '', item['user_id'])
                item['user_img'] = "https:" + li.xpath(".//div[contains(@class,'author')]/a[1]/img/@src")[0]
                item['hot_comment'] = li.xpath("./a[contains(@class,'indexGodCmt')]//div[@class='main-text']/text()")
                if item['hot_comment']:
                    item['hot_comment'] = re.sub(r'\n', '', item['hot_comment'][0])
                if len(item['hot_comment']) == 0:
                    item['hot_comment'] = 'not found'
                self.item_que.put(item)
            self.content_to_que.task_done()
    # Save items to the CSV file
    def save_csv(self):
        while True:
            item = self.item_que.get()
            # On the first save, write the header row first
            if self.start_save:
                self.start_save = False
                with open('./duanzi2.csv', 'a', encoding='utf-8') as f:
                    f.write("content,user_id,user_img,hot_comment")
                    f.write('\n')
            print(item)
            # Join the field values into a single comma-separated line
            list_com = [i for i in item.values()]
            content = ','.join(list_com)
            # Append the row
            with open('./duanzi2.csv', 'a', encoding='utf-8') as f:
                f.write(content)
                f.write('\n')
            self.item_que.task_done()
    def get_list_url(self):
        # Put every list-page URL that needs to be requested on the queue
        for i in range(1, 14):
            self.next_url_que.put(self.temp_url + str(i))
    def run(self):
        # Thread list
        thread_list = []
        # Create the threads and add them to the list
        t_url_list = Thread(target=self.get_list_url)
        thread_list.append(t_url_list)
        # Start 10 threads for requesting pages
        for i in range(10):
            t_parse = Thread(target=self.parse_url)
            thread_list.append(t_parse)
        t_filter = Thread(target=self.filter_argu)
        thread_list.append(t_filter)
        t_save = Thread(target=self.save_csv)
        thread_list.append(t_save)
        # Start every thread in the list
        for t in thread_list:
            """join(): if the main thread A creates a child thread B and calls B.join(), A blocks at that call
            and only continues once B has finished.
            daemon: if child thread B is marked as a daemon thread, the process does not wait for B;
            when the main thread exits, B is killed along with it.
            Note: the daemon flag must be set before start() is called."""
            t.daemon = True
            t.start()
        for q in [self.item_que, self.next_url_que, self.content_to_que]:
            q.join()  # block the main thread until every task in each queue has been marked done

if __name__ == '__main__':
    spider = QiuShiBai()
    spider.run()
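The shutdown logic above hinges on three things working together: every worker runs an infinite while True loop, so it is made a daemon thread and dies with the main thread; each get() from a queue is paired with a task_done() call; and q.join() in the main thread only returns once task_done() has been called for every item that was put() on that queue. Here is a minimal, self-contained sketch of that producer-consumer pattern; the names worker and work_que are illustrative and not taken from the post.

from threading import Thread
from queue import Queue

work_que = Queue()

def worker():
    while True:
        n = work_que.get()
        print(n * n)          # stand-in for the real work
        work_que.task_done()  # mark one get() as fully processed

# Producer: enqueue the work items
for i in range(5):
    work_que.put(i)

t = Thread(target=worker)
t.daemon = True   # the worker loops forever, so it must not keep the process alive
t.start()

work_que.join()   # returns once task_done() has been called for every put()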
Result screenshot: