How multithreading works

(Figure: multithreading schematic)

Queue (the queue object)

queue is part of Python's standard library, so it can be imported directly with from queue import Queue; a queue is the most common way for threads to exchange data.
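
As a minimal illustration of two threads exchanging data through a queue (the producer/consumer names and the None sentinel are made up for this sketch):

import threading
from queue import Queue

q = Queue()

def producer():
    for i in range(5):
        q.put(i)          # hand data to the consumer thread
    q.put(None)           # sentinel: tell the consumer to stop

def consumer():
    while True:
        item = q.get()    # blocks until the producer puts something
        if item is None:
            break
        print('consumed', item)

t1 = threading.Thread(target=producer)
t2 = threading.Thread(target=consumer)
t1.start()
t2.start()
t1.join()
t2.join()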

Thoughts on multithreading in Python

Locking shared resources is an essential step. Queue is thread-safe, so whenever a queue fits the problem it is the recommended choice.
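
A minimal sketch of why locking matters for a shared resource (the counter and thread count here are arbitrary): without the lock, the read-modify-write of counter can interleave across threads and lose updates.

import threading

counter = 0
lock = threading.Lock()

def increment():
    global counter
    for _ in range(100000):
        with lock:        # serialize the read-modify-write
            counter += 1

threads = [threading.Thread(target=increment) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)  # 400000 with the lock; usually less without it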

Create a "queue" object

pageQueue = Queue(10)

Put values into the queue

for page in range(1, 11):
    pageQueue.put(page)

Take a value out of the queue

pageQueue.get()

The Queue class

Queue is thread-safe
    queue is part of Python's standard library and can be imported directly with from queue import Queue; a queue is the most common way for threads to exchange data
    Create a "queue" object
    Common queue methods (exercised in the sketch after this list)
        put()
        get(block)
        empty()
        full()
        qsize()
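
A quick sketch exercising the methods listed above (the values are arbitrary):

from queue import Queue, Empty

q = Queue(3)              # a "queue" object with maxsize 3
print(q.empty())          # True: nothing queued yet
q.put('a')
q.put('b')
q.put('c')
print(q.full())           # True: maxsize reached
print(q.qsize())          # 3 (only approximate when threads run concurrently)
print(q.get())            # 'a': FIFO order
q.get(block=True)         # 'b': blocking get (the default)
try:
    q.get(block=False)    # 'c': non-blocking get...
    q.get(block=False)    # ...raises Empty once nothing is left
except Empty:
    print('queue is empty')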

Queue lock and thread lock

import threading
from queue import Queue, Empty

dataQueue = Queue(100)
exitFlag = False

class MyThread(threading.Thread):
    def __init__(self, q):
        super().__init__()
        self.queue = q

    def run(self):
        super().run()
        global exitFlag
        while True:
            if exitFlag:
                print('++++++++++++++++++++++++++exit')
                break
            try:
                # Non-blocking get raises queue.Empty when nothing is queued
                print('------------------------', self.queue.get(False))
                self.queue.task_done()
            except Empty:
                pass

def main():
    for i in range(100):
        dataQueue.put(i)

    threads = []
    for i in range(5):
        thread = MyThread(dataQueue)
        threads.append(thread)
        thread.start()
    # Queue lock: uncomment to block here until every item
    # has been matched by a task_done() call
    # dataQueue.join()
    global exitFlag
    exitFlag = True
    print('exit ------------------------------------------------')
    # Thread lock: wait for each worker thread to finish
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()
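
Restoring the commented-out dataQueue.join() above makes main() block until every item has been matched by a task_done() call, so the workers drain the whole queue before exitFlag is set. A minimal, self-contained sketch of the two kinds of "lock" (the names q, worker, and done are made up):

import threading
from queue import Queue, Empty

q = Queue()
done = False

def worker():
    while not done:
        try:
            item = q.get(block=False)
        except Empty:
            continue
        print('processed', item)
        q.task_done()         # pairs with q.join() below

for i in range(10):
    q.put(i)

t = threading.Thread(target=worker)
t.start()

q.join()      # "queue lock": returns once every item got a task_done()
done = True   # only now may the worker leave its loop
t.join()      # "thread lock": wait for the thread itself to exit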

Another example: crawling the dushu.com book site

import threading
from queue import Queue, Empty
from threading import Lock

import requests
from bs4 import BeautifulSoup

url = 'https://www.dushu.com/book/1175_%d.html'
task_queue = Queue(100)
parse_queue = Queue(100)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1572418328; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1572418390',
    'Host': 'www.dushu.com',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
# Flag that tells the parse threads to exit
exit_flag = False

# Works like a small pool of crawler threads
class CrawlThread(threading.Thread):
    def __init__(self, q_task: Queue, q_parse: Queue) -> None:
        super().__init__()
        self.q_task = q_task
        self.q_parse = q_parse

    def run(self) -> None:
        super().run()
        self.spider()

    # Keep working until the task queue is drained
    def spider(self):
        while True:
            try:
                # Non-blocking get avoids a race between empty() and get()
                taskId = self.q_task.get(block=False)
            except Empty:
                print('+++++++ crawler thread %s finished +++++++'
                      % threading.current_thread().name)
                break
            response = requests.get(url % taskId, headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            self.q_parse.put((html, taskId))
            self.q_task.task_done()
            print('------ crawler thread: %s ----- finished task: %d -------'
                  % (threading.current_thread().name, taskId))

# Dedicated to crawling
def crawl():
    for i in range(1, 101):
        task_queue.put(i)
    for i in range(5):
        t = CrawlThread(task_queue, parse_queue)
        t.start()

class ParseThread(threading.Thread):
    def __init__(self, q_parse: Queue, lock: Lock, fp):
        super().__init__()
        self.q_parse = q_parse
        self.lock = lock
        self.fp = fp

    def run(self):
        super().run()
        self.parse()

    def parse(self):
        while True:
            if exit_flag:
                print('----------- parse thread: %s done, exiting ------------'
                      % threading.current_thread().name)
                break
            try:
                html, taskId = self.q_parse.get(block=False)
            except Empty:
                # Nothing to parse yet; check exit_flag again
                continue
            try:
                soup = BeautifulSoup(html, 'lxml')
                books = soup.select('div[class="bookslist"] > ul > li')
                print('----------------', len(books))
                for book in books:
                    book_url = book.find('img').attrs['src']
                    book_title = book.select('h3 a')[0]['title']
                    book_author = book.select('p')[0].get_text()
                    book_describe = book.select('p')[1].get_text()
                    # The lock serializes writes so lines from different
                    # threads do not interleave in the output file
                    self.lock.acquire()
                    self.fp.write('%s\t%s\t%s\t%s\n'
                                  % (book_url, book_title, book_author, book_describe))
                    self.lock.release()
            finally:
                # Always mark the item done so parse_queue.join() can return
                self.q_parse.task_done()
            print('********** parse thread: %s finished parsing page %d ***********'
                  % (threading.current_thread().name, taskId))

# Dedicated to parsing the pages and saving the results
def parse(fp):
    lock = Lock()
    for i in range(5):
        t = ParseThread(parse_queue, lock, fp)
        t.start()

if __name__ == '__main__':
    crawl()
    fp = open('./book.txt', 'a', encoding='utf-8')
    parse(fp)
    # Queue join: every queued task must finish before the lines below run
    task_queue.join()
    parse_queue.join()
    fp.close()
    exit_flag = True
    print('Execution reaches this point!!!!!!!!!!!!!!')

Multithreaded implementation (a minimal skeleton follows this outline)
    Book site: http://www.qwsy.com/shuku.aspx?&page=1
    Import the packages
    Define the variables
    Create and start the crawler threads
        Crawler thread
    Create and start the parse threads
        Parse thread
            Queue.get(block = True/False)
    join() to lock on the threads and make sure they have all finished
    End the task
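
Following the outline, here is a minimal skeleton for the qwsy.com crawl; the page range, thread counts, queue sizes, and the parsing step are assumptions, so the site's real structure must be checked before relying on it.

import threading
from queue import Queue, Empty

import requests

URL = 'http://www.qwsy.com/shuku.aspx?&page=%d'
task_queue = Queue(50)     # page numbers waiting to be fetched
parse_queue = Queue(50)    # raw HTML waiting to be parsed
exit_flag = False          # tells the parse threads to stop

def crawl_worker():
    # Crawler thread: fetch pages until the task queue is drained
    while True:
        try:
            page = task_queue.get(block=False)
        except Empty:
            break                       # queue drained: this crawler is done
        html = requests.get(URL % page).text
        parse_queue.put((page, html))
        task_queue.task_done()

def parse_worker():
    # Parse thread: consume HTML until exit_flag is set
    while not exit_flag:
        try:
            page, html = parse_queue.get(block=False)   # Queue.get(block=False)
        except Empty:
            continue
        # ... parse `html` here; the selectors depend on the real page ...
        parse_queue.task_done()

if __name__ == '__main__':
    for page in range(1, 11):           # assumed page range
        task_queue.put(page)
    for _ in range(5):
        threading.Thread(target=crawl_worker).start()
        threading.Thread(target=parse_worker).start()
    task_queue.join()                   # all pages fetched
    parse_queue.join()                  # all pages parsed
    exit_flag = True                    # end the task: let the parsers exit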