How multithreading works

(Figure: multithreading schematic)

Queue (the queue object)

queue is part of Python's standard library, so it can be imported directly with from queue import Queue; a queue is the most common way for threads to exchange data.
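
As a minimal illustration of two threads exchanging data through a queue (the producer/consumer names and the None sentinel are made up for this sketch):

import threading
from queue import Queue

q = Queue()

def producer():
    for i in range(5):
        q.put(i)          # hand data to the consumer thread
    q.put(None)           # sentinel: tell the consumer to stop

def consumer():
    while True:
        item = q.get()    # blocks until the producer puts something
        if item is None:
            break
        print('consumed', item)

t1 = threading.Thread(target=producer)
t2 = threading.Thread(target=consumer)
t1.start()
t2.start()
t1.join()
t2.join()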

Thoughts on multithreading in Python

Locking shared resources is an essential step. Queue is thread-safe, so whenever a queue fits the problem it is the recommended choice.
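
A minimal sketch of why locking matters for a shared resource (the counter and thread count here are arbitrary): without the lock, the read-modify-write of counter can interleave across threads and lose updates.

import threading

counter = 0
lock = threading.Lock()

def increment():
    global counter
    for _ in range(100000):
        with lock:        # serialize the read-modify-write
            counter += 1

threads = [threading.Thread(target=increment) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)  # 400000 with the lock; usually less without it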

Create a "queue" object

pageQueue = Queue(10)

Put values into the queue

for page in range(1, 11):
    pageQueue.put(page)

Take a value out of the queue

pageQueue.get()

The Queue class

Queue is thread-safe
    queue is part of Python's standard library and can be imported directly with from queue import Queue; a queue is the most common way for threads to exchange data
    Create a "queue" object
    Common queue methods (exercised in the sketch after this list)
        put()
        get(block)
        empty()
        full()
        qsize()
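
A quick sketch exercising the methods listed above (the values are arbitrary):

from queue import Queue, Empty

q = Queue(3)              # a "queue" object with maxsize 3
print(q.empty())          # True: nothing queued yet
q.put('a')
q.put('b')
q.put('c')
print(q.full())           # True: maxsize reached
print(q.qsize())          # 3 (only approximate when threads run concurrently)
print(q.get())            # 'a': FIFO order
q.get(block=True)         # 'b': blocking get (the default)
try:
    q.get(block=False)    # 'c': non-blocking get...
    q.get(block=False)    # ...raises Empty once nothing is left
except Empty:
    print('queue is empty')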

Queue lock and thread lock

import threading
from queue import Queue, Empty

dataQueue = Queue(100)
exitFlag = False

class MyThread(threading.Thread):
    def __init__(self, q):
        super().__init__()
        self.queue = q

    def run(self):
        super().run()
        global exitFlag
        while True:
            if exitFlag:
                print('++++++++++++++++++++++++++exit')
                break
            try:
                # Non-blocking get raises queue.Empty when nothing is queued
                print('------------------------', self.queue.get(False))
                self.queue.task_done()
            except Empty:
                pass

def main():
    for i in range(100):
        dataQueue.put(i)

    threads = []
    for i in range(5):
        thread = MyThread(dataQueue)
        threads.append(thread)
        thread.start()
    # Queue lock: uncomment to block here until every item
    # has been matched by a task_done() call
    # dataQueue.join()
    global exitFlag
    exitFlag = True
    print('exit ------------------------------------------------')
    # Thread lock: wait for each worker thread to finish
    for t in threads:
        t.join()

if __name__ == '__main__':
    main()
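
Restoring the commented-out dataQueue.join() above makes main() block until every item has been matched by a task_done() call, so the workers drain the whole queue before exitFlag is set. A minimal, self-contained sketch of the two kinds of "lock" (the names q, worker, and done are made up):

import threading
from queue import Queue, Empty

q = Queue()
done = False

def worker():
    while not done:
        try:
            item = q.get(block=False)
        except Empty:
            continue
        print('processed', item)
        q.task_done()         # pairs with q.join() below

for i in range(10):
    q.put(i)

t = threading.Thread(target=worker)
t.start()

q.join()      # "queue lock": returns once every item got a task_done()
done = True   # only now may the worker leave its loop
t.join()      # "thread lock": wait for the thread itself to exit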

Another example: crawling the dushu.com book site

import threading
from queue import Queue, Empty
from threading import Lock

import requests
from bs4 import BeautifulSoup

url = 'https://www.dushu.com/book/1175_%d.html'
task_queue = Queue(100)
parse_queue = Queue(100)
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Cookie': 'Hm_lvt_8008bbd51b8bc504162e1a61c3741a9d=1572418328; Hm_lpvt_8008bbd51b8bc504162e1a61c3741a9d=1572418390',
    'Host': 'www.dushu.com',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36',
}
# Flag that tells the parse threads to exit
exit_flag = False

# Works like a small pool of crawler threads
class CrawlThread(threading.Thread):
    def __init__(self, q_task: Queue, q_parse: Queue) -> None:
        super().__init__()
        self.q_task = q_task
        self.q_parse = q_parse

    def run(self) -> None:
        super().run()
        self.spider()

    # Keep working until the task queue is drained
    def spider(self):
        while True:
            try:
                # Non-blocking get avoids a race between empty() and get()
                taskId = self.q_task.get(block=False)
            except Empty:
                print('+++++++ crawler thread %s finished +++++++'
                      % threading.current_thread().name)
                break
            response = requests.get(url % taskId, headers=headers)
            response.encoding = 'utf-8'
            html = response.text
            self.q_parse.put((html, taskId))
            self.q_task.task_done()
            print('------ crawler thread: %s ----- finished task: %d -------'
                  % (threading.current_thread().name, taskId))

# Dedicated to crawling
def crawl():
    for i in range(1, 101):
        task_queue.put(i)
    for i in range(5):
        t = CrawlThread(task_queue, parse_queue)
        t.start()

class ParseThread(threading.Thread):
    def __init__(self, q_parse: Queue, lock: Lock, fp):
        super().__init__()
        self.q_parse = q_parse
        self.lock = lock
        self.fp = fp

    def run(self):
        super().run()
        self.parse()

    def parse(self):
        while True:
            if exit_flag:
                print('----------- parse thread: %s done, exiting ------------'
                      % threading.current_thread().name)
                break
            try:
                html, taskId = self.q_parse.get(block=False)
            except Empty:
                # Nothing to parse yet; check exit_flag again
                continue
            try:
                soup = BeautifulSoup(html, 'lxml')
                books = soup.select('div[class="bookslist"] > ul > li')
                print('----------------', len(books))
                for book in books:
                    book_url = book.find('img').attrs['src']
                    book_title = book.select('h3 a')[0]['title']
                    book_author = book.select('p')[0].get_text()
                    book_describe = book.select('p')[1].get_text()
                    # The lock serializes writes so lines from different
                    # threads do not interleave in the output file
                    self.lock.acquire()
                    self.fp.write('%s\t%s\t%s\t%s\n'
                                  % (book_url, book_title, book_author, book_describe))
                    self.lock.release()
            finally:
                # Always mark the item done so parse_queue.join() can return
                self.q_parse.task_done()
            print('********** parse thread: %s finished parsing page %d ***********'
                  % (threading.current_thread().name, taskId))

# Dedicated to parsing the pages and saving the results
def parse(fp):
    lock = Lock()
    for i in range(5):
        t = ParseThread(parse_queue, lock, fp)
        t.start()

if __name__ == '__main__':
    crawl()
    fp = open('./book.txt', 'a', encoding='utf-8')
    parse(fp)
    # Queue join: every queued task must finish before the lines below run
    task_queue.join()
    parse_queue.join()
    fp.close()
    exit_flag = True
    print('Execution reaches this point!!!!!!!!!!!!!!')

Multithreaded implementation (a minimal skeleton follows this outline)
    Book site: http://www.qwsy.com/shuku.aspx?&page=1
    Import the packages
    Define the variables
    Create and start the crawler threads
        Crawler thread
    Create and start the parse threads
        Parse thread
            Queue.get(block = True/False)
    join() to lock on the threads and make sure they have all finished
    End the task
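
Following the outline, here is a minimal skeleton for the qwsy.com crawl; the page range, thread counts, queue sizes, and the parsing step are assumptions, so the site's real structure must be checked before relying on it.

import threading
from queue import Queue, Empty

import requests

URL = 'http://www.qwsy.com/shuku.aspx?&page=%d'
task_queue = Queue(50)     # page numbers waiting to be fetched
parse_queue = Queue(50)    # raw HTML waiting to be parsed
exit_flag = False          # tells the parse threads to stop

def crawl_worker():
    # Crawler thread: fetch pages until the task queue is drained
    while True:
        try:
            page = task_queue.get(block=False)
        except Empty:
            break                       # queue drained: this crawler is done
        html = requests.get(URL % page).text
        parse_queue.put((page, html))
        task_queue.task_done()

def parse_worker():
    # Parse thread: consume HTML until exit_flag is set
    while not exit_flag:
        try:
            page, html = parse_queue.get(block=False)   # Queue.get(block=False)
        except Empty:
            continue
        # ... parse `html` here; the selectors depend on the real page ...
        parse_queue.task_done()

if __name__ == '__main__':
    for page in range(1, 11):           # assumed page range
        task_queue.put(page)
    for _ in range(5):
        threading.Thread(target=crawl_worker).start()
        threading.Thread(target=parse_worker).start()
    task_queue.join()                   # all pages fetched
    parse_queue.join()                  # all pages parsed
    exit_flag = True                    # end the task: let the parsers exit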