Processes vs. threads vs. coroutines
A process can contain multiple threads, and a thread can run multiple coroutines.
Processes can run in parallel on multiple CPU cores, at the cost of high memory overhead; they suit CPU-bound work.
Threads in CPython run concurrently on a single CPU: because of the GIL, only one thread executes Python bytecode at a time. They suit IO-bound work; the costs are per-thread memory and context-switch overhead.
Coroutines have the smallest memory overhead and also suit IO-bound work; the downsides are that they require library support (async-aware code) and the code is more complex.
Processes: multiprocessing
- The Process class
from multiprocessing import Process

def f(name):
    print('hello', name)

if __name__ == '__main__':
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()  # make the main process wait for the child to finish
- Queue and Pool (process pool)
Queue
from multiprocessing import Process, Queue

def f(q):
    q.put([42, None, 'hello'])

if __name__ == '__main__':
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    print(q.get())  # prints "[42, None, 'hello']"
    p.join()
Process pool
from multiprocessing import Pool
import time

def f(x):
    return x * x

if __name__ == '__main__':
    with Pool(processes=4) as pool:          # start 4 worker processes
        result = pool.apply_async(f, (10,))  # evaluate "f(10)" asynchronously in a single process
        print(result.get(timeout=1))         # prints "100" unless your computer is *very* slow

        print(pool.map(f, range(10)))        # prints "[0, 1, 4, ..., 81]"

        it = pool.imap(f, range(10))
        print(next(it))                      # prints "0"
        print(next(it))                      # prints "1"
        print(it.next(timeout=1))            # prints "4" unless your computer is *very* slow

        result = pool.apply_async(time.sleep, (10,))
        print(result.get(timeout=1))         # raises multiprocessing.TimeoutError
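The concurrent.futures module (used for thread pools below) offers the same interface for processes; a minimal sketch, reusing the same squaring function f:
from concurrent.futures import ProcessPoolExecutor

def f(x):
    return x * x

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=4) as pool:
        print(list(pool.map(f, range(10))))  # [0, 1, 4, ..., 81]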
Threads: threading
- Creating threads
Passing a target function
import threading

def run(n):
    print("task", n)

t1 = threading.Thread(target=run, args=("t1",))
t1.start()
t1.join()  # join() makes the main thread wait for this child thread
Subclassing Thread
import threading

class MyThread(threading.Thread):
    def __init__(self, n):
        super(MyThread, self).__init__()  # must call the parent __init__ when subclassing Thread
        self.n = n

    def run(self):
        print("task", self.n)

if __name__ == "__main__":
    t1 = MyThread("t1")
    t1.start()
- setDaemon and join
Marking a thread as a daemon with setDaemon(True) (in modern code, daemon=True) means the child thread's completion no longer matters to the main thread: when the main thread exits, daemon threads are terminated with it.
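A minimal sketch of the daemon behavior (the worker function is hypothetical):
import threading
import time

def worker():
    time.sleep(2)
    print("worker done")  # never printed: the daemon dies when the main thread exits

t = threading.Thread(target=worker, daemon=True)  # daemon=True is the modern spelling of setDaemon(True)
t.start()
print("main exits immediately")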
Calling join(timeout) on a child thread blocks the main thread until that child finishes (or the timeout expires).
- Lock
lock = threading.Lock()

# Method 1: context manager
with lock:
    ...  # code that mutates shared state

# Method 2: explicit acquire/release
lock.acquire()
try:
    ...  # do something
finally:
    lock.release()
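Why the lock matters: without it, concurrent read-modify-write on shared state can interleave. A minimal sketch (the counter and thread count are illustrative):
import threading

counter = 0
lock = threading.Lock()

def add(n):
    global counter
    for _ in range(n):
        with lock:  # remove the lock and the final count may fall short
            counter += 1

threads = [threading.Thread(target=add, args=(100000,)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)  # 200000 with the lock; possibly less without it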
- Thread pools
from concurrent.futures import ThreadPoolExecutor, as_completed
# craw and urls are defined in the crawler example later in these notes

with ThreadPoolExecutor() as pool:  # Usage 1: map preserves input order
    results = pool.map(craw, urls)
    for result in results:
        print(result)

with ThreadPoolExecutor() as pool:  # Usage 2: submit returns futures
    futures = [pool.submit(craw, url) for url in urls]
    for future in futures:  # iterate in submission order
        print(future.result())
    for future in as_completed(futures):  # results arrive in completion order, not submission order
        print(future.result())
Coroutines: asyncio
Awaiting a coroutine. The following snippet waits 1 second and prints "hello", then waits another 2 seconds and prints "world":
import asyncio
import time

async def say_after(delay, what):
    await asyncio.sleep(delay)
    print(what)

async def main():
    print(f"started at {time.strftime('%X')}")
    await say_after(1, 'hello')
    await say_after(2, 'world')
    print(f"finished at {time.strftime('%X')}")

asyncio.run(main())
The asyncio.create_task() function runs multiple coroutines concurrently as asyncio Tasks.
Rewritten with tasks, main() finishes in about 2 seconds instead of 3:
async def main():
    task1 = asyncio.create_task(
        say_after(1, 'hello'))
    task2 = asyncio.create_task(
        say_after(2, 'world'))

    print(f"started at {time.strftime('%X')}")
    await task1
    await task2
    print(f"finished at {time.strftime('%X')}")
The subprocess module
- The legacy os interfaces
import os

a = os.system("ipconfig //all")  # output goes straight to the screen; returns 0 on success, non-zero on failure; Chinese text may be garbled
print(a)
res = os.popen("ipconfig //all").read()  # output is captured and read back with read(); Chinese text displays correctly
print(res)
- subprocess
import subprocess

subprocess.run("ipconfig /all")  # output goes to the screen; returns a CompletedProcess object
subprocess.call("ipconfig /all")  # output goes to the screen; returns the exit status, 0 or non-zero
subprocess.getstatusoutput("ipconfig /all")  # returns a tuple: (exit status, output string)
subprocess.getoutput("ipconfig /all")  # returns just the output string; Chinese text displays correctly
res = subprocess.Popen("ipconfig /all", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print(res.stderr.read())  # b'' here: subprocess.PIPE is a pipe connected to the child's stream
print(res.stdout.read())
res.wait()  # wait for completion; poll() checks the return code, terminate() kills the process
res.stderr.close()
res.stdout.close()
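To capture output without garbling Chinese text, subprocess.run can decode with an explicit encoding; a minimal sketch, assuming a Chinese-locale Windows console (GBK code page):
import subprocess

res = subprocess.run("ipconfig /all", shell=True, capture_output=True, encoding="gbk")  # the gbk encoding is an assumption
print(res.returncode)  # 0 on success
print(res.stdout)      # decoded text, not bytes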
Crawling pages with multiple threads for a roughly 10x speedup
Source file 1: blog_spider.py
import requests

# NOTE: the original notes reference a module-level urls list whose definition is not shown;
# the list below is an assumption (the parser later targets cnblogs post titles)
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]

def craw(url):
    r = requests.get(url)
    print(url, len(r.text))

if __name__ == '__main__':
    craw(urls[0])  # guard the demo call so it doesn't run on import
Source file 2: multi_thread_craw.py
import time
import threading
import blog_spider

def single_thread():
    for url in blog_spider.urls:
        blog_spider.craw(url)

def multi_thread():
    threads = []
    for url in blog_spider.urls:
        threads.append(threading.Thread(target=blog_spider.craw, args=(url,)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

if __name__ == '__main__':
    start = time.time()
    # single_thread()
    multi_thread()
    end = time.time()
    print("elapsed", end - start, "seconds")
A producer-consumer spider in Python
Source file 1: blog_spider.py
import requests
from bs4 import BeautifulSoup

# as above, the urls list is assumed; the original definition is not shown
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]

def craw(url):
    r = requests.get(url)
    return r.text

def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all('a', class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]

if __name__ == '__main__':
    for result in parse(craw(urls[2])):
        print(result)
Source file 2: producer_consumer_spider.py
import queue
import threading
import blog_spider

def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:  # loops forever; see the shutdown sketch below
        url = url_queue.get()
        html = blog_spider.craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, url_queue.qsize())

def do_parse(html_queue: queue.Queue, fout):
    while True:
        html = html_queue.get()
        results = blog_spider.parse(html)
        for result in results:
            fout.write(str(result) + '\n')
        print(threading.current_thread().name, html_queue.qsize())

if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)
    for idx in range(3):  # 3 producer threads
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}")
        t.start()
    fout = open("result.txt", 'w')
    for idx in range(2):  # 2 consumer threads
        t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}")
        t.start()
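As written, the workers block on get() forever once the queues drain, so the process never exits. One way to terminate cleanly (an addition, not part of the original) is daemon workers plus Queue.task_done()/join(); a sketch:
import queue
import threading
import blog_spider

def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:
        url = url_queue.get()
        html_queue.put(blog_spider.craw(url))
        url_queue.task_done()  # pairs with url_queue.join() below

def do_parse(html_queue: queue.Queue, fout):
    while True:
        html = html_queue.get()
        for result in blog_spider.parse(html):
            fout.write(str(result) + '\n')
        html_queue.task_done()

if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)
    fout = open("result.txt", 'w')
    for idx in range(3):
        threading.Thread(target=do_craw, args=(url_queue, html_queue), daemon=True).start()
    for idx in range(2):
        threading.Thread(target=do_parse, args=(html_queue, fout), daemon=True).start()
    url_queue.join()   # wait until every url has been fetched
    html_queue.join()  # wait until every page has been parsed
    fout.close()       # daemon workers are killed when the main thread exits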