Processes vs. threads vs. coroutines

A process can contain multiple threads, and a thread can run multiple coroutines.
Processes run in parallel across multiple CPU cores, at the cost of the largest memory overhead; they suit CPU-bound work.
Threads run concurrently on a single CPU: in CPython only one thread executes Python bytecode at a time because of the GIL. They suit I/O-bound work, with moderate memory overhead and some thread-switching cost.
Coroutines have the smallest memory overhead and also suit I/O-bound work; the drawbacks are that libraries must support async and the code is more complex. A sketch contrasting threads and processes on CPU-bound work follows.
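A minimal sketch of the contrast (the busy-loop workload and the pool sizes are illustrative, not from the original note): the same CPU-bound function gains nothing from threads under CPython's GIL, but finishes roughly 4x faster with processes on a 4-core machine.

import time
from multiprocessing import Pool
from concurrent.futures import ThreadPoolExecutor

def count_down(n):
    while n > 0:
        n -= 1

if __name__ == '__main__':
    start = time.time()
    with ThreadPoolExecutor(max_workers=4) as ex:  # threads: the GIL serializes the work
        list(ex.map(count_down, [10_000_000] * 4))
    print("threads:", time.time() - start)

    start = time.time()
    with Pool(processes=4) as pool:                # processes: true parallelism across cores
        pool.map(count_down, [10_000_000] * 4)
    print("processes:", time.time() - start)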

Processes: multiprocessing

  • The Process class
from multiprocessing import Process

def f(name):
    print('hello', name)

if __name__ == '__main__':
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()  # make the main process wait for the child to finish
  • Queue() and Pool (a process pool)
    Queue
from multiprocessing import Process, Queue

def f(q):
    q.put([42, None, 'hello'])

if __name__ == '__main__':
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    print(q.get())    # prints "[42, None, 'hello']"
    p.join()

Process pool

from multiprocessing import Pool
import time

def f(x):
    return x*x

if __name__ == '__main__':
    with Pool(processes=4) as pool:         # start 4 worker processes
        result = pool.apply_async(f, (10,)) # evaluate "f(10)" asynchronously in a single process
        print(result.get(timeout=1))        # prints "100" unless your computer is *very* slow

        print(pool.map(f, range(10)))       # prints "[0, 1, 4,..., 81]"

        it = pool.imap(f, range(10))
        print(next(it))                     # prints "0"
        print(next(it))                     # prints "1"
        print(it.next(timeout=1))           # prints "4" unless your computer is *very* slow

        result = pool.apply_async(time.sleep, (10,))
        print(result.get(timeout=1))        # raises multiprocessing.TimeoutError
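For newer code, concurrent.futures offers ProcessPoolExecutor, which wraps a process pool in the same Future interface as the ThreadPoolExecutor used in the thread-pool section below; a minimal sketch:

from concurrent.futures import ProcessPoolExecutor

def f(x):
    return x * x

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=4) as pool:
        print(list(pool.map(f, range(10))))  # [0, 1, 4, ..., 81]
        future = pool.submit(f, 10)          # schedule one call; returns a Future
        print(future.result(timeout=1))      # 100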

Threads: threading

  • Creating threads

Basic creation with a target function

import threading

def run(n):
    print("task", n)

t1 = threading.Thread(target=run, args=("t1",))
t1.start()
t1.join()  # join() makes the main thread wait for this child thread to finish

Creation by subclassing

import threading

class MyThread(threading.Thread):
    def __init__(self, n):
        super(MyThread, self).__init__()  # the parent constructor must be called when overriding run()
        self.n = n

    def run(self):
        print("task", self.n)

if __name__ == "__main__":
    t1 = MyThread("t1")
    t1.start()
  • Daemon threads and join
    Marking a thread as a daemon (daemon=True; the older setDaemon(True) is deprecated) means the child thread's completion no longer matters: when the main thread exits, any remaining daemon threads are killed, as the sketch below shows.
    Calling join(timeout) is the opposite: the main thread must wait for the child thread, blocking while the child runs (or until the timeout expires).
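A small sketch of the daemon behavior (daemon=True in the constructor is the modern spelling; setDaemon(True) is deprecated):

import threading
import time

def worker():
    time.sleep(2)
    print("worker done")  # never printed: the daemon is killed when the main thread exits

t = threading.Thread(target=worker, daemon=True)
t.start()
print("main exits first")
# adding t.join() here would instead block until "worker done" is printed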
  • Locks (a sketch of the race they prevent follows the snippet)
import threading

lock = threading.Lock()

# Option 1: context manager, releases automatically even on exceptions
with lock:
    ...  # code that mutates shared state

# Option 2: explicit acquire/release
lock.acquire()
try:
    ...  # do something
finally:
    lock.release()
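Why the lock matters: a sketch (the thread count and iteration count are illustrative) in which four threads increment a shared counter; without the lock the final count can come up short, because counter += 1 is not atomic.

import threading

counter = 0
lock = threading.Lock()

def add(n):
    global counter
    for _ in range(n):
        with lock:  # remove this and the result becomes unpredictable
            counter += 1

threads = [threading.Thread(target=add, args=(100_000,)) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)  # 400000 with the lock; possibly less without it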
  • Thread pool
from concurrent.futures import ThreadPoolExecutor, as_completed
# craw and urls here refer to the definitions in blog_spider.py below

with ThreadPoolExecutor() as pool:  # usage 1: map returns results in input order
    results = pool.map(craw, urls)
    for result in results:
        print(result)

with ThreadPoolExecutor() as pool:  # usage 2: submit returns a Future per task
    futures = [pool.submit(craw, url) for url in urls]
    for future in futures:  # iterates in submission order
        print(future.result())
    for future in as_completed(futures):  # yields futures in completion order, which varies run to run
        print(future.result())

Coroutines: asyncio

Awaiting a coroutine. The following snippet waits 1 second and prints "hello", then waits another 2 seconds and prints "world":

import asyncio
import time

async def say_after(delay, what):
    await asyncio.sleep(delay)
    print(what)

async def main():
    print(f"started at {time.strftime('%X')}")

    await say_after(1, 'hello')
    await say_after(2, 'world')

    print(f"finished at {time.strftime('%X')}")

asyncio.run(main())

The asyncio.create_task() function runs coroutines concurrently as asyncio Tasks.
With main() rewritten as follows, the total run time drops to about 2 seconds:

async def main():
    task1 = asyncio.create_task(
        say_after(1, 'hello'))

    task2 = asyncio.create_task(
        say_after(2, 'world'))

    print(f"started at {time.strftime('%X')}")

    await task1
    await task2

    print(f"finished at {time.strftime('%X')}")

The subprocess module

  • The older os module
import os

a = os.system("ipconfig //all")  # output goes straight to the screen; returns 0 on success, nonzero on failure; Chinese text may come out garbled
print(a)
res = os.popen("ipconfig //all").read()  # output is captured and read back with read(); Chinese text displays correctly
print(res)
  • subprocess
subprocess.run("ipconfig /all")  # 结果输出到屏幕,返回一个对象
subprocess.call("ipconfig /all")  # 命令的结果输出到屏幕,返回执行状态,0或者非0
subprocess.getstatusoutput("ipconfig /all")  # 返回元组,元组第一项,0或者非0,第二项为执行结果
subprocess.getoutput("ipconfig /all")  # 返回执行结果,汉字正常显示
res = subprocess.Popen("ipconfig /all", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print(res.stderr.read())  # 返回是b'',subprocess.PIPE,这个又是啥呢?原来这个是一个管道
print(res.stdout.read())
res.wait()  # 等待完成,poll() 返回执行结果,terminate() 结束进程
res.stderr.close()
res.stdout.close()
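subprocess.run() can also capture and decode the output itself, avoiding the raw pipe handling above; a sketch (encoding="gbk" is an assumption for a Chinese-locale Windows console):

import subprocess

# capture both streams and let run() decode them to text
res = subprocess.run("ipconfig /all", capture_output=True, encoding="gbk")
print(res.returncode)  # 0 on success
print(res.stdout)      # decoded text, so Chinese output displays correctly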

Crawling pages with multiple threads for a roughly 10x speedup

Source file 1: blog_spider.py

import requests

# hypothetical page list: the original notes crawl cnblogs paging URLs
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]

def craw(url):
    r = requests.get(url)
    print(url, len(r.text))

if __name__ == '__main__':  # avoid firing a request when this module is imported
    craw(urls[0])

Source file 2: multi_thread_craw.py

import time
import blog_spider
import threading

def single_thread():
    for url in blog_spider.urls:
        blog_spider.craw(url)

def multi_thread():
    threads = []
    for url in blog_spider.urls:
        threads.append(threading.Thread(target=blog_spider.craw, args=(url,)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

if __name__ == '__main__':
    start = time.time()
    # single_thread()
    multi_thread()
    end = time.time()
    print("用时", end-start, '秒')

A producer-consumer crawler in Python

Source file 1: blog_spider.py

import requests
from bs4 import BeautifulSoup

# hypothetical page list: the parse() selector below matches cnblogs post titles
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]

def craw(url):
    r = requests.get(url)
    return r.text

def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all('a', class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links ]

if __name__ == '__main__':
    for result in parse(craw(urls[2])):
        print(result)

Source file 2: producer_consumer_spider.py

import queue
import blog_spider
import threading

def do_craw(url_queue:queue.Queue, html_queue:queue.Queue):
    while True:
        url = url_queue.get()
        html = blog_spider.craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, url_queue.qsize())

def do_parse(html_queue:queue.Queue, fout):
    while True:
        html = html_queue.get()
        results = blog_spider.parse(html)
        for result in results:
            fout.write(str(result) + '\n')
        print(threading.current_thread().name, html_queue.qsize())

if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)
    for idx in range(3):   # 3 producer threads
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}")
        t.start()
    fout = open("result.txt", 'w')
    for idx in range(2):  # 2 consumer threads
        t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}")
        t.start()
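As written, both worker loops block forever in get() once the queues drain, so the program never exits and result.txt is never closed. A conventional fix is to push one sentinel per consumer after the producers finish; a sketch of the idea, not a drop-in replacement (SENTINEL and the shutdown sequence are assumptions, not part of the original):

import queue
import blog_spider

SENTINEL = object()  # hypothetical end-of-work marker

def do_parse(html_queue: queue.Queue, fout):
    while True:
        html = html_queue.get()
        if html is SENTINEL:  # stop cleanly instead of blocking forever
            break
        for result in blog_spider.parse(html):
            fout.write(str(result) + '\n')

# after join()ing the producer threads, push one sentinel per consumer:
#     for _ in range(2):
#         html_queue.put(SENTINEL)
# then join() the consumer threads and close fout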