Processes vs. threads vs. coroutines
A process can contain multiple threads, and a thread can run multiple coroutines.
Processes can run in parallel on multiple CPU cores, at the cost of high memory overhead; they suit CPU-bound work.
Threads in CPython run concurrently on a single CPU: because of the GIL, only one thread executes Python bytecode at a time. They suit IO-bound work; the costs are per-thread memory and context-switch overhead.
Coroutines have the smallest memory overhead and also suit IO-bound work; the downsides are that they require library support (async-aware code) and the code is more complex.
Processes: multiprocessing
- The Process class
from multiprocessing import Process

def f(name):
    print('hello', name)

if __name__ == '__main__':
    p = Process(target=f, args=('bob',))
    p.start()
    p.join()  # make the main process wait for the child to finish
- Queue and Pool (process pool)
Queue
from multiprocessing import Process, Queue

def f(q):
    q.put([42, None, 'hello'])

if __name__ == '__main__':
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    print(q.get())  # prints "[42, None, 'hello']"
    p.join()
Process pool
from multiprocessing import Pool
import time

def f(x):
    return x * x

if __name__ == '__main__':
    with Pool(processes=4) as pool:          # start 4 worker processes
        result = pool.apply_async(f, (10,))  # evaluate "f(10)" asynchronously in a single process
        print(result.get(timeout=1))         # prints "100" unless your computer is *very* slow

        print(pool.map(f, range(10)))        # prints "[0, 1, 4, ..., 81]"

        it = pool.imap(f, range(10))
        print(next(it))                      # prints "0"
        print(next(it))                      # prints "1"
        print(it.next(timeout=1))            # prints "4" unless your computer is *very* slow

        result = pool.apply_async(time.sleep, (10,))
        print(result.get(timeout=1))         # raises multiprocessing.TimeoutError
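The concurrent.futures module (used for thread pools below) offers the same interface for processes; a minimal sketch, reusing the same squaring function f:
from concurrent.futures import ProcessPoolExecutor

def f(x):
    return x * x

if __name__ == '__main__':
    with ProcessPoolExecutor(max_workers=4) as pool:
        print(list(pool.map(f, range(10))))  # [0, 1, 4, ..., 81]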
Threads: threading
- Creating threads
Passing a target function
import threading

def run(n):
    print("task", n)

t1 = threading.Thread(target=run, args=("t1",))
t1.start()
t1.join()  # join() makes the main thread wait for this child thread
Subclassing Thread
import threading

class MyThread(threading.Thread):
    def __init__(self, n):
        super(MyThread, self).__init__()  # must call the parent __init__ when subclassing Thread
        self.n = n

    def run(self):
        print("task", self.n)

if __name__ == "__main__":
    t1 = MyThread("t1")
    t1.start()
- setDaemon and join
Marking a thread as a daemon with setDaemon(True) (in modern code, daemon=True) means the child thread's completion no longer matters to the main thread: when the main thread exits, daemon threads are terminated with it.
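A minimal sketch of the daemon behavior (the worker function is hypothetical):
import threading
import time

def worker():
    time.sleep(2)
    print("worker done")  # never printed: the daemon dies when the main thread exits

t = threading.Thread(target=worker, daemon=True)  # daemon=True is the modern spelling of setDaemon(True)
t.start()
print("main exits immediately")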
Calling join(timeout) on a child thread blocks the main thread until that child finishes (or the timeout expires).
- Lock
lock = threading.Lock()

# Method 1: context manager
with lock:
    ...  # code that mutates shared state

# Method 2: explicit acquire/release
lock.acquire()
try:
    ...  # do something
finally:
    lock.release()
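Why the lock matters: without it, concurrent read-modify-write on shared state can interleave. A minimal sketch (the counter and thread count are illustrative):
import threading

counter = 0
lock = threading.Lock()

def add(n):
    global counter
    for _ in range(n):
        with lock:  # remove the lock and the final count may fall short
            counter += 1

threads = [threading.Thread(target=add, args=(100000,)) for _ in range(2)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)  # 200000 with the lock; possibly less without it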
- Thread pools
from concurrent.futures import ThreadPoolExecutor, as_completed
# craw and urls are defined in the crawler example later in these notes

with ThreadPoolExecutor() as pool:  # Usage 1: map preserves input order
    results = pool.map(craw, urls)
    for result in results:
        print(result)

with ThreadPoolExecutor() as pool:  # Usage 2: submit returns futures
    futures = [pool.submit(craw, url) for url in urls]
    for future in futures:  # iterate in submission order
        print(future.result())
    for future in as_completed(futures):  # results arrive in completion order, not submission order
        print(future.result())
Coroutines: asyncio
Awaiting a coroutine. The following snippet waits 1 second and prints "hello", then waits another 2 seconds and prints "world":
import asyncio
import time

async def say_after(delay, what):
    await asyncio.sleep(delay)
    print(what)

async def main():
    print(f"started at {time.strftime('%X')}")
    await say_after(1, 'hello')
    await say_after(2, 'world')
    print(f"finished at {time.strftime('%X')}")

asyncio.run(main())
The asyncio.create_task() function runs multiple coroutines concurrently as asyncio Tasks.
Rewritten with tasks, main() finishes in about 2 seconds instead of 3:
async def main():
    task1 = asyncio.create_task(
        say_after(1, 'hello'))
    task2 = asyncio.create_task(
        say_after(2, 'world'))

    print(f"started at {time.strftime('%X')}")
    await task1
    await task2
    print(f"finished at {time.strftime('%X')}")
The subprocess module
- The legacy os interfaces
import os

a = os.system("ipconfig //all")  # output goes straight to the screen; returns 0 on success, non-zero on failure; Chinese text may be garbled
print(a)
res = os.popen("ipconfig //all").read()  # output is captured and read back with read(); Chinese text displays correctly
print(res)
- subprocess
import subprocess

subprocess.run("ipconfig /all")  # output goes to the screen; returns a CompletedProcess object
subprocess.call("ipconfig /all")  # output goes to the screen; returns the exit status, 0 or non-zero
subprocess.getstatusoutput("ipconfig /all")  # returns a tuple: (exit status, output string)
subprocess.getoutput("ipconfig /all")  # returns just the output string; Chinese text displays correctly
res = subprocess.Popen("ipconfig /all", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print(res.stderr.read())  # b'' here: subprocess.PIPE is a pipe connected to the child's stream
print(res.stdout.read())
res.wait()  # wait for completion; poll() checks the return code, terminate() kills the process
res.stderr.close()
res.stdout.close()
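To capture output without garbling Chinese text, subprocess.run can decode with an explicit encoding; a minimal sketch, assuming a Chinese-locale Windows console (GBK code page):
import subprocess

res = subprocess.run("ipconfig /all", shell=True, capture_output=True, encoding="gbk")  # the gbk encoding is an assumption
print(res.returncode)  # 0 on success
print(res.stdout)      # decoded text, not bytes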
Crawling pages with multiple threads for a roughly 10x speedup
Source file 1: blog_spider.py
import requests

# NOTE: the original notes reference a module-level urls list whose definition is not shown;
# the list below is an assumption (the parser later targets cnblogs post titles)
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]

def craw(url):
    r = requests.get(url)
    print(url, len(r.text))

if __name__ == '__main__':
    craw(urls[0])  # guard the demo call so it doesn't run on import
Source file 2: multi_thread_craw.py
import time
import threading
import blog_spider

def single_thread():
    for url in blog_spider.urls:
        blog_spider.craw(url)

def multi_thread():
    threads = []
    for url in blog_spider.urls:
        threads.append(threading.Thread(target=blog_spider.craw, args=(url,)))
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

if __name__ == '__main__':
    start = time.time()
    # single_thread()
    multi_thread()
    end = time.time()
    print("elapsed", end - start, "seconds")
A producer-consumer spider in Python
Source file 1: blog_spider.py
import requests
from bs4 import BeautifulSoup

# as above, the urls list is assumed; the original definition is not shown
urls = [f"https://www.cnblogs.com/#p{page}" for page in range(1, 51)]

def craw(url):
    r = requests.get(url)
    return r.text

def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all('a', class_="post-item-title")
    return [(link["href"], link.get_text()) for link in links]

if __name__ == '__main__':
    for result in parse(craw(urls[2])):
        print(result)
Source file 2: producer_consumer_spider.py
import queue
import threading
import blog_spider

def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:  # loops forever; see the shutdown sketch below
        url = url_queue.get()
        html = blog_spider.craw(url)
        html_queue.put(html)
        print(threading.current_thread().name, url_queue.qsize())

def do_parse(html_queue: queue.Queue, fout):
    while True:
        html = html_queue.get()
        results = blog_spider.parse(html)
        for result in results:
            fout.write(str(result) + '\n')
        print(threading.current_thread().name, html_queue.qsize())

if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)
    for idx in range(3):  # 3 producer threads
        t = threading.Thread(target=do_craw, args=(url_queue, html_queue), name=f"craw{idx}")
        t.start()
    fout = open("result.txt", 'w')
    for idx in range(2):  # 2 consumer threads
        t = threading.Thread(target=do_parse, args=(html_queue, fout), name=f"parse{idx}")
        t.start()
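As written, the workers block on get() forever once the queues drain, so the process never exits. One way to terminate cleanly (an addition, not part of the original) is daemon workers plus Queue.task_done()/join(); a sketch:
import queue
import threading
import blog_spider

def do_craw(url_queue: queue.Queue, html_queue: queue.Queue):
    while True:
        url = url_queue.get()
        html_queue.put(blog_spider.craw(url))
        url_queue.task_done()  # pairs with url_queue.join() below

def do_parse(html_queue: queue.Queue, fout):
    while True:
        html = html_queue.get()
        for result in blog_spider.parse(html):
            fout.write(str(result) + '\n')
        html_queue.task_done()

if __name__ == '__main__':
    url_queue = queue.Queue()
    html_queue = queue.Queue()
    for url in blog_spider.urls:
        url_queue.put(url)
    fout = open("result.txt", 'w')
    for idx in range(3):
        threading.Thread(target=do_craw, args=(url_queue, html_queue), daemon=True).start()
    for idx in range(2):
        threading.Thread(target=do_parse, args=(html_queue, fout), daemon=True).start()
    url_queue.join()   # wait until every url has been fetched
    html_queue.join()  # wait until every page has been parsed
    fout.close()       # daemon workers are killed when the main thread exits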