前言
当需要批量下载图片时,如果用 requests 库逐个同步下载,速度会比较慢:一旦某个请求发生阻塞,后面的请求都会被卡住,程序看起来就像假死了一样。当然,使用多线程也能提高效率。
这里介绍如何使用 aiohttp 异步批量下载图片。
异步批量下载图片
话不多说,直接看代码
import aiohttp
import asyncio
from pathlib import Path
async def down_img(session, url):
    """Download one image and save it under ``./down_img/``.

    Args:
        session: An open ``aiohttp.ClientSession`` used to send the request.
        url: Image URL; the text after the last ``/`` is used as the file name.

    Returns:
        The URL as a string, once the file has been written.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status, so that an error page is never saved as an image.
    """
    name = url.split('/')[-1]  # file name = last path segment of the URL
    # ``async with`` hands the connection back to the pool even if reading
    # the body fails part-way through.
    async with session.get(url) as img:
        img.raise_for_status()
        # Awaiting yields to the event loop until the whole body has arrived.
        content = await img.read()
    # The target directory must already exist; this function does not create it.
    with open('./down_img/' + str(name), 'wb') as f:
        f.write(content)
    print(f'{name} 下载完成!')
    return str(url)
async def main(URL):
    """Download every image in ``URL`` concurrently and print the results.

    All downloads share one ``aiohttp.ClientSession``. The printed list keeps
    the same order as ``URL`` and an empty ``URL`` simply prints an empty list.

    Args:
        URL: Iterable of image URLs to download.

    Raises:
        Exception: The first failure (in ``URL`` order) raised by a download,
            re-raised only after every download has finished.
    """
    async with aiohttp.ClientSession() as session:
        # gather() keeps input order and accepts an empty argument list;
        # return_exceptions=True lets every download run to completion
        # before any failure is reported.
        outcomes = await asyncio.gather(
            *(down_img(session, img_url) for img_url in URL),
            return_exceptions=True,
        )
    for outcome in outcomes:
        if isinstance(outcome, BaseException):
            raise outcome
    all_results = list(outcomes)
    print("ALL RESULT:" + str(all_results))
# Image URLs fetched when this file is run as a script.
URL = [
    'https://cdn.pixabay.com/photo/2014/10/07/13/48/mountain-477832_960_720.jpg',
    'https://cdn.pixabay.com/photo/2013/07/18/10/56/railroad-163518_960_720.jpg',
    'https://cdn.pixabay.com/photo/2018/03/12/20/07/maldives-3220702_960_720.jpg',
    'https://cdn.pixabay.com/photo/2017/08/04/17/56/dolomites-2580866_960_720.jpg',
    'https://cdn.pixabay.com/photo/2016/06/20/03/15/pier-1467984_960_720.jpg',
    'https://cdn.pixabay.com/photo/2014/07/30/02/00/iceberg-404966_960_720.jpg',
    'https://cdn.pixabay.com/photo/2014/11/02/10/41/plane-513641_960_720.jpg',
    'https://cdn.pixabay.com/photo/2015/10/30/20/13/sea-1014710_960_720.jpg',
]

# Directory the downloaded images are written to.
fp = Path('./down_img')

if __name__ == '__main__':
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    fp.mkdir(exist_ok=True)
    # asyncio.run() creates a fresh event loop and closes it afterwards;
    # calling get_event_loop() without a running loop is deprecated.
    asyncio.run(main(URL))
运行结果
mountain-477832_960_720.jpg 下载完成!
railroad-163518_960_720.jpg 下载完成!
maldives-3220702_960_720.jpg 下载完成!
dolomites-2580866_960_720.jpg 下载完成!
pier-1467984_960_720.jpg 下载完成!
plane-513641_960_720.jpg 下载完成!
iceberg-404966_960_720.jpg 下载完成!
sea-1014710_960_720.jpg 下载完成!
ALL RESULT:['https://cdn.pixabay.com/photo/2014/07/30/02/00/iceberg-404966_960_720.jpg', 'https://cdn.pixabay.com/photo/2018/03/12/20/07/maldives-3220702_960_720.jpg', 'https://cdn.pixabay.com/photo/2014/10/07/13/48/mountain-477832_960_720.jpg', 'https://cdn.pixabay.com/photo/2014/11/02/10/41/plane-513641_960_720.jpg', 'https://cdn.pixabay.com/photo/2017/08/04/17/56/dolomites-2580866_960_720.jpg', 'https://cdn.pixabay.com/photo/2013/07/18/10/56/railroad-163518_960_720.jpg', 'https://cdn.pixabay.com/photo/2015/10/30/20/13/sea-1014710_960_720.jpg', 'https://cdn.pixabay.com/photo/2016/06/20/03/15/pier-1467984_960_720.jpg']
Semaphore控制并发
上面的代码把 8 个 url 一次性全部加入并发任务。当 url 数量很多时,我们通常希望限制同时进行的下载数量,这时可以用 Semaphore(信号量)来控制并发。
semaphore = asyncio.Semaphore(2) # 限制并发量为2
优化后的代码
import aiohttp
import asyncio
from pathlib import Path
async def down_img(session, url, semaphore):
    """Download one image under a concurrency limit and save it to ``./down_img/``.

    Args:
        session: An open ``aiohttp.ClientSession`` used to send the request.
        url: Image URL; the text after the last ``/`` is used as the file name.
        semaphore: ``asyncio.Semaphore`` shared by all downloads; one slot is
            held for the whole download, from request to file write.

    Returns:
        The URL as a string, once the file has been written.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status, so that an error page is never saved as an image.
    """
    async with semaphore:
        name = url.split('/')[-1]  # file name = last path segment of the URL
        # ``async with`` hands the connection back to the pool even if
        # reading the body fails part-way through.
        async with session.get(url) as img:
            img.raise_for_status()
            # Awaiting yields to the event loop until the body has arrived.
            content = await img.read()
        # The target directory must already exist; it is not created here.
        with open('./down_img/' + str(name), 'wb') as f:
            f.write(content)
        print(f'{name} 下载完成!')
        return str(url)
async def main(URL):
    """Download every image in ``URL`` with at most two downloads in flight.

    A single ``asyncio.Semaphore(2)`` is passed to every download. The printed
    list keeps the same order as ``URL`` and an empty ``URL`` prints an empty
    list.

    Args:
        URL: Iterable of image URLs to download.

    Raises:
        Exception: The first failure (in ``URL`` order) raised by a download,
            re-raised only after every download has finished.
    """
    semaphore = asyncio.Semaphore(2)  # cap concurrency at 2
    async with aiohttp.ClientSession() as session:
        # gather() keeps input order and accepts an empty argument list;
        # return_exceptions=True lets every download run to completion
        # before any failure is reported.
        outcomes = await asyncio.gather(
            *(down_img(session, img_url, semaphore) for img_url in URL),
            return_exceptions=True,
        )
    for outcome in outcomes:
        if isinstance(outcome, BaseException):
            raise outcome
    all_results = list(outcomes)
    print("ALL RESULT:" + str(all_results))
# Image URLs fetched when this file is run as a script.
URL = [
    'https://cdn.pixabay.com/photo/2014/10/07/13/48/mountain-477832_960_720.jpg',
    'https://cdn.pixabay.com/photo/2013/07/18/10/56/railroad-163518_960_720.jpg',
    'https://cdn.pixabay.com/photo/2018/03/12/20/07/maldives-3220702_960_720.jpg',
    'https://cdn.pixabay.com/photo/2017/08/04/17/56/dolomites-2580866_960_720.jpg',
    'https://cdn.pixabay.com/photo/2016/06/20/03/15/pier-1467984_960_720.jpg',
    'https://cdn.pixabay.com/photo/2014/07/30/02/00/iceberg-404966_960_720.jpg',
    'https://cdn.pixabay.com/photo/2014/11/02/10/41/plane-513641_960_720.jpg',
    'https://cdn.pixabay.com/photo/2015/10/30/20/13/sea-1014710_960_720.jpg',
]

# Directory the downloaded images are written to.
fp = Path('./down_img')

if __name__ == '__main__':
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    fp.mkdir(exist_ok=True)
    # asyncio.run() creates a fresh event loop and closes it afterwards;
    # calling get_event_loop() without a running loop is deprecated.
    asyncio.run(main(URL))
使用TCPConnector控制并发
TCPConnector 通过 limit 参数限制同时打开的连接总数(默认值为 100),从而控制并发数
conn = aiohttp.TCPConnector(limit=2)
# 建立会话session
async with aiohttp.ClientSession(connector=conn) as session:
完整代码如下
import aiohttp
import asyncio
from pathlib import Path
async def down_img(session, url):
    """Download one image and save it under ``./down_img/``.

    Args:
        session: An open ``aiohttp.ClientSession`` used to send the request.
        url: Image URL; the text after the last ``/`` is used as the file name.

    Returns:
        The URL as a string, once the file has been written.

    Raises:
        aiohttp.ClientResponseError: If the server answers with a 4xx/5xx
            status, so that an error page is never saved as an image.
    """
    name = url.split('/')[-1]  # file name = last path segment of the URL
    # ``async with`` hands the connection back to the pool even if reading
    # the body fails part-way through, freeing the slot for the next request.
    async with session.get(url) as img:
        img.raise_for_status()
        # Awaiting yields to the event loop until the whole body has arrived.
        content = await img.read()
    # The target directory must already exist; this function does not create it.
    with open('./down_img/' + str(name), 'wb') as f:
        f.write(content)
    print(f'{name} 下载完成!')
    return str(url)
async def main(URL):
    """Download every image in ``URL`` through a connector limited to 2 connections.

    The session is built on ``aiohttp.TCPConnector(limit=2)``. The printed list
    keeps the same order as ``URL`` and an empty ``URL`` prints an empty list.

    Args:
        URL: Iterable of image URLs to download.

    Raises:
        Exception: The first failure (in ``URL`` order) raised by a download,
            re-raised only after every download has finished.
    """
    conn = aiohttp.TCPConnector(limit=2)  # at most 2 simultaneous connections
    async with aiohttp.ClientSession(connector=conn) as session:
        # gather() keeps input order and accepts an empty argument list;
        # return_exceptions=True lets every download run to completion
        # before any failure is reported.
        outcomes = await asyncio.gather(
            *(down_img(session, img_url) for img_url in URL),
            return_exceptions=True,
        )
    for outcome in outcomes:
        if isinstance(outcome, BaseException):
            raise outcome
    all_results = list(outcomes)
    print("ALL RESULT:" + str(all_results))
# Image URLs fetched when this file is run as a script.
URL = [
    'https://cdn.pixabay.com/photo/2014/10/07/13/48/mountain-477832_960_720.jpg',
    'https://cdn.pixabay.com/photo/2013/07/18/10/56/railroad-163518_960_720.jpg',
    'https://cdn.pixabay.com/photo/2018/03/12/20/07/maldives-3220702_960_720.jpg',
    'https://cdn.pixabay.com/photo/2017/08/04/17/56/dolomites-2580866_960_720.jpg',
    'https://cdn.pixabay.com/photo/2016/06/20/03/15/pier-1467984_960_720.jpg',
    'https://cdn.pixabay.com/photo/2014/07/30/02/00/iceberg-404966_960_720.jpg',
    'https://cdn.pixabay.com/photo/2014/11/02/10/41/plane-513641_960_720.jpg',
    'https://cdn.pixabay.com/photo/2015/10/30/20/13/sea-1014710_960_720.jpg',
]

# Directory the downloaded images are written to.
fp = Path('./down_img')

if __name__ == '__main__':
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    fp.mkdir(exist_ok=True)
    # asyncio.run() creates a fresh event loop and closes it afterwards;
    # calling get_event_loop() without a running loop is deprecated.
    asyncio.run(main(URL))