Table of Contents
1. Socket communication:
1.1 The OSI model:
2. Coroutines:
2.1 Implementing a coroutine with yield:
2.2 greenlet: switching between multiple tasks inside a single thread
2.3 gevent: a coroutine library built on C extensions
    a. blocking coroutines
    b. non-blocking coroutines (gevent cannot see time.sleep as I/O blocking; add the two monkey-patch lines)
3. A greenlet coroutine crawler
1. Socket communication:
1.1 The OSI model:
How two hosts communicate, layer by layer:
Physical -- Data Link -- Network -- Transport -- Session -- Presentation -- Application
Transport layer -- socket (the network socket API; it supports the client/server (C/S) architecture over TCP or UDP)
MySQL, for example, talks to its clients over TCP.
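To make the C/S idea concrete, here is a minimal TCP echo pair (my own sketch, not from the notes; the loopback address and port 9999 are arbitrary choices):
import socket, threading

srv = socket.socket()                 # TCP by default (AF_INET, SOCK_STREAM)
srv.bind(('127.0.0.1', 9999))         # server side: claim an address and port
srv.listen(1)

def serve_once():
    conn, _ = srv.accept()            # wait for one client to connect
    conn.sendall(conn.recv(1024))     # echo the message back
    conn.close()

threading.Thread(target=serve_once).start()

cli = socket.socket()                 # client side of the C/S pair
cli.connect(('127.0.0.1', 9999))
cli.sendall(b'hello over TCP')
print(cli.recv(1024))                 # b'hello over TCP'
cli.close(); srv.close()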
2. Coroutines:
A way to raise I/O efficiency: concurrency within a single thread.
Advantages:
1) Context switches happen without threads, avoiding pointless OS scheduling and improving performance.
2) No locking or synchronization overhead, since nothing needs atomic operations across coroutines.
3) A simpler programming model.
4) High concurrency and high scalability at low cost.
Disadvantages:
1) Cannot exploit multiple CPU cores.
2) One blocking call stalls the whole program, since every coroutine shares the same thread.
Scheduling units, from heaviest to lightest: process, thread, coroutine.
2.1 Implementing a coroutine with yield:
import time

def func1():
    # A generator: each next() runs it up to the yield, i.e. one "switch"
    while True:
        yield

def func2():
    g = func1()
    for i in range(10000):
        i += 1
        next(g)      # switch into func1 and back

def main():
    start = time.time()
    func2()
    print(time.time() - start)   # total cost of 10000 round-trip switches

if __name__ == '__main__':
    main()
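The timing demo above only measures switch overhead. As a supplementary sketch (my own example, not from the notes), yield together with send can also pass data between two tasks, which is the actual coroutine behaviour of generators:
def consumer():
    total = 0
    while True:
        n = yield total          # receive a value, hand back the running total
        total += n

def producer(c):
    c.send(None)                 # prime the generator to its first yield
    for n in (1, 2, 3):
        print('total =', c.send(n))   # total = 1 / 3 / 6

producer(consumer())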
2.2 greenlet: switching between multiple tasks inside a single thread
import greenlet
# the greenlet class itself
greenlet.greenlet
# creates a greenlet object without running it; run is the callable invoked
# when the greenlet is first switched to
greenlet(run=None, parent=None)
# returns the currently running greenlet, i.e. whoever calls it
greenlet.getcurrent()
# special exception used to kill a greenlet cleanly without affecting its parent greenlets
greenlet.GreenletExit
'dead',       True once the greenlet has finished
'error',      the exception type raised on invalid greenlet operations
'getcurrent', get the currently running greenlet
'gettrace',   get the installed trace callback
'gr_frame',   the frame in which the greenlet is suspended
'parent',     every greenlet has a parent; control falls back to it on exit
'run',        the callable invoked when the greenlet starts
'settrace',   install a trace callback to observe coroutine switches
'switch',     transfer control between coroutines
'throw'       raise an exception inside a coroutine
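A small sketch of settrace, switch, dead and throw together (my own example, not from the notes; settrace needs greenlet >= 0.4):
import greenlet

def trace(event, args):
    # event is 'switch' or 'throw'; args is (origin, target)
    print('trace:', event)

def task():
    print('in task')
    main.switch()                     # hand control back to the main greenlet
    print('task resumed')

greenlet.settrace(trace)
main = greenlet.getcurrent()
g = greenlet.greenlet(task)
g.switch()                            # trace fires: main -> g
g.switch()                            # resume task; on return, control falls
                                      # back to its parent (main)
print('g finished:', g.dead)          # True

g2 = greenlet.greenlet(task)
g2.switch()                           # runs until main.switch() parks it
g2.throw(greenlet.GreenletExit)       # kill g2 quietly; the parent is unaffected
print('g2 killed:', g2.dead)          # True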
Example 1:
import greenlet

def FunA():
    print('FunA step 1')
    g2.switch()            # hand control to g2
    print('FunA step 2')   # resumed by g1.switch() inside FunB
    g2.switch()

def FunB():
    print('FunB step 1')
    g1.switch()            # hand control back to g1
    print('FunB step 2')

if __name__ == '__main__':
    g1 = greenlet.greenlet(FunA)
    g2 = greenlet.greenlet(FunB)
    # output: FunA step 1 / FunB step 1 / FunA step 2 / FunB step 2
    g1.switch()
2.3 gevent: a coroutine library, itself built on C extensions (the greenlet extension plus a libev/libuv event loop)
a. Blocking coroutines
import gevent, time

def eat(name):
    print('%s eat 1' % name)
    time.sleep(2)            # plain time.sleep: gevent cannot switch away from it
    print('%s eat 2' % name)

def study(name):
    print('%s study 1' % name)
    time.sleep(3)
    print('%s study 2' % name)

def main():
    g1 = gevent.spawn(eat, 'Alex')    # spawn returns a Greenlet, not yet run
    g2 = gevent.spawn(study, 'Jill')
    g1.join()                # the sleeps block the whole thread, so the tasks
    g2.join()                # run one after another: ~2s + 3s = 5s in total
    print('main')

if __name__ == '__main__':
    main()
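As an aside before the monkey-patch version below (my own sketch, not from the original notes): calling gevent.sleep directly also yields to the gevent hub, without patching anything:
import gevent

def eat(name):
    print('%s eat 1' % name)
    gevent.sleep(2)                  # cooperative sleep: other greenlets run
    print('%s eat 2' % name)

def study(name):
    print('%s study 1' % name)
    gevent.sleep(3)
    print('%s study 2' % name)

gevent.joinall([gevent.spawn(eat, 'Alex'), gevent.spawn(study, 'Jill')])
# finishes in ~3s (the longer sleep), not 5s (the sum)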
b. Non-blocking: in the example above gevent cannot recognize time.sleep as I/O blocking. To get a non-blocking model, add the two monkey-patch lines shown below (the monkey import and the patch_all() call):
import gevent, time
from gevent import monkey
monkey.patch_all()           # replaces time.sleep, socket, etc. with cooperative versions

def eat(name):
    print('%s eat 1' % name)
    time.sleep(2)            # now effectively gevent.sleep: yields to the hub
    print('%s eat 2' % name)

def study(name):
    print('%s study 1' % name)
    time.sleep(3)
    print('%s study 2' % name)

def main():
    g1 = gevent.spawn(eat, 'Alex')
    g2 = gevent.spawn(study, 'Jill')
    g1.join()
    g2.join()                # the two sleeps overlap, so the total is ~3s
    print('main')

if __name__ == '__main__':
    main()
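A side note of mine (not in the original notes): patch_all() swaps out many stdlib modules at once; gevent.monkey also offers selective patches if you only need a few of them:
from gevent import monkey
monkey.patch_time()      # make time.sleep cooperative
monkey.patch_socket()    # make socket I/O cooperative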
Homework (send to 616971722@qq.com):
Implement a greenlet coroutine crawler modeled on the existing code. The starting point is the plain blocking crawler shown in section 3.1 below.
3. A greenlet coroutine crawler
1) The plain (blocking) crawler:
Uses urllib.parse to split each URL into host and path.
Example: total time ~1.26s
import time, os, socket
from urllib.parse import urlparse

urls = [
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f0932b2-580x413.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1da6e5-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1e7e54-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1e5b2d-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f223258-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f227243-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f24b9c2-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f24c126-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f256263-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f297a87-580x825.jpg!page-800',
]

# The crawler class
class Crawler:
    # Constructor: url is the input, receive_data accumulates the output
    def __init__(self, url):
        self.url = url
        self.receive_data = b''

    # The method that does the actual crawling
    def fetch(self):
        # split the URL into host (netloc) and path
        url = urlparse(self.url)
        # create the socket instance
        self.socket = socket.socket()
        # connect with a blocking call; plain HTTP on port 80 is used here for
        # simplicity (a real HTTPS fetch would need TLS on port 443)
        self.socket.connect((url.netloc, 80))
        print('connected')
        os.makedirs('pic', exist_ok=True)   # folder for the downloaded images
        data = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(url.path, url.netloc)
        # send the request to the server
        self.socket.send(data.encode())
        while True:
            d = self.socket.recv(1024)
            if d:
                # accumulate the raw bytes
                self.receive_data += d
            else:
                break
        print('data received')
        # last path segment minus the 9-character '!page-800' suffix
        filename = self.url.split('/')[-1][:-9]
        with open('pic/{}'.format(filename), 'wb') as f:
            # drop the HTTP headers: the body starts after the first blank line
            f.write(self.receive_data.split(b'\r\n\r\n')[1])
        print('file saved')
        self.socket.close()

def main():
    start = time.time()
    for url in urls:
        crawler = Crawler(url)
        crawler.fetch()          # the fetches run strictly one after another
    print('total: {:.2f}s'.format(time.time() - start))

if __name__ == '__main__':
    main()
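Bridging sections 2.3 and 3 (my own sketch, not part of the notes): the same blocking Crawler becomes concurrent if gevent patches the socket module first. This assumes the Crawler class and urls list defined just above:
import time
import gevent
from gevent import monkey
monkey.patch_all()       # patch before the sockets are created

def gevent_main():
    start = time.time()
    # one greenlet per URL; the patched sockets yield to the hub on I/O
    jobs = [gevent.spawn(Crawler(url).fetch) for url in urls]
    gevent.joinall(jobs)
    print('total: {:.2f}s'.format(time.time() - start))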
2) Using greenlet: total time ~0.620s
# Imports:
import socket, os, time
from greenlet import greenlet, getcurrent   # getcurrent is used by Waiter below
from urllib.parse import urlparse
# I/O multiplexing
from selectors import DefaultSelector, EVENT_WRITE, EVENT_READ

selector = DefaultSelector()
stopped = False
urls = [
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f0932b2-580x413.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1da6e5-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1e7e54-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f1e5b2d-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f223258-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f227243-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f24b9c2-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f24c126-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f256263-580x825.jpg!page-800',
    'https://oss.mkzcdn.com/comic/page/20191005/5d9899f297a87-580x825.jpg!page-800',
]

# Coroutine scheduling glue
class Hub:
    def wait(self):
        # create a Waiter instance
        waiter = Waiter()
        # the event callback will be the waiter's switch method
        self.callback = waiter.switch
        # park the current greenlet until set_result delivers a value
        return waiter.get()

    def set_result(self, data):
        self.callback(data)

class Waiter:
    def __init__(self):
        # remember the main greenlet so get() can hand control back to it
        self.main_gr = main_gr

    def switch(self, value):
        # resume the parked fetch greenlet, passing it the event's value
        self.gr.switch(value)

    def get(self):
        # record the current greenlet (the one running Crawler.fetch), then
        # jump to the main greenlet; switch(value) will resume us right here
        self.gr = getcurrent()
        return self.main_gr.switch()

# The crawler class
class Crawler:
    # Constructor: keep both the raw URL and its parsed form
    def __init__(self, url):
        self._url = url
        self.url = urlparse(url)
        self.response = b''

    def fetch(self):
        global stopped
        # create the socket
        sock = socket.socket()
        # put it in non-blocking mode
        sock.setblocking(False)
        try:
            sock.connect((self.url.netloc, 80))
        except BlockingIOError:
            pass   # expected: the non-blocking connect is still in progress
        # create the Hub scheduling instance
        h = Hub()

        # write-event callback: runs once the socket is writable (connected)
        def writable():
            h.set_result(None)

        # register interest in the socket's write event
        selector.register(sock.fileno(), EVENT_WRITE, writable)
        h.wait()
        # back here after the event fired: unregister the write event
        selector.unregister(sock.fileno())
        data = 'GET {} HTTP/1.1\r\nHost: {}\r\nConnection: close\r\n\r\n'.format(self.url.path, self.url.netloc)
        # send the request to the server
        sock.send(data.encode())

        # read-event callback: deliver the received chunk to the waiter
        def readable():
            h.set_result(sock.recv(4096))

        # register interest in the socket's read event
        selector.register(sock.fileno(), EVENT_READ, readable)
        while True:
            data = h.wait()
            if data:
                self.response += data
            else:
                # all data received: stop watching this socket
                selector.unregister(sock.fileno())
                # remove this URL from the work list
                urls.remove(self._url)
                if not urls:
                    stopped = True
                # save the image locally (strip the 9-char '!page-800' suffix)
                filename = self.url.path.split('/')[-1][:-9]
                with open('pic/' + filename, 'wb') as f:
                    f.write(self.response.split(b'\r\n\r\n')[1])
                print('URL:{} download complete'.format(self.url.path))
                break

def crawler():
    # spawn one fetch greenlet per URL
    for url in urls:
        c = Crawler(url)
        # run fetch until its first wait(), then move on to the next URL
        gr = greenlet(c.fetch)
        gr.switch()

# Wrap crawler() in a greenlet; it acts as the "main" coroutine that the
# fetch greenlets park themselves against
main_gr = greenlet(crawler)

# The event loop
def loop():
    while not stopped:
        events = selector.select()
        for event_key, _ in events:
            callback = event_key.data
            callback()

def main():
    start = time.time()
    os.makedirs('pic', exist_ok=True)
    # switch into the main coroutine to spawn the fetchers
    main_gr.switch()
    # run the event loop until every URL is done
    loop()
    print('total: {:.3f}s'.format(time.time() - start))

if __name__ == '__main__':
    main()
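To isolate the Hub/Waiter hand-off from the socket details, here is a minimal standalone sketch of the same parking pattern (my own example; the names mirror the code above but are simplified, and the "hub" fakes the events by hand):
from greenlet import greenlet, getcurrent

results = []

class Waiter:
    def __init__(self, hub_gr):
        self.hub_gr = hub_gr              # the greenlet to park against

    def get(self):
        self.gr = getcurrent()            # remember who is waiting
        return self.hub_gr.switch()       # park: jump back to the hub

    def switch(self, value):
        self.gr.switch(value)             # resume the parked greenlet

def worker(name, waiter):
    data = waiter.get()                   # cooperatively "blocks" here
    results.append((name, data))

def hub():
    w1, w2 = Waiter(getcurrent()), Waiter(getcurrent())
    greenlet(worker).switch('a', w1)      # runs until w1.get() parks it
    greenlet(worker).switch('b', w2)
    w1.switch('hello')                    # pretend an event is ready: resume a
    w2.switch('world')                    # then resume b

greenlet(hub).switch()
print(results)                            # [('a', 'hello'), ('b', 'world')]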