import requests  # the two ways of importing a module: plain import vs. from-import
from multiprocessing import Pool
import re
def get(url):
    # Fetch a listing page; return its decoded HTML, or None on failure.
    ret = requests.get(url)
    if ret.status_code == 200:
        return ret.content.decode('gbk')  # the site is GBK-encoded
def call_back(arg):
    # Runs in the main process when a worker finishes; arg is get()'s return value.
    if arg is None:  # the page request failed, nothing to parse
        return []
    dict_lst = []
    for i in com.finditer(arg):
        dic = {
            'png': i.group('png'),
            'name': i.group('name'),
            'place': i.group('place')
        }
        dict_lst.append(dic)
    for i in dict_lst:
        res = subget(i['png'])
        if res is not None:  # skip images whose download failed
            write_func(i['name'], i['place'], res)
    return dict_lst
def subget(url):
    # Image URLs on the site may be absolute or relative; prefix relative ones.
    if url.startswith('http'):
        n_url = url
    else:
        n_url = 'http://www.xiaohuar.com' + url
    ret = requests.get(n_url)
    if ret.status_code == 200:
        return ret.content  # raw image bytes; None is returned implicitly otherwise
def write_func(path, place, picture):
    # Save the image bytes as "<name>-<place>.png" in the output directory.
    with open(r'E:\text1\爬虫\text_png\%s-%s.png' % (path, place), 'wb') as f:
        f.write(picture)
'''Pattern of the pages I want to crawl:'''
'''http://www.xiaohuar.com/list-1-0.html'''
'''http://www.xiaohuar.com/list-1-43.html'''
if __name__ == '__main__':
    # Named groups pull the image src, name, and place out of each list item.
    com = re.compile(
        '<div class="item_t">(?:.*?)src="(?P<png>.*?)"(?:.*?)<span class="price">(?P<name>.*?)</span>(?:.*?)'
        '<a rel="nofollow" href="http://www.xiaohuar.com/" class="img_album_btn">(?P<place>.*?)</a>', re.S)
    pool = Pool(3)  # three worker processes fetch pages concurrently
    for i in range(40):
        # call_back runs in the main process with each get() result
        pool.apply_async(get, args=('http://www.xiaohuar.com/list-1-%s.html' % i,), callback=call_back)
    pool.close()
    pool.join()
**Drawback:** the crawl is slow, and at most 17 pages actually get scraped (frustrating).
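The slowness is likely because all image downloads happen inside `call_back`, and `multiprocessing` invokes callbacks one at a time in the parent process, so only the page fetches are truly parallel. Since the whole job is I/O-bound, one alternative is to run the full fetch-parse-download cycle per page in a thread pool. Below is a minimal sketch, not the original author's code: it assumes the `get`, `com`, `subget`, and `write_func` definitions above, and `page_worker` is a hypothetical helper introduced here for illustration.

```python
from concurrent.futures import ThreadPoolExecutor

def page_worker(i):
    # Fetch one listing page, parse it, and download every image it references.
    html = get('http://www.xiaohuar.com/list-1-%s.html' % i)
    if html is None:
        return 0
    count = 0
    for m in com.finditer(html):
        data = subget(m.group('png'))
        if data is not None:
            write_func(m.group('name'), m.group('place'), data)
            count += 1
    return count

if __name__ == '__main__':
    # Threads are cheap for I/O-bound HTTP work; 10 workers is an arbitrary choice.
    with ThreadPoolExecutor(max_workers=10) as ex:
        totals = ex.map(page_worker, range(40))
    print('downloaded %s images' % sum(totals))
```

With this layout each thread downloads its own page's images, so downloads overlap across pages instead of queuing behind a single callback.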