Python is a highly efficient tool. Whatever kind of development you do, learning a little Python will often get you twice the result for half the effort, whether it's renaming packages, searching through files, or any number of other chores. Today we'll give Python a small workout: scraping jokes and images from a couple of websites.
1. Scraping jokes from Qiushibaike
This uses the BeautifulSoup library; if you don't have it yet, pip install beautifulsoup4 lxml will fetch it along with the lxml parser used below.
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import urllib2

def getContent(n):
    # Fetch page n of the text jokes and print each one.
    url = 'http://www.qiushibaike.com/text/page/' + str(n) + '/'
    print url
    # Headers copied from a real browser session; the cookie may be stale.
    heads = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.75 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
        'Referer': 'http://www.qiushibaike.com/',
        'Accept-Language': 'zh-CN,zh;q=0.8',
        'Cookie': '_xsrf=2|db27040e|6b4ed8d9536590d4ec5d2064cc2bef4f|1474364551; _qqq_uuid_="2|1:0|10:1474364551|10:_qqq_uuid_|56:MzBlNWFkOGE3MWEyMzc1MWIxMTE3MDBlZjM2M2RkZWQxYzU5YTg1Yw==|1dd2a4f4ceacad26b5da9cc295d2965226ea25ee73289855cf032629c4992698"; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1474364592; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1474364595; _ga=GA1.2.1125329542.1474364596'
    }
    res = urllib2.Request(url, headers=heads)
    response = urllib2.urlopen(res)
    html = response.read()
    soup = BeautifulSoup(html, "lxml")
    # Each joke lives in a <span> under div.content.
    someData = soup.select("div.content span")
    num = 0
    for some in someData:
        num = num + 1
        print num
        print some.text + '\n'

if __name__ == "__main__":
    for i in range(1, 5):
        getContent(i)
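The code above is Python 2 (urllib2 and print statements). For reference, here is a minimal Python 3 sketch of the same request, assuming the page layout and the div.content span selector are unchanged:

# -*- coding: utf-8 -*-
# Minimal Python 3 port of the scraper above; assumes the page layout
# and the "div.content span" selector still apply.
from urllib.request import Request, urlopen
from bs4 import BeautifulSoup

def get_content(n):
    url = 'http://www.qiushibaike.com/text/page/%d/' % n
    heads = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) '
                           'AppleWebKit/537.36 (KHTML, like Gecko) '
                           'Chrome/50.0.2661.75 Safari/537.36'}
    html = urlopen(Request(url, headers=heads)).read()
    soup = BeautifulSoup(html, "lxml")
    for num, some in enumerate(soup.select("div.content span"), 1):
        print(num)
        print(some.text + '\n')

if __name__ == "__main__":
    for i in range(1, 5):
        get_content(i)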
Output when run from cmd:
Running it in Eclipse:
2. Scraping images
This time we scrape images from jandan.net (煎蛋网):
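Note that the script below imports an HttpClient module that this post never shows. Only its Get(url) method is used, and it just returns the raw response body, so a minimal stand-in might look like this (the class and its signature are reconstructed from the import, not the author's actual module):

# -*- coding: utf-8 -*-
import urllib2

class HttpClient(object):
    # Hypothetical stand-in for the HttpClient module used below;
    # the JianDan class only needs Get(url) to return the raw body.
    def Get(self, url):
        req = urllib2.Request(url, headers={
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64)'})
        return urllib2.urlopen(req).read()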
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from HttpClient import HttpClient
import sys, re, os

class JianDan(HttpClient):
    def __init__(self):
        self.__pageIndex = 1500            # first page to fetch
        self.__Url = "http://jandan.net/ooxx/"
        self.__floder = "jiandan"          # download directory

    def __getAllPicUrl(self, pageIndex):
        realurl = self.__Url + "page-" + str(pageIndex) + "#comments"
        pageCode = self.Get(realurl)
        type = sys.getfilesystemencoding()
        # Pull every <img src="..."> that follows a view_img_link anchor.
        pattern = re.compile('<p>.*?<a .*?view_img_link">.*?</a>.*?<img src="(.*?)".*?</p>', re.S)
        items = re.findall(pattern, pageCode.decode("UTF-8").encode(type))
        num = 0
        for item in items:
            num = num + 1
            print "Got image " + str(num) + "--->" + "http:" + item
        self.__savePics(items, self.__floder)

    def __savePics(self, img_addr, folder):
        for item in img_addr:
            filename = item.split('/')[-1]
            print "Saving image: " + filename
            with open(filename, 'wb') as f:
                img = self.Get("http:" + item)
                f.write(img)

    def __getNewPage(self):
        # Read the newest page number from the pager widget on the front page.
        pageCode = self.Get(self.__Url)
        type = sys.getfilesystemencoding()
        pattern = re.compile(r'<div .*?cp-pagenavi">.*?<span .*?current-comment-page">\[(.*?)\]</span>', re.S)
        newPage = re.search(pattern, pageCode.decode("UTF-8").encode(type))
        # print pageCode.decode("UTF-8").encode(type)  # debug dump of the fetched page
        if newPage != None:
            return newPage.group(1)
        return 1500

    def start(self):
        isExists = os.path.exists(self.__floder)  # does the download directory exist?
        print isExists
        if not isExists:
            os.mkdir(self.__floder)
        os.chdir(self.__floder)
        page = int(self.__getNewPage())
        for i in range(self.__pageIndex, page):
            self.__getAllPicUrl(i)

if __name__ == '__main__':
    jd = JianDan()
    jd.start()
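The image URLs here are extracted with a regular expression rather than BeautifulSoup. A quick way to sanity-check the pattern is to run it against a hand-written snippet of the markup (the HTML below is a made-up example, not captured from the site):

# -*- coding: utf-8 -*-
import re

# Made-up sample of the kind of comment markup the pattern targets.
sample = ('<p><a href="#" class="view_img_link">[view original]</a><br />'
          '<img src="//ww3.sinaimg.cn/mw600/example.jpg" /></p>')
pattern = re.compile('<p>.*?<a .*?view_img_link">.*?</a>.*?<img src="(.*?)".*?</p>', re.S)
print re.findall(pattern, sample)
# prints: ['//ww3.sinaimg.cn/mw600/example.jpg']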
Output during the run:
The final result:
Scraping text and images is certainly not the ultimate goal; that part is just for fun. The real point is to go after audio and video in their various container formats. The principle is exactly the same: work out which tag carries the resource address, then download it. If this interests you, roll up your sleeves and try it yourself; Python should be very quick to pick up.
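The download step itself works just like saving an image above, only the files are larger, so it pays to read the response in chunks rather than all at once. A minimal sketch in the same Python 2 style, with a placeholder URL:

# -*- coding: utf-8 -*-
import urllib2

def save_media(url, filename):
    # Stream the response to disk in 64 KB chunks instead of
    # holding the whole file in memory.
    response = urllib2.urlopen(url)
    with open(filename, 'wb') as f:
        while True:
            chunk = response.read(1024 * 64)
            if not chunk:
                break
            f.write(chunk)

save_media('http://example.com/demo.mp4', 'demo.mp4')  # placeholder URL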