前言
本文利用Python爬取今日头条中的街拍美图。废话不多说,
让我们愉快地开始吧~
开发工具
Python版本: 3.6.4
相关模块:
re模块;
requests模块;
以及一些Python自带的模块。
环境搭建
安装Python并添加到环境变量,pip安装需要的相关模块即可。
详细浏览器信息
获取文章链接相关代码:
import requests
import json
import re
# Request headers carrying a desktop Chrome User-Agent string; passed as
# `headers=` to the search and article-page requests below.
# NOTE(review): presumably needed so the site answers like it would for a
# browser -- confirm against the live endpoint.
headers = {
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}
def get_first_data(offset):
    """Query the Toutiao search endpoint for the keyword '街拍'.

    Args:
        offset: Value sent as the ``offset`` query parameter (the request
            asks for 20 results per call via ``count``).

    Returns:
        The response body as text, or ``None`` when the request fails or the
        server answers with an HTTP error status.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
        'from': 'search_tab',
    }
    try:
        # The request itself must be inside the try block: connection errors
        # and timeouts are raised by requests.get(), not by
        # raise_for_status(). The timeout keeps a stalled server from
        # hanging the script forever.
        response = requests.get(
            url='https://www.toutiao.com/search_content/',
            headers=headers,
            params=params,
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException:
        print("获取失败")
        return None
    return response.text
def handle_first_data(html):
    """Yield the article URL of each entry in a search-result JSON document.

    Args:
        html: JSON text of a search response. May be ``None`` or empty (for
            example when the preceding request failed), in which case
            nothing is yielded.

    Yields:
        The ``article_url`` value of every dict entry in the top-level
        ``data`` list. An entry without that key yields ``None``.
    """
    if not html:
        return
    try:
        data = json.loads(html)
    except ValueError:
        # json.JSONDecodeError is a ValueError subclass; a non-JSON body
        # (e.g. an HTML error page) ends the generator instead of crashing.
        print("解析搜索结果失败")
        return
    if not isinstance(data, dict):
        return
    # `"data": null` is treated the same as a missing/empty list.
    for item in data.get("data") or []:
        if isinstance(item, dict):
            yield item.get("article_url")
这里需要提一下requests模块的报错处理:在response对象上调用raise_for_status()方法时,如果请求失败(响应状态码为4xx或5xx),会抛出requests.HTTPError异常。因此需要使用try和except语句将这行代码包裹起来并处理该异常,避免程序崩溃。
另外附上requests模块技术文档网址:http://cn.python-requests.org/zh_CN/latest/
获取图片链接相关代码:
def get_second_data(url):
    """Download the page at ``url`` and return its text.

    Args:
        url: Address of the page to fetch. A falsy value (``None`` or an
            empty string) is skipped without issuing a request.

    Returns:
        The response body as text, or ``None`` when ``url`` is falsy, the
        request fails, or the server answers with an HTTP error status.
    """
    if not url:
        return None
    try:
        # Timeout so that one unresponsive page cannot stall the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        print("进入链接发生错误")
        return None
    return response.text
def handle_second_data(html):
    """Extract the gallery image URLs embedded in an article page.

    The function looks for ``gallery: JSON.parse(...)`` in the page source,
    decodes its argument and collects the ``url`` of every ``sub_images``
    entry.

    Args:
        html: Page source text; may be ``None`` or empty.

    Returns:
        A flat list of image URL strings (empty when the gallery has no
        ``sub_images``), or ``None`` when ``html`` is falsy, the gallery
        marker is not found, or its payload cannot be decoded.
    """
    if not html:
        return None
    match = re.search(r'gallery: JSON.parse\((.*?)\),', html, re.S)
    if not match:
        print("have no result")
        return None
    try:
        # The captured JSON.parse argument is decoded twice: the first pass
        # turns the quoted literal into a string, the second parses that
        # string as JSON.
        data = json.loads(json.loads(match.group(1)))
    except (ValueError, TypeError):
        print("have no result")
        return None
    if not isinstance(data, dict):
        return []
    # One URL per image; entries without a usable 'url' are dropped so the
    # caller only ever receives downloadable strings.
    return [
        item.get('url')
        for item in data.get("sub_images") or []
        if isinstance(item, dict) and item.get('url')
    ]
获取图片相关代码:
def download_image(imageUrl):
    """Download every image in ``imageUrl`` and save it to a local file.

    Each image is written to ``"images" + <last 10 characters of the URL> +
    ".jpg"`` relative to the current working directory. A URL that cannot be
    downloaded or saved is reported and skipped; the remaining URLs are still
    processed.

    Args:
        imageUrl: Iterable of image URL strings. Entries that are not
            non-empty strings are ignored.
    """
    for url in imageUrl:
        if not isinstance(url, str) or not url:
            continue
        try:
            response = requests.get(url, timeout=10)
            # Without this check an HTTP error page would be saved as a .jpg.
            response.raise_for_status()
        except requests.RequestException:
            # Skip this URL: there is no image data to write.
            print(url[-10:] + "下载失败!" + url)
            continue
        # NOTE(review): the name is "images" directly followed by the URL
        # tail, with no path separator -- confirm whether an "images/"
        # directory was intended.
        filename = "images" + url[-10:] + ".jpg"
        try:
            with open(filename, "wb") as ob:
                ob.write(response.content)
        except OSError:
            # e.g. the URL tail contains a path separator.
            print(url[-10:] + "保存失败!" + url)
            continue
        print(url[-10:] + "下载成功!" + url)
def main():
    """Crawl the first page of search results and download its images.

    Fetches the search results at offset 0, visits each article URL, extracts
    the gallery image URLs from the article page and downloads them. Articles
    whose page cannot be fetched or that contain no gallery are skipped.
    """
    search_page = get_first_data(0)
    if not search_page:
        # The search request failed; there is nothing to iterate over.
        return
    for url in handle_first_data(search_page):
        article_page = get_second_data(url)
        if not article_page:
            continue
        result = handle_second_data(article_page)
        if not result:
            continue
        try:
            download_image(result)
        except KeyError:
            print("{0}存在问题,略过".format(result))


if __name__ == '__main__':
    main()