# 实现对今日头条街拍近400条数据的爬取，感兴趣的朋友可以尝试一下。

import requests

from urllib.parse import urlencode
import os
from hashlib import md5
from multiprocessing.pool import Pool

# Desktop-Chrome User-Agent string sent with every request made by this script.
user_agent = (
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/68.0.3440.106 Safari/537.36'
)
# Header mapping passed to requests.get() by the functions in this file.
headers = {'User-Agent': user_agent}

def get_page(offset):
    """Fetch one page of search results and return it as decoded JSON.

    The request goes to the ``search_content`` endpoint with a fixed
    keyword, 20 results per page, and the module-level ``headers``.

    Args:
        offset: Value sent as the ``offset`` query parameter.

    Returns:
        The decoded JSON body when the server answers with status 200 and
        a body that parses as JSON. ``None`` in every other case: network
        error, timeout, non-200 status, or an undecodable body.
    """
    params = {
        'offset': offset,
        'format': 'json',
        'keyword': '街拍',
        'autoload': 'true',
        'count': '20',
        'cur_tab': '1',
    }
    url = 'http://www.toutiao.com/search_content/?' + urlencode(params)
    try:
        # requests waits indefinitely unless a timeout is given, which
        # would leave a pool worker stuck on a stalled connection.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        # Base class of ConnectionError, Timeout, TooManyRedirects, ...
        return None
    if response.status_code != 200:
        return None
    try:
        return response.json()
    except ValueError:
        # A 200 response whose body is not JSON (e.g. an HTML block page).
        return None

def get_images(json):
    """Yield one record per image found in a decoded search-result page.

    Args:
        json: Mapping whose ``'data'`` entry, when present and non-empty,
            is iterated as a sequence of result items. Each item is read
            through ``.get('title')`` and ``.get('image_list')``.

    Yields:
        ``{'image': <url>, 'title': <title>}`` for every entry of an
        item's ``image_list`` that has a non-empty ``'url'``. Items with
        no title or no images are skipped.
    """
    # `or ()` covers both a missing key and an explicit null/empty value.
    for item in json.get('data') or ():
        title = item.get('title')
        images = item.get('image_list')
        if not (title and images):
            continue
        for image in images:
            url = image.get('url')
            # An entry without a URL cannot be downloaded; yielding it
            # would only push a None into the downloader.
            if url:
                yield {'image': url, 'title': title}


def save_image(item):
    """Download one image and store it as ``<title>/<md5 of content>.jpg``.

    Args:
        item: Mapping with ``'title'`` (used as the target directory name,
            relative to the current working directory) and ``'image'``
            (the image URL, possibly protocol-relative, i.e. starting
            with ``//``).

    Returns:
        None. Prints ``'Failed to save Image'`` when the item is unusable
        or the request fails, and ``'Already Download <path>'`` when a
        file with the same content hash is already present.
    """
    title = item.get('title')
    url = item.get('image')
    if not title or not url:
        print('Failed to save Image')
        return

    # The title comes from the remote service; a path separator in it
    # would make the directory creation fail or escape the target folder.
    folder = title
    for separator in (os.sep, os.altsep):
        if separator:
            folder = folder.replace(separator, '_')
    # exist_ok avoids the check-then-create race between pool workers
    # handling images that share a title.
    os.makedirs(folder, exist_ok=True)

    # Only protocol-relative URLs need a scheme; prefixing an absolute
    # URL would corrupt it.
    if url.startswith('//'):
        url = 'http:' + url

    try:
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException:
        print('Failed to save Image')
        return
    if response.status_code != 200:
        return

    # Naming the file after the content hash de-duplicates identical images.
    file_path = '{0}/{1}.{2}'.format(folder, md5(response.content).hexdigest(), 'jpg')
    if os.path.exists(file_path):
        print('Already Download', file_path)
        return
    with open(file_path, 'wb') as f:
        f.write(response.content)

def main(offset):
    """Fetch the result page at *offset* and save every image it lists.

    Does nothing when ``get_page`` returns a falsy value.
    """
    page = get_page(offset)
    if not page:
        return
    for record in get_images(page):
        save_image(record)

# Inclusive range of page groups to crawl; each group number is multiplied
# by 20 to obtain the offset passed to main().
GROUP_START = 1
GROUP_END = 20

if __name__ == '__main__':
    offsets = [group * 20 for group in range(GROUP_START, GROUP_END + 1)]
    pool = Pool()
    try:
        pool.map(main, offsets)
    finally:
        # Release the worker processes even when a page raised, so the
        # script does not exit with workers still attached.
        pool.close()
        pool.join()