import re
from concurrent.futures import ThreadPoolExecutor
from urllib.parse import urlencode

import requests
from bs4 import BeautifulSoup

rootrurl = 'https://bing.ioliu.cn/?'
save_dir = 'D:/estimages/'

# Request headers that masquerade as a regular browser.
headers = {
    "Referer": rootrurl,
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive'
}
def saveOneImg(dir, img_url, title):
    # Rebuild the headers with the image URL itself as the Referer; swapping in
    # a fresh header per request avoids 403 responses from anti-hotlinking checks.
    new_headers = {
        "Referer": img_url,
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive'
    }
    try:
        img = requests.get(img_url, headers=new_headers)  # fetch the image's actual URL
        if img.status_code == 200:
            # Write the image bytes to a local file named after the title.
            with open('{}/{}.jpg'.format(dir, title), 'wb') as jpg:
                jpg.write(img.content)
            print(img_url)
            return True
        else:
            return False
    except Exception as e:
        print('exception occurs: ' + img_url)
        print(e)
        return False
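# A minimal sketch of a streamed variant (not part of the original script):
# requests' stream=True plus iter_content() writes the file in chunks instead
# of holding the whole image in memory, which helps for large wallpapers.
def saveOneImgStreamed(dir, img_url, title, new_headers):
    with requests.get(img_url, headers=new_headers, stream=True) as resp:
        if resp.status_code != 200:
            return False
        with open('{}/{}.jpg'.format(dir, title), 'wb') as jpg:
            for chunk in resp.iter_content(chunk_size=8192):
                jpg.write(chunk)
        return True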
def getSubTitleName(str):
    # cop = re.compile("[^\u4e00-\u9fa5^a-z^A-Z^0-9]")  # variant that also keeps letters and digits
    cop = re.compile("[^\u4e00-\u9fa5]")  # match every character that is NOT a Chinese character
    string1 = cop.sub('', str)  # strip the matched characters, keeping only the Chinese text
    return string1
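# For illustration: getSubTitleName('Bing Wallpaper 必应壁纸 2020') returns
# '必应壁纸'; everything outside the CJK range \u4e00-\u9fa5 is stripped,
# which keeps the title safe to use as a filename in saveOneImg.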
def getOnePage(i):
    params = {
        'p': i,
    }
    url = rootrurl + urlencode(params)
    print(url)
    html = BeautifulSoup(requests.get(url, headers=headers).text, features="html.parser")
    titles = html.find_all('h3')
    lis = html.find_all('a', {'class': 'ctrl download'})
    # Each download link is paired with the <h3> title at the same index.
    for title, a in zip(titles, lis):
        # rootrurl[:-2] strips the trailing '/?' to get the bare site root.
        saveOneImg(save_dir, rootrurl[:-2] + a.get('href'), getSubTitleName(title.get_text()))
def getNumOfPages():
    # The pager text reads like "1 / 123"; the part after '/' is the total page count.
    html = BeautifulSoup(requests.get(rootrurl, headers=headers).text, features="html.parser")
    return int(html.find('div', {'class': 'page'}).find('span').get_text().split('/')[1])
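# A minimal sketch of a parallel variant using the ThreadPoolExecutor imported
# above (that import is otherwise unused, so concurrency appears to be the
# intent). The worker count is an assumption; the site may throttle or block
# aggressive concurrent requests.
def getAllPagesParallel(total, max_workers=4):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # map submits one task per page; leaving the with-block waits for all of them
        executor.map(getOnePage, range(1, total + 1))
# Usage: getAllPagesParallel(getNumOfPages()) in place of the serial loop below.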
if __name__ == '__main__':
    getTotal = getNumOfPages()
    for i in range(1, getTotal + 1):
        getOnePage(i)