Table of Contents
- Preface
- I. Prerequisites
- II. The Code
- 1. Importing the libraries
- 2. Sending requests, parsing the data, and saving locally
- 3. Full code
- Summary
Preface
I've been studying deep learning for a while now, and we're building a small project that uses a CNN (convolutional neural network) to recognize pictures of the twelve Chinese zodiac animals. Training a model usually takes a lot of data, so today we'll walk through the data-collection step:
We'll use Python to combine the Selenium automation framework with BeautifulSoup, download the data on multiple threads to speed things up, and finally save everything into our dataset.
I. Prerequisites
Before starting, we need the following (install commands are sketched after the list):
1. Python 3.10 (any Python 3 release works; I'm on the latest 3.10)
2. requests
3. BeautifulSoup, for parsing the pages
4. The Selenium automation framework (install the matching browser driver and add it to your PATH)
5. threading, for multithreaded downloads
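The third-party packages can be installed with pip; threading ships with the standard library. A minimal setup sketch, assuming pip points at your Python 3 installation:
pip install requests beautifulsoup4 selenium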
II. The Code
Disclaimer: this code is for technical discussion only; if it infringes on any rights, please contact me and I will remove it.
1. Importing the libraries
First, import the libraries we'll be using:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import threading  # used to run downloads on multiple threads
import requests
import time
import random
import os
2. Sending requests, parsing the data, and saving locally
First, locate the search input box from the page structure.
We're using Google Chrome here: configure the Chrome driver in the code, then locate the input box. The code:
def parser_content(self):
    # Launch Chrome (chromedriver must be on your PATH)
    driver = webdriver.Chrome()
    # Maximize the window
    driver.maximize_window()
    # Load the page
    driver.get(self.url)
    # Implicit wait: poll up to 10 s when locating elements
    driver.implicitly_wait(10)
    # Hard wait of 2 s
    time.sleep(2)
    # Locate the search box and type the keyword
    # ('name' is the keyword read from the user in the main block)
    driver.find_element(By.XPATH, "//input[@class='s_ipt']").send_keys(name)
    time.sleep(1)
    # Simulate pressing Enter to open the image results page
    driver.find_element(By.XPATH, "//input[@class='s_ipt']").send_keys(Keys.ENTER)
    driver.implicitly_wait(10)
    time.sleep(2)
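As an aside, the fixed time.sleep calls keep the example simple, but Selenium's explicit waits can make the script both faster and more robust. A minimal sketch using the same XPath as above; this is an alternative, not part of the original script:
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Wait up to 10 s for the search box to become clickable, then type the keyword
box = WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, "//input[@class='s_ipt']")))
box.send_keys(name)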
Next we land on the image results page.
Here we scroll the page down to trigger lazy loading and get more images, then save the collected image links into a list. The code:
start = time.time()
# ---- vertical scrollbar operation ------------------------------------------
temp_height = 0
for ii in range(1, 1000000, 8):
    js1 = "document.documentElement.scrollTop={}".format(ii)
    driver.execute_script(js1)
    time.sleep(0.01)
    # Check whether the scrollbar has reached the bottom of the page
    check_height = driver.execute_script(
        "return document.documentElement.scrollTop || window.pageYOffset || document.body.scrollTop;")
    if check_height == temp_height:
        break
    temp_height = check_height
    # Safety limit: stop scrolling automatically after 45 s
    if time.time() - start > 45:
        break
# -----------------------------------------------------------------------------
# Collect the detail-page links of all loaded images
url_lst = driver.find_elements(By.XPATH, "//div[@class='imgbox-border']/a")
for item in url_lst[1:201]:  # change this slice to collect more image links
    new_url = item.get_attribute("href")
    # print(new_url)
    self.lst1.append(new_url)
print("Collected " + str(len(self.lst1)) + " image links this run!")
# Close the browser
driver.quit()
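Incidentally, the commented-out Options import at the top hints at headless operation: Chrome can do all of the above without opening a visible window. A minimal sketch, assuming a Chrome/chromedriver version that supports headless mode; only the driver construction changes:
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")  # run Chrome without a visible window
driver = webdriver.Chrome(options=options)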
After running this we can see that we've collected the links we wanted; next we open each link's detail page.
We use BeautifulSoup to extract the real image URL from each detail page, then start four threads to download the images, each thread handling one quarter of the link list. The code:
def download(self, links):
    # Worker routine: each thread downloads its own share of the links
    for i in links:
        try:
            resp = requests.get(i, headers=self.headers)
            resp.encoding = "utf-8"
            # Parse the detail page and extract the real image URL
            Be = BeautifulSoup(resp.text, 'html.parser')
            wrapper = Be.find('div', class_='img-wrapper')
            img = wrapper.find('img')['src']
            res = requests.get(img)
            # Build one random 8-digit file name and reuse it, so the
            # printed path matches the file that was actually written
            fname = "".join(str(random.randint(1, 9)) for _ in range(8)) + ".jpg"
            save_file = self.path + self.save_path + "/" + fname
            with open(save_file, "wb") as file:
                file.write(res.content)
            print("Download finished, saved to: " + save_file)
        except Exception:
            # Skip detail pages whose structure doesn't match
            pass

def multi_thread(self):
    # Split the link list into four quarters, one per thread
    n = len(self.lst1)
    chunks = [self.lst1[0:n // 4],
              self.lst1[n // 4:n // 2],
              self.lst1[n // 2:n // 2 + n // 4],
              self.lst1[n // 2 + n // 4:n]]
    # Pass the method itself as target; calling self.download(...) here
    # would run all downloads sequentially in the main thread instead
    threads = [threading.Thread(target=self.download, args=(chunk,)) for chunk in chunks]
    for t in threads:
        t.start()
    for t in threads:
        t.join()  # wait until every thread has finished
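For comparison, the standard library's concurrent.futures can express the same four-way fan-out more compactly. A minimal sketch, assuming spider is the Spider instance created in the main block of the full script below; it is not part of the original script:
from concurrent.futures import ThreadPoolExecutor

# Hand each link to a pool of four worker threads;
# the with-block waits until every download has finished.
with ThreadPoolExecutor(max_workers=4) as pool:
    pool.map(lambda link: spider.download([link]), spider.lst1)
One caveat either way: random 8-digit names can collide and silently overwrite a file; something like uuid.uuid4().hex would guarantee unique names if that matters for your dataset.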
3. Full code
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
# from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import threading
import requests
import time
import random
import os
class Spider:
    def __init__(self):
        self.url = "https://image.baidu.com/"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3883.400 QQBrowser/10.8.4559.400",
            "Referer": "https://www.baidu.com/"}
        self.lst1 = []  # collected detail-page links
        self.path = './data/train/'  # dataset root
        self.save_path = xname  # sub-folder named after the user's input
        # Create ./data/train/<xname> (and any missing parents) if needed
        if not os.path.exists(os.path.join(self.path, xname)):
            os.makedirs(os.path.join(self.path, xname))
    def parser_content(self):
        # Launch Chrome (chromedriver must be on your PATH)
        driver = webdriver.Chrome()
        # Maximize the window
        driver.maximize_window()
        # Load the page
        driver.get(self.url)
        # Implicit wait: poll up to 10 s when locating elements
        driver.implicitly_wait(10)
        # Hard wait of 2 s
        time.sleep(2)
        # Locate the search box and type the keyword
        driver.find_element(By.XPATH, "//input[@class='s_ipt']").send_keys(name)
        time.sleep(1)
        # Simulate pressing Enter to open the image results page
        driver.find_element(By.XPATH, "//input[@class='s_ipt']").send_keys(Keys.ENTER)
        driver.implicitly_wait(10)
        time.sleep(2)
        start = time.time()
        # ---- vertical scrollbar operation --------------------------------------
        temp_height = 0
        for ii in range(1, 1000000, 8):
            js1 = "document.documentElement.scrollTop={}".format(ii)
            driver.execute_script(js1)
            time.sleep(0.01)
            # Check whether the scrollbar has reached the bottom of the page
            check_height = driver.execute_script(
                "return document.documentElement.scrollTop || window.pageYOffset || document.body.scrollTop;")
            if check_height == temp_height:
                break
            temp_height = check_height
            # Safety limit: stop scrolling automatically after 45 s
            if time.time() - start > 45:
                break
        # --------------------------------------------------------------------------
        # Collect the detail-page links of all loaded images
        url_lst = driver.find_elements(By.XPATH, "//div[@class='imgbox-border']/a")
        for item in url_lst[1:201]:  # change this slice to collect more image links
            new_url = item.get_attribute("href")
            print(new_url)
            self.lst1.append(new_url)
        print("Collected " + str(len(self.lst1)) + " image links this run!")
        # Close the browser
        driver.quit()
    def download(self, links):
        # Worker routine: each thread downloads its own share of the links
        for i in links:
            try:
                resp = requests.get(i, headers=self.headers)
                resp.encoding = "utf-8"
                # Parse the detail page and extract the real image URL
                Be = BeautifulSoup(resp.text, 'html.parser')
                wrapper = Be.find('div', class_='img-wrapper')
                img = wrapper.find('img')['src']
                res = requests.get(img)
                # Build one random 8-digit file name and reuse it, so the
                # printed path matches the file that was actually written
                fname = "".join(str(random.randint(1, 9)) for _ in range(8)) + ".jpg"
                save_file = self.path + self.save_path + "/" + fname
                with open(save_file, "wb") as file:
                    file.write(res.content)
                print("Download finished, saved to: " + save_file)
            except Exception:
                # Skip detail pages whose structure doesn't match
                pass

    def multi_thread(self):
        # Split the link list into four quarters, one per thread
        n = len(self.lst1)
        chunks = [self.lst1[0:n // 4],
                  self.lst1[n // 4:n // 2],
                  self.lst1[n // 2:n // 2 + n // 4],
                  self.lst1[n // 2 + n // 4:n]]
        # Pass the method itself as target; calling self.download(...) here
        # would run all downloads sequentially in the main thread instead
        threads = [threading.Thread(target=self.download, args=(chunk,)) for chunk in chunks]
        for t in threads:
            t.start()
        for t in threads:
            t.join()  # wait until every thread has finished
if __name__ == '__main__':
    # name and xname are read as module-level globals inside Spider
    name = input("Enter the search keyword: ")
    xname = input("Enter the name of the folder to create (in English): ")
    spider = Spider()
    spider.parser_content()
    spider.multi_thread()
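To try it out, save the script (spider.py is just an assumed file name), run it, and answer the two prompts, for example:
python spider.py
Enter the search keyword: 老虎
Enter the name of the folder to create (in English): tiger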
With that, the whole script is finished; finally, let's look at the result.
Opening the folder we named at the start, we can see the images have been downloaded and saved there, and the download was fast.
Summary
Interest is the best teacher.
That's it for today's code.