python selenium设置不加载图片

转载

mob64ca1402a190 2024-10-12 10:00:49

文章标签 selenium 测试工具爬虫 python Powered by 金山文档 文章分类 Python 后端开发

第一次自己尝试写爬虫，目标是爬取一位发布了 1w+ 张图片的 UP 主的所有图片。由于他一直在更新，这个版本的代码去掉了自动翻页功能。最好先扫码登录之后再继续运行程序，不然容易报错，B站会一直弹出登录界面。登录后，手动跳转到需要爬取的页面，就可以继续运行程序。（需要安装 Edge 浏览器的驱动）

# bilibili单页下载（手动跳转并且不关闭页面，跳转下一页继续,遇到多图的时候跳过第一张然后下载）
import os
import time

import pyautogui
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver import ActionChains, Keys
import winsound


class Bilibili:
    """Semi-automatic image downloader for one Bilibili album page.

    The user logs in and navigates by hand; each time a line is entered on
    stdin the posts on the currently displayed page are opened in new tabs,
    their images are downloaded, and the tabs are closed again.

    Args:
        Vol: URL of the album page to open first.
        download_dir: Folder that images are written to. Defaults to
            ``DEFAULT_DOWNLOAD_DIR``; it is created if it does not exist.
    """

    # Folder used when no ``download_dir`` is passed to the constructor.
    DEFAULT_DOWNLOAD_DIR = 'D:\\下载站\\新建图集'

    # Characters that may not appear in a Windows file name. ':' becomes '-',
    # the rest (plus newlines and the full-width colon) are dropped.
    _FILENAME_TABLE = str.maketrans({
        ':': '-',
        '\n': None,
        '：': None,
        '/': None,
        '\\': None,
        '*': None,
        '?': None,
        '"': None,
        '<': None,
        '>': None,
        '|': None,
    })

    def __init__(self, Vol, download_dir=None):
        self.Vol = Vol
        self.download_dir = (
            download_dir if download_dir is not None else self.DEFAULT_DOWNLOAD_DIR
        )

    def mainrun(self):
        """Run the interactive download loop until the user enters ``0``.

        The Edge driver is always shut down on exit, including when an
        exception escapes, so no browser/driver process is left behind.
        """
        wd = webdriver.Edge()
        try:
            # One action chain is reused for every ctrl+click.
            actions = ActionChains(wd)
            wd.implicitly_wait(3)
            wd.get(self.Vol)
            # Remember the album tab so it can be returned to after each batch.
            mainWindow = wd.current_window_handle

            maxPage = self.find_max_page(wd)
            print('最大页数=', maxPage)
            print('请验证登录，并移动至需要下载的页面')

            # Each entered line processes the page currently shown; '0' stops.
            while True:
                k = input()
                if k == '0':
                    print("下载完成！")
                    break
                self.open_photos_in_new_window(wd, actions)
                print('窗口打开完毕，是否开始下载？')

                self.open_and_download(wd)

                self.close_window(wd, mainWindow)
                print('下载完成！请手动切换到下一页，任意键继续...')
                self.Sound_beep()
        finally:
            wd.quit()

    def find_max_page(self, wd) -> int:
        """Return the page count read from the second-to-last pager item."""
        last_page_link = wd.find_element(
            By.CSS_SELECTOR, '.be-pager>li:nth-last-of-type(2)>a')
        return int(last_page_link.text)

    def open_photos_in_new_window(self, wd, actions):
        """Ctrl+click every album entry on the current page to open it in a new tab."""
        photos = wd.find_elements(By.CSS_SELECTOR, '.album-list__content a:nth-child(1)')
        print('本页共有', len(photos), '张图片')
        for photo in photos:
            # Use a very short implicit wait for the popup probe so a missing
            # popup does not cost the full timeout, then restore it.
            wd.implicitly_wait(0.1)
            self.is_pop(wd)
            wd.implicitly_wait(3)
            actions.key_down(Keys.CONTROL).click(photo).key_up(Keys.CONTROL).perform()
            print('共打开', len(wd.window_handles) - 1, '个页面')

    def downlaod_URL(self, wd):
        """Download the image currently shown in the full-screen gallery.

        The file is written into ``self.download_dir`` under the name built
        by ``renamed_photo``. A missing ``src`` or a failed HTTP request is
        reported on stdout and skipped instead of writing a bogus file.
        """
        url = wd.find_element(By.CSS_SELECTOR, '.bili-gallery__content>img').get_attribute('src')
        print(url)
        if not url:
            print('image element has no src, skipped')
            return
        photo_Name = self.renamed_photo(wd, url)
        try:
            # A timeout keeps one stalled transfer from hanging the session;
            # raise_for_status keeps HTTP error pages from being saved as images.
            r = requests.get(url, timeout=30)
            r.raise_for_status()
        except requests.RequestException as exc:
            print('download failed, skipped:', exc)
            return
        # Write to an explicit path instead of changing the process-wide cwd.
        os.makedirs(self.download_dir, exist_ok=True)
        print('当前下载路径：', self.download_dir)
        with open(os.path.join(self.download_dir, photo_Name), 'wb') as f:
            f.write(r.content)

    def open_and_download(self, wd):
        """Visit every opened tab except the first and download its images.

        For a multi-image post the first image is skipped on purpose: the
        gallery is advanced once before each download.
        """
        windows = wd.window_handles
        print('窗口总数', len(windows))
        # NOTE(review): assumes windows[0] is the album tab - confirm ordering.
        for window_num in range(1, len(windows)):
            print('正在下载第', window_num, '个页面')
            wd.implicitly_wait(0.5)
            wd.switch_to.window(windows[window_num])
            wd.implicitly_wait(3)
            # Open the picture preview.
            wd.find_element(By.CSS_SELECTOR, '.bili-album__preview__picture>div').click()
            # Skip posts whose image failed to load.
            if self.ignored_next(wd) == 1:
                continue
            time.sleep(0.5)
            # Switch the preview to the full-screen gallery.
            wd.find_element(By.CSS_SELECTOR, "[class='bili-album__watch__control__option full-screen']").click()
            totals = wd.find_elements(By.CSS_SELECTOR, '.bili-gallery__pagination__total')
            if not totals:
                print('【只有一张图】')
                self.downlaod_URL(wd)
            else:
                # The counter text looks like "/ N".
                gallery_nums = int(totals[0].text.replace('/ ', ''))
                print('【多图共', gallery_nums, '张】')
                for _ in range(1, gallery_nums):
                    # Advance first, so image 1 is never downloaded.
                    wd.find_element(By.CSS_SELECTOR, '.bili-gallery__nav__next').click()
                    self.downlaod_URL(wd)
                time.sleep(3)

    def renamed_photo(self, wd, url) -> str:
        """Build a file name from the post time, the post text and the image URL.

        Returns:
            ``"[<time>]<text><url>"`` with characters that are illegal in
            Windows file names removed (``:`` is turned into ``-``).
        """
        photo_time = wd.find_element(By.CSS_SELECTOR, '[data-module="time"]').text
        photo_man = wd.find_element(By.CSS_SELECTOR, '.bili-rich-text__content>span').text
        photo_Name = self._build_photo_name(photo_time, photo_man, url)
        print(photo_Name)
        return photo_Name

    @classmethod
    def _build_photo_name(cls, photo_time: str, photo_man: str, url: str) -> str:
        """Assemble the file name from already-extracted text (no browser access)."""
        # Presumably the time text looks like "YYYY-MM-DD HH:MM"; the slice
        # swaps the character at index 13 for '-'. Other formats are still
        # made file-name safe by the sanitising step below.
        stamp = photo_time[0:13] + '-' + photo_time[14:16]
        # Only the first 25 characters of the post text are kept.
        title = photo_man[0:25]
        tail = url.replace('https://', '').replace('/', '-')
        return (
            "[" + cls._sanitize_filename(stamp) + "]"
            + cls._sanitize_filename(title)
            + cls._sanitize_filename(tail)
        )

    @classmethod
    def _sanitize_filename(cls, text: str) -> str:
        """Return *text* with ':' replaced by '-' and other illegal characters removed."""
        return text.translate(cls._FILENAME_TABLE)

    def close_window(self, wd, mainWindow):
        """Close every tab whose title contains '动态', then focus ``mainWindow``.

        ``mainWindow`` itself is never closed, so the final switch cannot
        fail because the album tab happened to match the title test.
        """
        for handle in wd.window_handles:
            if handle == mainWindow:
                continue
            wd.switch_to.window(handle)
            if '动态' in wd.title:
                wd.close()
        wd.switch_to.window(mainWindow)

    def is_pop(self, wd):
        """Click the close button of every mini popup currently present."""
        # find_elements returns an empty list instead of raising when absent.
        for ad in wd.find_elements(By.CSS_SELECTOR, '.bili-mini .bili-mini-close'):
            print('找到弹窗')
            ad.click()

    def ignored_next(self, wd):
        """Return 1 when the page shows a broken-image marker, otherwise None."""
        broken = wd.find_elements(By.CSS_SELECTOR,
                                  '.bili-album__error')
        if broken:
            print('【这张图裂了。。。。。。】')
            return 1
        return None

    def Sound_beep(self):
        """Play a one-second 440 Hz beep (via ``winsound``, Windows only)."""
        duration = 1000  # millisecond
        freq = 440  # Hz
        winsound.Beep(freq, duration)


# Script entry point: build a downloader for the sample album and run it.
# Guarded so that importing this module does not launch a browser.
if __name__ == "__main__":
    Bilibili(Vol='https://space.bilibili.com/1581895468/album').mainrun()

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。