概述

项目需要从网站上自动下载大量文件。

  • 起初打算使用PyQt5显示网站,并模拟网站的input填充、button或链接的点击、链接的下载等操作。PyQt5提供了QWebEngineView,可显示html字符串或加载某网址显示web页面,同时其页面对象提供了runJavaScript()方法来执行JavaScript,与web页面进行交互;python还有lxml包,可进行xpath操作。但想实现类似selenium的web自动化操作,依然较为困难,原因是QWebEngineView虽然集成了Chromium内核,但和浏览器差异较大,无法通过Qt进行随意操作。
  • 最终,不得不选择将selenium与Qt程序分开使用,以完成该任务。

pyqt5的使用

安装

创建虚拟环境,安装相关包。

# Set up a dedicated conda environment for the PyQt5 experiment.
# Create the virtual environment (named "pyqt5")
conda create --name pyqt5
# Activate the virtual environment so the installs below go into it
conda activate pyqt5

# Install Python and the PyQt5-related packages
conda install python=3.10
conda install pyqt
conda install PyQtWebEngine

# Package used for XPath operations on HTML
conda install lxml

试验代码

main.py

import sys

from PyQt5.QtWidgets import QApplication

from main_widget import MainWidget

if __name__ == '__main__':
    # The QApplication must exist before any widget is constructed.
    qt_app = QApplication(sys.argv)

    # Build the main window and put it on screen.
    window = MainWidget()
    window.show()

    # Run the Qt event loop and hand its return code back to the shell.
    exit_code = qt_app.exec_()
    sys.exit(exit_code)

main_widget.py

from PyQt5.QtCore import QUrl
from PyQt5.QtGui import QPalette, QColor
from PyQt5.QtWidgets import QWidget, QVBoxLayout, QLabel, QComboBox, QPushButton, QHBoxLayout
from lxml import etree

from web_engine_view import WebEngineView


class MainWidget(QWidget):
    """Main window: a small toolbar row above an embedded web view.

    The toolbar contains a station combo box and a "start download" button.
    Clicking the button fetches the HTML of the currently displayed page,
    rewrites the ``value`` attribute of the search input with lxml and loads
    the modified HTML back into the web view.
    """

    def __init__(self):
        super().__init__()
        # Both widgets are created in init_widget(); they are declared here
        # so the attributes exist from construction onwards.
        self.webview = None
        self.cmd_station = None
        self.init_widget()

    def init_widget(self):
        """Create the child widgets, lay them out and wire up the signals."""
        # --- top toolbar row -------------------------------------------------
        height = 25

        lb_station = QLabel("场站:")
        self.cmd_station = QComboBox()
        self.cmd_station.setFixedSize(200, height)

        btn_start_download = QPushButton("开始下载")
        btn_start_download.setFixedSize(100, height)

        top_layout = QHBoxLayout()
        # Stretches on both sides keep the controls centred horizontally.
        top_layout.addStretch()
        top_layout.addWidget(lb_station)
        top_layout.addWidget(self.cmd_station)
        top_layout.addWidget(btn_start_download)
        top_layout.addStretch()
        top_layout.setContentsMargins(0, 5, 0, 0)

        # --- separator line --------------------------------------------------
        # A 1-pixel-high label with a background colour acts as a divider.
        lb_split_line = QLabel()
        lb_split_line.setFixedHeight(1)
        lb_split_line.setStyleSheet("background-color: rgba(129,127,104,0.5);")

        # --- web view --------------------------------------------------------
        self.webview = WebEngineView()
        self.webview.load(QUrl('https://www.bilibili.com'))

        # --- main layout -----------------------------------------------------
        main_layout = QVBoxLayout()
        main_layout.addLayout(top_layout)
        main_layout.addWidget(lb_split_line)
        main_layout.addWidget(self.webview)
        main_layout.setContentsMargins(1, 1, 1, 1)
        self.setLayout(main_layout)

        # --- signal connections ----------------------------------------------
        btn_start_download.clicked.connect(self.slot_btn_start_download)

        # --- white window background -----------------------------------------
        # QPalette.Window replaces the obsolete QPalette.Background role.
        palette = QPalette()
        palette.setColor(QPalette.Window, QColor(255, 255, 255))
        self.setPalette(palette)

        self.showMaximized()

    def slot_btn_start_download(self):
        """Request the current page HTML; the result arrives in web_page_html()."""
        # Alternative approach kept for reference: manipulate the page
        # directly with JavaScript instead of round-tripping the HTML.
        # self.webview.page().runJavaScript("""
        # document.querySelector('.nav-search-input').value='ddddd';
        # document.querySelector('.nav-search-input').placeholder='aaaaa';
        # //document.querySelector('.nav-search-btn').click();
        # """)
        # toHtml() is asynchronous: it invokes the callback with the HTML string.
        self.webview.page().toHtml(self.web_page_html)

    # Optional callback for runJavaScript(), kept for reference:
    # def js_callback(self, result):
    #     print('js_callback: ', result)

    def web_page_html(self, html_str):
        """Fill the search input in ``html_str`` and reload the page from it.

        Args:
            html_str: HTML of the current page, as delivered by
                ``QWebEnginePage.toHtml``.

        The page is left unchanged when the HTML is empty, cannot be parsed,
        or does not contain the expected search input.
        """
        print(html_str)
        print('**********************')

        if not html_str:
            # Nothing to parse (e.g. the page has not finished loading).
            return

        encoding = 'utf-8'
        tree_html = etree.HTML(html_str.encode(encoding), etree.HTMLParser(encoding=encoding))
        if tree_html is None:
            return

        elements = tree_html.xpath('//input[@class="nav-search-input"]')
        if not elements:
            # Indexing an empty result list would raise IndexError.
            print('nav-search-input not found; page left unchanged')
            return
        elements[0].set('value', 'asdadfasf')

        new_html = etree.tostring(tree_html, method='html', encoding=encoding)
        print(new_html)

        # Pass the current URL as base URL so that relative and
        # protocol-relative resources in the page can still be resolved.
        self.webview.setHtml(new_html.decode(encoding), self.webview.url())

web_engine_view.py

from PyQt5.QtWebEngineWidgets import QWebEngineView


class WebEngineView(QWebEngineView):
    """Web view that reuses itself for new windows and prints URL changes."""

    def __init__(self):
        super().__init__()
        # Report every URL change of this view through slot_url_changed().
        self.urlChanged.connect(self.slot_url_changed)

    def slot_url_changed(self, url):
        """Print the URL passed along with the ``urlChanged`` signal."""
        print('url: ', url)

    def createWindow(self, type):
        """Return this view itself whenever a new window is requested.

        Returning ``self`` makes links clicked inside the current page load
        their target into this same view. The ``type`` argument is not used.
        """
        return self