基于某网站的信息爬取与保存

原创

Shen Liang 2023-02-21 09:33:03 博主文章分类：Python案例 ©著作权

文章标签 python爬虫 BeautifulSoup selenium Chrome xml 文章分类 JavaScript 前端开发

©著作权归作者所有：来自51CTO博客作者Shen Liang的原创作品，请联系作者获取转载授权，否则将追究法律责任

需求：对某网站实现动态爬取并能每天保存到文本文件中

解决方法：使用 Python 的 selenium 驱动 Chrome 浏览器加载动态页面，再用 BeautifulSoup（lxml 解析器）解析页面源码，逐页提取当天的记录并写入以日期命名的文本文件。

完整代码：

import json
import urllib.request
import urllib.error
from urllib.parse import quote
from bs4 import BeautifulSoup
from builtins import str

from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium.common.exceptions import NoSuchElementException
import re
import time
import datetime
import sys
sys.setrecursionlimit(1000000)
import os

from selenium.webdriver.common.keys import Keys

def getQuestionsLinks(driver, out=None):
    """Scrape today's rows from the list pages and write them to a text file.

    Starting from the page currently loaded in ``driver``, every
    ``<tr class="bgcol">`` row containing an ``<a class="xjxd_nr">`` link is
    written as one tab-separated line: the row's non-empty text cells
    followed by the detail URL rebuilt from the link's ``onclick``
    arguments. The function then clicks the "下一页" (next page) link and
    repeats.

    Scraping stops when any of the following happens:
      * the page has fewer than two matching rows,
      * the date read from the second matching row is not today's date,
      * a row is dated earlier than that first date,
      * no "下一页" link can be found.

    Args:
        driver: Selenium WebDriver with the list page already loaded.
        out: Optional writable text stream. When omitted, the module-level
            file object ``f`` is used, as in the original script.

    Returns:
        None. Results are written to the output stream.
    """
    sink = f if out is None else out

    # Pages are walked in a loop rather than by recursion, so the depth of
    # the call stack no longer grows with the number of pages.
    while True:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        rows = soup.find_all('tr', {'class': 'bgcol'})

        # days=0 means today, days=1 would mean yesterday.
        target_day = str(datetime.date.today() - datetime.timedelta(days=0))
        print(target_day)

        # rows[1] is used as the reference row below; bail out cleanly
        # instead of raising IndexError when the page has no such row.
        if len(rows) < 2:
            print("No more pages found")
            return

        first_cells = rows[1].get_text().replace('\t', '').replace('\n', '|').split('|')
        if len(first_cells) < 2:
            print("No usefull Info")
            return
        # The second-to-last cell of the unfiltered split is taken as the
        # date of the reference row (same expression as the original).
        first_day = first_cells[-2]
        print(first_day)

        # Only continue when the newest entry on the page is dated today.
        if first_day != target_day:
            return

        # first_day equals an ISO date string at this point, so it can be
        # parsed once per page instead of once per row.
        first_struct = time.strptime(first_day, "%Y-%m-%d")

        for row in rows:
            anchor = row.find('a', {'class': 'xjxd_nr'})
            if anchor is None:
                print("No usefull Info")
                continue

            # onclick is expected to look like detail('id','flag','tag');
            # strip the call wrapper and quotes, then drop the last two
            # characters (presumably the closing ");" - verify against the page).
            call_args = anchor.get('onclick').replace('detail(', '').replace("'", '')[0:-2]
            args = call_args.split(',')
            detail_url = ('http://www.shenl.com.cn/todetail?id=' + args[0]
                          + '&isSearchPassWord=' + args[1]
                          + '&tag=' + args[2])

            raw_cells = row.get_text().replace('\t', '').replace('\n', '|').split('|')
            cells = [cell for cell in raw_cells if cell != '']

            # The last non-empty cell is parsed as the row's date.
            row_struct = time.strptime(cells[-1], "%Y-%m-%d")
            print(row_struct)
            print(first_struct)

            age_in_days = (datetime.datetime(*first_struct[:3])
                           - datetime.datetime(*row_struct[:3])).days
            # A row older than the first date ends the whole crawl.
            if age_in_days > 0:
                return

            sink.write('\t'.join(cells) + "\t" + detail_url + "\n")

        try:
            print(type(driver.find_element(By.LINK_TEXT, "下一页")))
            # find_element(By.XPATH, ...) replaces find_element_by_xpath,
            # which newer Selenium releases no longer provide.
            driver.find_element(By.XPATH, "//a[contains(text(),'下一页')]").click()
        except NoSuchElementException:
            time.sleep(1)
            print("No more pages found")
            return

        # Give the next page time to load before parsing it.
        time.sleep(4)


if __name__ == '__main__':
    # Script entry point: open today's output file, start Chrome, load the
    # list page and scrape it with getQuestionsLinks.
    #
    # Imported here so this block brings its own dependency into scope.
    from selenium.webdriver.chrome.service import Service

    IsoTimeFormat = '%Y_%m_%d'
    output_path = 'G:\\temp\\Question_Incr_' + str(time.strftime(IsoTimeFormat)) + '.txt'

    # getQuestionsLinks writes to the module-level name ``f``; the ``with``
    # block binds that name and guarantees the file is closed on any exit.
    with open(output_path, 'w', encoding='utf-8') as f:
        # Raw string: the path contains backslashes that are not valid
        # escape sequences. The driver path is passed through Service, as
        # current Selenium 4 releases require.
        driver = webdriver.Chrome(
            service=Service(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
        )
        try:
            driver.get("http://www.shenl.com.cn/xjxdList")
            # Wait for the dynamically rendered list to appear.
            time.sleep(3)
            getQuestionsLinks(driver)
        finally:
            # quit() ends the browser and the chromedriver process, even
            # when scraping fails; close() only closed the window.
            driver.quit()