需求:实现对某网站的动态爬取,并每天将爬取结果保存到文本文件中。
解决方法:通过Python的BeautifulSoup、selenium完成该需求。
完整代码:
import json
import urllib.request
import urllib.error
from urllib.parse import quote
from bs4 import BeautifulSoup
from builtins import str
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
from urllib.request import urlopen
from selenium.common.exceptions import NoSuchElementException
import re
import time
import datetime
import sys
# Raise the interpreter's recursion limit to 1,000,000 frames.
# NOTE(review): a limit this high effectively disables Python's recursion
# guard -- very deep recursion may then crash the process with a native stack
# overflow instead of raising RecursionError. Confirm it is still needed.
sys.setrecursionlimit(1000000)
import os
from selenium.webdriver.common.keys import Keys
def getQuestionsLinks(driver, days_back=0):
    """Crawl the paginated listing and write the target day's rows to ``f``.

    The page currently loaded in ``driver`` is parsed with BeautifulSoup and
    every ``tr.bgcol`` row that contains an ``a.xjxd_nr`` link is written to
    the module-level file object ``f`` as tab-separated cell texts followed
    by the detail-page URL. The "next page" link is then clicked and the
    process repeats.

    Crawling stops when any of the following holds:
      * the page has fewer than two ``tr.bgcol`` rows;
      * the date read from the second row of the page differs from the
        target date;
      * a row is older than the date read from the second row;
      * no "next page" link can be found.

    Pagination is done with a loop rather than recursion, so the call depth
    does not grow with the number of pages.

    Args:
        driver: Selenium WebDriver already showing a listing page.
        days_back: Offset of the target date from today in days
            (0 = today, 1 = yesterday).

    Returns:
        None. Output goes to the module-level file object ``f``, which must
        be open for writing before this function is called.
    """
    date_format = "%Y-%m-%d"

    while True:
        soup = BeautifulSoup(driver.page_source, 'lxml')
        rows = soup.findAll('tr', {'class': 'bgcol'})

        target_day = str(datetime.date.today() - datetime.timedelta(days=days_back))
        print(target_day)

        # The reference date is read from the second matching row
        # (presumably the first one is a header row -- TODO confirm).
        if len(rows) < 2:
            print("No data rows found on page")
            return
        first_day = rows[1].get_text().replace('\t', '').replace('\n', '|').split('|')[-2]
        print(first_day)
        if first_day != target_day:
            return

        # Loop-invariant: parse the page's reference date only once.
        first_date = time.strptime(first_day, date_format)

        for row in rows:
            link = row.find('a', {'class': 'xjxd_nr'})
            if link is None:
                print("No usefull Info")
                continue

            # onclick looks like detail('<id>','<isSearchPassWord>','<tag>');
            # strip the wrapper and quotes, leaving the comma-separated values.
            params = link.get('onclick').replace('detail(', '').replace("'", '')[0:-2].split(',')
            detail_url = (
                'http://www.shenl.com.cn/todetail?id=' + params[0]
                + '&isSearchPassWord=' + params[1]
                + '&tag=' + params[2]
            )

            cells = row.get_text().replace('\t', '').replace('\n', '|').split('|')
            fields = [cell for cell in cells if cell != '']

            # The last non-empty cell holds the row's date.
            row_date = time.strptime(fields[-1], date_format)
            print(row_date)
            print(first_date)
            age_days = (datetime.datetime(*first_date[:3]) - datetime.datetime(*row_date[:3])).days
            if age_days > 0:
                # Row is older than the page's reference date: stop crawling.
                return

            f.write('\t'.join(fields) + "\t" + detail_url + "\n")

        try:
            print(type(driver.find_element(By.LINK_TEXT, "下一页")))
            driver.find_element(By.XPATH, "//a[contains(text(),'下一页')]").click()
        except NoSuchElementException:
            time.sleep(1)
            print("No more pages found")
            return

        # Fixed pause before parsing the next page (presumably to let it load).
        time.sleep(4)
if __name__ == '__main__':
    # Script entry point: open today's output file, start Chrome, load the
    # listing page and crawl it. The file and the browser session are released
    # even if the crawl raises.
    IsoTimeFormat = '%Y_%m_%d'
    output_path = 'G:\\temp\\Question_Incr_' + time.strftime(IsoTimeFormat) + '.txt'

    # `f` has to remain a module-level name: getQuestionsLinks() writes to it.
    with open(output_path, 'w', encoding='utf-8') as f:
        # Raw string: the path contains backslashes that are not valid escapes.
        driver = webdriver.Chrome(r"C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe")
        try:
            driver.get("http://www.shenl.com.cn/xjxdList")
            # Fixed pause after navigation (presumably to let the page load).
            time.sleep(3)
            getQuestionsLinks(driver)
        finally:
            # quit() ends the whole WebDriver session, not just the window.
            driver.quit()