Scraping All Notices from the WUST Academic Affairs Office in 20 Lines of Python

Goal

Scrape the notice titles and publish dates from the WUST Academic Affairs Office homepage (https://jwc.wust.edu.cn/) and save them to a CSV file.

Let's go straight to the code; it's a simple scraper.

XPath implementation

from lxml import etree
import requests
import csv

url = 'https://jwc.wust.edu.cn/'

# Fetch the homepage and let requests guess the correct encoding
r = requests.get(url)
r.encoding = r.apparent_encoding
c = r.text

# Absolute XPath to the <li> elements that hold each notice
s = '/html/body/div[3]/div[1]/div/div/div[2]/div/div/div/div/div[2]/div/ul/li'

html = etree.HTML(c)
# Publish dates, and notice titles taken from each link's title attribute
date_lst = html.xpath(s + '/div[2]/span/text()')
url_lst = html.xpath(s + '/div[1]/span[2]/a/@title')

# Write one (title, date) row per notice
f = open("data2.csv", mode="w", encoding="utf-8", newline='')
cw = csv.writer(f)
for i in zip(url_lst, date_lst):
    cw.writerow(i)

f.close()
r.close()
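The absolute XPath above is brittle: any change to the page layout breaks it. A more resilient variant anchors on the class names instead. This is a sketch assuming the same Article_Title and Article_PublishDate class names that the regex and bs4 versions below rely on; data2b.csv is an arbitrary output name.

from lxml import etree
import requests
import csv

url = 'https://jwc.wust.edu.cn/'
r = requests.get(url)
r.encoding = r.apparent_encoding
html = etree.HTML(r.text)

# Anchor on class names rather than the full path from <body>
titles = html.xpath("//span[@class='Article_Title']/a/@title")
dates = html.xpath("//span[@class='Article_PublishDate']/text()")

with open('data2b.csv', mode='w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(zip(titles, dates))
r.close()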

The re (regex) implementation is also very simple

import requests
import csv
import re

url = 'https://jwc.wust.edu.cn/'

r = requests.get(url)
r.encoding = r.apparent_encoding
c = r.text

# Named groups capture the link's title attribute and the publish date;
# re.S lets .*? match across newlines
o = re.compile(r"<span class='Article_Title'>.*?title='"
               r"(?P<title>.*?)'>"
               r".*?<span class='Article_PublishDate'>(?P<time>.*?)</span>",
               re.S)

f = open('data1.csv', mode="w", encoding='utf-8', newline='')
cw = csv.writer(f)
# finditer yields one match object per notice
for i in o.finditer(c):
    cw.writerow(i.groupdict().values())

f.close()
r.close()
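To see what the pattern captures, you can run it against a hand-written snippet in the same markup style. The snippet below is made up for illustration, not taken from the live page:

import re

o = re.compile(r"<span class='Article_Title'>.*?title='"
               r"(?P<title>.*?)'>"
               r".*?<span class='Article_PublishDate'>(?P<time>.*?)</span>",
               re.S)

sample = ("<li><span class='Article_Title'>"
          "<a href='/info/1.htm' title='Exam schedule notice'>Exam schedule notice</a></span>"
          "<span class='Article_PublishDate'>2023-05-01</span></li>")

for m in o.finditer(sample):
    print(m.groupdict())  # {'title': 'Exam schedule notice', 'time': '2023-05-01'}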

bs4 (BeautifulSoup) implementation

import requests
import csv
from bs4 import BeautifulSoup

url = 'https://jwc.wust.edu.cn/'

r = requests.get(url)
r.encoding = r.apparent_encoding
c = r.text

html = BeautifulSoup(c, 'html.parser')

# Collect each notice title from the link's title attribute
lst = []
for span in html.find_all('span', class_='Article_Title'):
    lst.append(span.find('a').get('title'))

# Collect the matching publish dates
tlst = []
for span in html.find_all('span', attrs={'class': 'Article_PublishDate'}):
    tlst.append(span.text)

f = open('data3.csv', mode="w", encoding='utf-8', newline='')
cw = csv.writer(f)
for i in zip(lst, tlst):
    cw.writerow(i)

f.close()
r.close()
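bs4 also supports CSS selectors via select(), which expresses the same extraction more compactly. A sketch under the same class-name assumptions, with data3b.csv as an arbitrary output name:

import requests
import csv
from bs4 import BeautifulSoup

url = 'https://jwc.wust.edu.cn/'
r = requests.get(url)
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, 'html.parser')

# select() takes CSS selectors; grab the link inside each title span
titles = [a.get('title') for a in soup.select("span.Article_Title a")]
dates = [s.text for s in soup.select("span.Article_PublishDate")]

with open('data3b.csv', mode='w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(zip(titles, dates))
r.close()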

Output: data.csv, one (title, publish date) row per notice. (Screenshot of the result omitted.)


Summary

The Academic Affairs Office site has essentially no anti-scraping measures: no bot detection and no hotlink protection, so the data can be scraped freely.
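Even so, it is good practice to identify your client and throttle requests. A minimal sketch; the User-Agent string, timeout, and delay below are arbitrary choices, not anything the site requires:

import time
import requests

url = 'https://jwc.wust.edu.cn/'
# A browser-like User-Agent; some sites reject the default python-requests UA
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
time.sleep(1)         # pause between requests to avoid hammering the server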