Scraping All Notices from the WUST Academic Affairs Office in 20 Lines of Python

Goal

Scrape the notice titles and publish dates from the WUST Academic Affairs Office homepage (https://jwc.wust.edu.cn/) and save them to a CSV file.

Let's go straight to the code; it's a simple scraper.

XPath implementation

from lxml import etree
import requests
import csv

url = 'https://jwc.wust.edu.cn/'

# Fetch the homepage and let requests guess the correct encoding
r = requests.get(url)
r.encoding = r.apparent_encoding
c = r.text

# Absolute XPath to the <li> elements that hold each notice
s = '/html/body/div[3]/div[1]/div/div/div[2]/div/div/div/div/div[2]/div/ul/li'

html = etree.HTML(c)
# Publish dates, and notice titles taken from each link's title attribute
date_lst = html.xpath(s + '/div[2]/span/text()')
url_lst = html.xpath(s + '/div[1]/span[2]/a/@title')

# Write one (title, date) row per notice
f = open("data2.csv", mode="w", encoding="utf-8", newline='')
cw = csv.writer(f)
for i in zip(url_lst, date_lst):
    cw.writerow(i)

f.close()
r.close()
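The absolute XPath above is brittle: any change to the page layout breaks it. A more resilient variant anchors on the class names instead. This is a sketch assuming the same Article_Title and Article_PublishDate class names that the regex and bs4 versions below rely on; data2b.csv is an arbitrary output name.

from lxml import etree
import requests
import csv

url = 'https://jwc.wust.edu.cn/'
r = requests.get(url)
r.encoding = r.apparent_encoding
html = etree.HTML(r.text)

# Anchor on class names rather than the full path from <body>
titles = html.xpath("//span[@class='Article_Title']/a/@title")
dates = html.xpath("//span[@class='Article_PublishDate']/text()")

with open('data2b.csv', mode='w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(zip(titles, dates))
r.close()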

The re (regex) implementation is also very simple

import requests
import csv
import re

url = 'https://jwc.wust.edu.cn/'

r = requests.get(url)
r.encoding = r.apparent_encoding
c = r.text

# Named groups capture the link's title attribute and the publish date;
# re.S lets .*? match across newlines
o = re.compile(r"<span class='Article_Title'>.*?title='"
               r"(?P<title>.*?)'>"
               r".*?<span class='Article_PublishDate'>(?P<time>.*?)</span>",
               re.S)

f = open('data1.csv', mode="w", encoding='utf-8', newline='')
cw = csv.writer(f)
# finditer yields one match object per notice
for i in o.finditer(c):
    cw.writerow(i.groupdict().values())

f.close()
r.close()
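To see what the pattern captures, you can run it against a hand-written snippet in the same markup style. The snippet below is made up for illustration, not taken from the live page:

import re

o = re.compile(r"<span class='Article_Title'>.*?title='"
               r"(?P<title>.*?)'>"
               r".*?<span class='Article_PublishDate'>(?P<time>.*?)</span>",
               re.S)

sample = ("<li><span class='Article_Title'>"
          "<a href='/info/1.htm' title='Exam schedule notice'>Exam schedule notice</a></span>"
          "<span class='Article_PublishDate'>2023-05-01</span></li>")

for m in o.finditer(sample):
    print(m.groupdict())  # {'title': 'Exam schedule notice', 'time': '2023-05-01'}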

bs4 (BeautifulSoup) implementation

import requests
import csv
from bs4 import BeautifulSoup

url = 'https://jwc.wust.edu.cn/'

r = requests.get(url)
r.encoding = r.apparent_encoding
c = r.text

html = BeautifulSoup(c, 'html.parser')

# Collect each notice title from the link's title attribute
lst = []
for span in html.find_all('span', class_='Article_Title'):
    lst.append(span.find('a').get('title'))

# Collect the matching publish dates
tlst = []
for span in html.find_all('span', attrs={'class': 'Article_PublishDate'}):
    tlst.append(span.text)

f = open('data3.csv', mode="w", encoding='utf-8', newline='')
cw = csv.writer(f)
for i in zip(lst, tlst):
    cw.writerow(i)

f.close()
r.close()
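bs4 also supports CSS selectors via select(), which expresses the same extraction more compactly. A sketch under the same class-name assumptions, with data3b.csv as an arbitrary output name:

import requests
import csv
from bs4 import BeautifulSoup

url = 'https://jwc.wust.edu.cn/'
r = requests.get(url)
r.encoding = r.apparent_encoding
soup = BeautifulSoup(r.text, 'html.parser')

# select() takes CSS selectors; grab the link inside each title span
titles = [a.get('title') for a in soup.select("span.Article_Title a")]
dates = [s.text for s in soup.select("span.Article_PublishDate")]

with open('data3b.csv', mode='w', encoding='utf-8', newline='') as f:
    csv.writer(f).writerows(zip(titles, dates))
r.close()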

Output: data.csv, one (title, publish date) row per notice. (Screenshot of the result omitted.)


Summary

The Academic Affairs Office site has essentially no anti-scraping measures: no bot detection and no hotlink protection, so the data can be scraped freely.
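Even so, it is good practice to identify your client and throttle requests. A minimal sketch; the User-Agent string, timeout, and delay below are arbitrary choices, not anything the site requires:

import time
import requests

url = 'https://jwc.wust.edu.cn/'
# A browser-like User-Agent; some sites reject the default python-requests UA
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

r = requests.get(url, headers=headers, timeout=10)
r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
time.sleep(1)         # pause between requests to avoid hammering the server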