小白初入 Python,借鉴了一些源码并做了改进,选了湖大硕士招生分数线的一个表格进行爬取。爬取是成功了,不过还有很多需要改进的地方。啥也不说了,直接上源码~
# -*- coding:utf-8 -*-
# The author is Sympathy
from bs4 import BeautifulSoup
import requests
import csv
import bs4
# 用于抓取湖大硕士生招生初试线表格数据
def check_link(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests.get`` may block indefinitely.

    Returns:
        The decoded page text, or ``None`` when the request fails (network
        error, timeout, invalid URL, or an HTTP error status). A message is
        printed in the failure case.
    """
    try:
        r = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx responses into an exception so they are reported too.
        r.raise_for_status()
    except requests.RequestException:
        print('无法连接服务器')
        return None
    # Use the encoding detected from the body rather than the one declared
    # in the HTTP headers before decoding the text.
    r.encoding = r.apparent_encoding
    return r.text
def get_contents(ulist, rurl):
    """Append one list per ``<tr>`` element found in the HTML text *rurl*.

    Args:
        ulist: List that receives the rows; it is extended in place.
        rurl: HTML markup to parse (despite the name, this is page text,
            not a URL).

    Each appended row holds the ``.string`` value of every direct child
    node of the ``<tr>`` tag, in document order. The children are taken as
    they come from iterating the tag, so the row is not restricted to
    ``<td>`` elements.
    """
    document = BeautifulSoup(rurl, 'lxml')
    for row in document.find_all('tr'):
        ulist.append([child.string for child in row])
def save_contents(urlist, path="E:/code/captureweb/2018年湖大初试成绩线.csv",
                  encoding=None):
    """Write the scraped table rows to a CSV file.

    A title row is written first. Each input row is then shifted right by
    a number of blank cells that depends on its index, padded on the right
    with blanks, and the cells at odd positions 1, 3, ..., 13 are written
    as one CSV row of seven fields.

    Args:
        urlist: Sequence of rows, each a sequence of cell values. The rows
            are not modified; padding is applied to a copy.
        path: Destination file. Defaults to the original hard-coded
            location.
        encoding: Text encoding passed to ``open``. ``None`` keeps the
            platform default.

    Rows that are too short to reach position 13 are filled with blank
    cells instead of raising ``IndexError``.
    """
    # Blank cells to prepend, keyed by row index; unlisted rows get two.
    # NOTE(review): these indices are specific to the scraped table --
    # presumably they compensate for cells merged across rows; confirm
    # against the page before reusing with another table.
    leading_blanks = {0: 0, 1: 0, 15: 0, 32: 4, 34: 4, 36: 4, 35: 6, 37: 6}
    default_leading = 2
    trailing_blanks = 9
    # Highest index read is 13, so every padded row needs 14 cells.
    required_length = 14

    with open(path, 'w', newline='', encoding=encoding) as f:
        writer = csv.writer(f)
        writer.writerow(['2018湖大初试成绩'])
        for i, row in enumerate(urlist):
            lead = leading_blanks.get(i, default_leading)
            cells = [' '] * lead + list(row) + [' '] * trailing_blanks
            shortfall = required_length - len(cells)
            if shortfall > 0:
                cells.extend([' '] * shortfall)
            # Keep only the odd positions 1, 3, ..., 13.
            writer.writerow(cells[1:required_length:2])
def main():
    """Fetch the admissions score page, extract its table rows, save as CSV."""
    urli = []
    url = "http://gra.hnu.edu.cn/info/1075/4129.htm"
    rs = check_link(url)
    if rs is None:
        # The download failed and check_link has already printed a message;
        # there is no markup to parse, so stop instead of crashing later.
        return
    get_contents(urli, rs)
    save_contents(urli)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()