小白初入 Python,借鉴了一些源码并做了一些改进,选了湖大硕士招生分数线的一个表格进行爬取。成功是成功了,咳咳,不过还有很多需要改进的地方。啥也不说了,直接上源码~

# -*- coding:utf-8 -*-
# The author is Sympathy
from bs4 import BeautifulSoup
import requests
import csv
import bs4
# 用于抓取湖大硕士生招生初试线表格数据
def check_link(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page text, or None when the request fails (connection
        problem, timeout, or an HTTP error status). A message is printed
        in the failure case.
    """
    try:
        # Without a timeout requests waits indefinitely on a stalled server.
        r = requests.get(url, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Only network/HTTP failures are reported here; programming errors
        # and KeyboardInterrupt are no longer swallowed.
        print('无法连接服务器')
        return None
    # Decode using the encoding detected from the body content.
    r.encoding = r.apparent_encoding
    return r.text

def get_contents(ulist, rurl):
    """Collect the contents of every table row found in an HTML document.

    Args:
        ulist: List that receives one entry per ``<tr>`` element; it is
            extended in place.
        rurl: HTML markup to parse (text, not a URL).

    Each appended entry is a list holding the ``.string`` attribute of
    every child yielded by iterating the row, in document order.
    """
    document = BeautifulSoup(rurl, 'lxml')
    for row in document.find_all('tr'):
        ulist.append([child.string for child in row])


# Number of blank cells prepended to specific table rows before the output
# columns are picked; every row not listed here gets _DEFAULT_LEFT_PAD.
_LEFT_PAD_BY_ROW = {0: 0, 1: 0, 15: 0, 32: 4, 34: 4, 36: 4, 35: 6, 37: 6}
_DEFAULT_LEFT_PAD = 2
# The highest position read from a padded row is 13, so it needs 14 slots.
_MIN_PADDED_LEN = 14


def _layout_row(index, row):
    """Return the seven CSV cells for table row *index* without mutating *row*."""
    left = _LEFT_PAD_BY_ROW.get(index, _DEFAULT_LEFT_PAD)
    padded = [' '] * left + list(row) + [' '] * 9
    # Rows with very few cells would otherwise be too short to index.
    shortfall = _MIN_PADDED_LEN - len(padded)
    if shortfall > 0:
        padded.extend([' '] * shortfall)
    # Only the odd positions 1, 3, ..., 13 are written out.
    # NOTE(review): presumably the even positions hold the whitespace nodes
    # that sit between cells in the parsed rows; confirm against the page.
    return padded[1:_MIN_PADDED_LEN:2]


def save_contents(urlist, path="E:/code/captureweb/2018年湖大初试成绩线.csv"):
    """Write the scraped table rows to a CSV file.

    Args:
        urlist: Sequence of rows, each a list of cell values. The rows are
            read only; they are not modified.
        path: Destination file. Defaults to the location the script has
            always written to.

    The file starts with a one-cell title row, followed by one seven-cell
    row per entry of *urlist*. Rows are shifted right by a row-specific
    number of blank cells (see ``_LEFT_PAD_BY_ROW``) and padded with blanks
    on the right, so short rows produce blank cells instead of raising
    IndexError.

    Raises:
        OSError: If *path* cannot be opened for writing.
    """
    # An explicit encoding avoids UnicodeEncodeError on systems whose default
    # codec cannot represent Chinese text; the BOM written by utf-8-sig is
    # what lets spreadsheet tools recognise the file as UTF-8.
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.writer(f)
        writer.writerow(['2018湖大初试成绩'])
        for index, row in enumerate(urlist):
            writer.writerow(_layout_row(index, row))
def main():
    """Fetch the score table page, parse its rows, and save them as CSV.

    Stops early when the page cannot be downloaded, since check_link
    returns None in that case and there is nothing to parse.
    """
    url = "http://gra.hnu.edu.cn/info/1075/4129.htm"
    page = check_link(url)
    if page is None:
        # check_link has already reported the failure to the user.
        return
    rows = []
    get_contents(rows, page)
    save_contents(rows)


# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()