今年由于疫情原因,考研复试、调剂纷纷推迟,时至5月20日,才开通考研调剂系统。

但是调剂信息量非常大,毕竟中国大学多到数不清,而且一所学校不止一条调剂信息,可想而知,信息量有多大。虽然系统有一部分筛选条件,但是这些筛选条件可能依然不能满足需求,这就需要把所有可能需要的数据爬取下来,进行进一步的筛选。

1 前期工作

  • 打开研招网

python 爬取高考各学校各专业录取分数线 用python爬取考研信息_json

  • 点击网上调剂

python 爬取高考各学校各专业录取分数线 用python爬取考研信息_json_02

  • 此时就可以根据条件进行查找调剂信息

python 爬取高考各学校各专业录取分数线 用python爬取考研信息_考研调剂_03

  • 在专业的地方可以输入你想要查询的专业,比如“数学”;先按 F12 打开浏览器开发者工具的“网络”面板,然后如图依次进行操作即可

python 爬取高考各学校各专业录取分数线 用python爬取考研信息_爬虫_04

  • 点击消息头,查看请求网址、方法和请求头(包括cookies)

python 爬取高考各学校各专业录取分数线 用python爬取考研信息_爬虫_05

python 爬取高考各学校各专业录取分数线 用python爬取考研信息_json_06

  • 点击参数,查看表单数据

python 爬取高考各学校各专业录取分数线 用python爬取考研信息_python_07

2 代码编写

import json

import requests


def find_school(start, zymc, timeout=10):
    """Fetch one page of adjustment (调剂) openings for a major from yz.chsi.com.cn.

    Sends the same form POST that the site's "网上调剂" search page issues and
    returns the list found at ``data.vo_list.vos`` in the JSON reply.

    Args:
        start: Pagination offset sent as the ``start`` form field. The calling
            script passes ``""`` for the first page and ``str(page * 20)``
            for later pages (the page size sent here is 20).
        zymc: Major name to search for, e.g. ``"数学"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawl indefinitely.

    Returns:
        The list of record dicts for the requested page.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-2xx HTTP status.
        ValueError: If the response body is not valid JSON.
        KeyError, TypeError: If the JSON lacks the ``data.vo_list.vos`` path
            (presumably what happens once the session cookie below has
            expired -- TODO confirm).
    """
    # Headers copied from a browser session. Two copied headers are left out
    # on purpose:
    #   * "Content-Length": requests computes it from the encoded form body;
    #     the browser's value ("123") does not match other search terms.
    #   * "Accept-Encoding": the copied value advertised "br", which can only
    #     be decoded when a Brotli package is installed. Letting requests set
    #     this header means only decodable encodings are advertised.
    # NOTE: the Cookie value is tied to one browser session; replace it with
    # your own when it stops working.
    headers = {
        "Host": "yz.chsi.com.cn",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:73.0) Gecko/20100101 Firefox/73.0",
        "Accept": "*/*",
        "Accept-Language": "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2",
        "Content-Type": "application/x-www-form-urlencoded; charset=UTF-8",
        "X-Requested-With": "XMLHttpRequest",
        "Origin": "https://yz.chsi.com.cn",
        "Connection": "keep-alive",
        "Referer": "https://yz.chsi.com.cn/sytj/tj/qecx.html",
        "Cookie": "JSESSIONID=978BA64E6F4530EDB50FE43A3C805EBE; _ga=GA1.3.1200733016.1569507824; zg_did=%7B%22did%22%3A%20%2216d6df4d287815-0a7df95b3ba0b78-4c312272-144000-16d6df4d288505%22%7D; zg_adfb574f9c54457db21741353c3b0aa7=%7B%22sid%22%3A%201590297728210%2C%22updated%22%3A%201590298426969%2C%22info%22%3A%201589874544864%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22www.baidu.com%22%2C%22landHref%22%3A%20%22https%3A%2F%2Fyz.chsi.com.cn%2F%22%2C%22cuid%22%3A%20%226dc3b88f096e00fec19e8c56fc31b1e3%22%7D; _ga=GA1.4.1200733016.1569507824; zg_0d76434d9bb94abfaa16e1d5a3d82b52=%7B%22sid%22%3A%201569508637357%2C%22updated%22%3A%201569510245680%2C%22info%22%3A%201569508637361%2C%22superProperty%22%3A%20%22%7B%7D%22%2C%22platform%22%3A%20%22%7B%7D%22%2C%22utm%22%3A%20%22%7B%7D%22%2C%22referrerDomain%22%3A%20%22my.chsi.com.cn%22%2C%22cuid%22%3A%20%226dc3b88f096e00fec19e8c56fc31b1e3%22%7D; acw_tc=2760827b15879041262595723ec515fe39010551ca912d1067523ce103dcbf; _gid=GA1.3.770260242.1589874545; __utma=229973332.1200733016.1569507824.1589904202.1589904202.1; __utmz=229973332.1589904202.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); aliyungf_tc=AQAAAGUbgAPRoQoA1Qjjb9p0NlS6rwTj; JSESSIONID=386B18A445F1FE086FAB8E0CD8865D8B; XSRF-CCKTOKEN=3514cccbe2916ac49c3a4b561173a314; CHSICC_CLIENTFLAGYZ=7013645e7a0bb77e53e19b04fd77cf6a; CHSICC_CLIENTFLAGSYTJ=43fdc33b0ff004fe35738549a78838c4",
        "Pragma": "no-cache",
        "Cache-Control": "no-cache",
    }

    # Form fields as captured from the browser's request.
    data = {
        "pageSize": "20",
        "start": start,  # pagination offset
        "orderBy": "",
        "ssdm": "",
        "dwmc": "",
        "xxfs": "1",
        "zymc": zymc,  # major name
        "qers": "",
        "data_type": "json",
        "agent_from": "web",
        "pageid": "tj_qe_list",
    }

    url = "https://yz.chsi.com.cn/sytj/stu/sytjqexxcx.action"
    resp = requests.post(url, headers=headers, data=data, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    resp.raise_for_status()

    # The records for this page live under data -> vo_list -> vos.
    return resp.json()["data"]["vo_list"]["vos"]


# Number of records per page; matches the "pageSize" form field that
# find_school sends, so a shorter page means the last page was reached.
PAGE_SIZE = 20
# Upper bound on the number of pages requested for a single major.
MAX_PAGES = 20

# Major names to search for.
zymc_list = ["数学", "图论及其应用", "运筹学与控制论", "应用数学", "概率论与数理统计", "计算数学", "几何与代数", "控制论与智能优化", "数理统计", "大数据分析与应用",
             "应用统计和数据分析"]

# Accumulates the records of every page of every major, in request order.
all_school_list = []
for zymc in zymc_list:
    page = 0
    while page < MAX_PAGES:
        # The first page is requested with an empty offset; later pages use
        # the record offset as a string.
        offset = "" if page == 0 else str(page * PAGE_SIZE)
        page_records = find_school(offset, zymc)
        all_school_list.extend(page_records)
        # A page that is not full is the last one for this major.
        if len(page_records) < PAGE_SIZE:
            break
        page += 1

print(all_school_list)

说明: "qers": "余额人数", "fbsjStr": "发布时间", "hasit": "考生是否已经填报该志愿 true 或 false", "dwmc": "单位名称", "yxsmc": "院系所名称", "yjfxdm": "研究方向代码", "zymc": "专业名称", "zydm": "专业代码", "dwdm": "单位代码", "xxfs": "学习方式", "sfmzyq": "是否满足要求,空为满足要求,非空其内容为不满足要求原因", "bz": "调剂特殊要求", "gxsj": "距离最后更新时间已过xx分钟", "yjfxmc": "研究方向名称", "zt": "余额状态", "id": "余额信息ID", "yxsdm": "院系所代码", "ssdm": "省市代码"

运行结果:

[{
	'qers': 0,
	'fbsjStr': '',
	'hasit': False,
	'dwmc': '沈阳航空航天大学',
	'yxsmc': '理学院',
	'yjfxdm': '00',
	'zymc': '数学',
	'zydm': '070100',
	'dwdm': '10143',
	'xxfs': '1',
	'sfmzyq': '',
	'bz': '详见研究生院官网学院调剂公告',
	'gxsj': 1812,
	'yjfxmc': '不区分研究方向',
	'zt': '只公布有计划余额',
	'id': 'glygw21dkpjb7vlj',
	'yxsdm': '012',
	'ssdm': ''
}, 
... ... ... ...
{
	'qers': 0,
	'fbsjStr': '',
	'hasit': False,
	'dwmc': '宝鸡文理学院',
	'yxsmc': '数学与信息科学学院',
	'yjfxdm': '00',
	'zymc': '计算数学',
	'zydm': '070102',
	'dwdm': '10721',
	'xxfs': '1',
	'sfmzyq': '',
	'bz': '本科修读专业原则上为数学、应用数学、信息与计算科学、概率统计等专业。',
	'gxsj': 146,
	'yjfxmc': '不区分研究方向',
	'zt': '只公布有计划余额',
	'id': 'opiep577yqad0xx9',
	'yxsdm': '009',
	'ssdm': ''
}]