QS世界大学排名有两个官方的网站,一个是国际网站,另一个是国内网站,上面的数据应该都是一样的,只是一个是英文,一个是中文。
综合排名
qsChina,也就是国内网站上的QS排名数据获取相对比较方便,因为它可以选择每页显示所有数据,而国际网站上的那个则每页最多100条数据。虽然有这样的问题,但我最后还是选择国际网站上的数据,因为有些大学的中文翻译很奇怪。
我主要采用的还是selenium,这个虽然很慢,但是比较稳定。
#encoding=utf-8
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
import time
import xlsxwriter
# Page to scrape and spreadsheet to produce for the 2022 overall ranking.
OVERALL_URL = 'https://www.topuniversities.com/university-rankings/world-university-rankings/2022'
OVERALL_OUTPUT = "QSRank2022.xlsx"

# Number of result pages walked by default.
OVERALL_PAGE_COUNT = 13

# Column headers, in the same order as OVERALL_CELL_SUFFIXES.
OVERALL_HEADERS = (
    'Rank',
    'University',
    'Location',
    'Overall Score',
    'International Students Ratio',
    'International Faculty Ratio',
    'Faculty Student Ratio',
    'Citations per Faculty',
    'Academic Reputation',
    'Employer Reputation',
)

# XPath of the n-th (1-based) entry of the result list. Entries are either
# university rows or ad blocks (ads carry a "customblock" attribute).
OVERALL_ROW_XPATH = '//*[@id="ranking-data-load_ind"]/div[{}]'

# XPath suffixes, appended to a row XPath, that locate each cell of the row:
# rank, university, location, then the seven indicator cells.
OVERALL_CELL_SUFFIXES = (
    '/div/div/div/div[1]/div/div/div/div/div[1]',
    '/div/div/div/div[1]/div/div/div/div/div[2]/div/div[1]',
    '/div/div/div/div[1]/div/div/div/div/div[2]/div/div[2]',
) + tuple(
    '/div/div/div/div[2]/div/div/div/div[{}]'.format(n) for n in range(1, 8)
)

# The "next" button moves around inside the pagination list, so it is matched
# by its class instead of a fixed index: the first <a> of any <li> from the
# third one onwards whose class is exactly "page-link next".
NEXT_PAGE_XPATH = (
    '//*[@id="alt-style-pagination"]'
    '/li[position() >= 3]/a[1][@class="page-link next"]'
)


def find_next_page_link(driver):
    """Return the pagination "next" link, or None when the page has none.

    Uses ``find_elements`` so that a missing link yields an empty list
    instead of an exception (and instead of an endless index scan).
    """
    links = driver.find_elements(By.XPATH, NEXT_PAGE_XPATH)
    return links[0] if links else None


def write_overall_page(driver, sheet, next_row):
    """Copy every university row of the current page into ``sheet``.

    Entries are probed one by one until the list runs out, so pages with
    any number of rows/ads are handled. Ad blocks are skipped.

    Args:
        driver: WebDriver showing a ranking page.
        sheet: xlsxwriter worksheet to write into.
        next_row: sheet row index for the first university written.

    Returns:
        The sheet row index following the last row written.

    Raises:
        NoSuchElementException: if a university row lacks an expected cell.
    """
    # Imported here because the file's import header only pulls in Edge/By.
    from selenium.common.exceptions import NoSuchElementException

    entry_index = 1
    while True:
        row_xpath = OVERALL_ROW_XPATH.format(entry_index)
        try:
            entry = driver.find_element(By.XPATH, row_xpath)
        except NoSuchElementException:
            break  # past the last entry of this page
        entry_index += 1

        if entry.get_attribute('customblock'):
            continue  # exclude ad

        for column, suffix in enumerate(OVERALL_CELL_SUFFIXES):
            cell = driver.find_element(By.XPATH, row_xpath + suffix)
            sheet.write(next_row, column, cell.text)
        # Log before incrementing so the number is the row just completed.
        print(str(next_row) + ' finished!')
        next_row += 1
    return next_row


def scrape_overall_ranking(url=OVERALL_URL, output_path=OVERALL_OUTPUT,
                           page_count=OVERALL_PAGE_COUNT):
    """Scrape the overall ranking table into an .xlsx file.

    The workbook is closed (and therefore written to disk) and the browser
    session is ended even when scraping fails part-way, so rows collected
    so far are not lost.

    Args:
        url: ranking page to open.
        output_path: path of the spreadsheet to create.
        page_count: number of result pages to read.
    """
    driver = Edge()
    try:
        with xlsxwriter.Workbook(output_path) as workbook:
            sheet = workbook.add_worksheet()
            for column, header in enumerate(OVERALL_HEADERS):
                sheet.write(0, column, header)

            driver.get(url)
            # NOTE: per the notes accompanying this script, the site may
            # redirect to the qsChina site at this point; the workaround used
            # there is to pause here and re-enter the URL by hand.
            time.sleep(1)

            next_row = 1
            for page in range(page_count):
                next_row = write_overall_page(driver, sheet, next_row)

                # change to next page
                if page >= page_count - 1:
                    break
                next_link = find_next_page_link(driver)
                if next_link is None:
                    print('no next page link found after page ' + str(page + 1))
                    break
                # Clicked through JavaScript rather than WebElement.click().
                driver.execute_script('arguments[0].click();', next_link)
                time.sleep(1)
                print('chaneg to page ' + str(page + 2))
    finally:
        # quit() ends the whole WebDriver session, not just the window.
        driver.quit()


if __name__ == '__main__':
    scrape_overall_ranking()
上面的代码用于获取综合排名。当时主要遇到了这样一些问题:
- selenium打开网页自动跳转到qsChina
- 网页中的几个条目之间有广告
- 翻页的按键位置会变化
问题解决:
- 在selenium打开网页后打断点,就是在driver.get(curl)之后暂停,然后手动在网页中输入国际网站的网址。
- 广告的条目有属性customblock="true",用get_attribute的方法获取这个属性进行判断,正常的条目没有这个属性,此时get_attribute返回None
- 翻页的按键也有一个属性class="page-link next",根据这一点也可以找到翻页的位置,然后用js脚本模拟点击
学科排名
总共有56个学科排名,5个是"BROAD SUBJECT AREA",还有51个是"SPECIFIC SUBJECT",下面是代码的实现。每次启动的时候都需要在浏览器打开网页之后手动重新输入网页地址,不然会跳转到qsChina的网页
打开网页后先要选Subject,也是利用selenium执行js脚本的方式模拟浏览器点击。先打开下拉框,再选择相应的Subject,然后根据选择的Subject创建Excel表格。接着在网页下方可以获取这个Subject的条目数量。然后切换到具体的指标页面,改变每页显示的条目数量(尽可能大,可以少翻页)。再之后就可以跟综合排名一样获取具体的数据了。每遍历一页之后要翻页。
#encoding=utf-8
from selenium.webdriver import Edge
from selenium.webdriver.common.by import By
import time
import xlsxwriter
import math
# Any subject page works as the entry point; subjects are switched in-page.
SUBJECT_URL = 'https://www.topuniversities.com/university-rankings/university-subject-rankings/2021/arts-humanities'

# Entries of the subject drop-down to visit. Entries 1 and 7 are skipped:
# they are the "BROAD SUBJECT AREA" and "SPECIFIC SUBJECT" separators.
SUBJECT_OPTION_INDEXES = tuple(i for i in range(1, 59) if i not in (1, 7))

# Page size selected in the "results per page" drop-down.
SUBJECT_ITEMS_PER_PAGE = 100

# Column headers, in the same order as SUBJECT_CELL_SUFFIXES.
SUBJECT_HEADERS = (
    'Rank',
    'University',
    'Location',
    'Overall Score',
    'H-index Citations',
    'Citations per Paper',
    'Academic Reputation',
    'Employer Reputation',
)

SUBJECT_DROPDOWN_XPATH = '//*[@id="ranking-fillters"]/div[7]/div/div'
SUBJECT_OPTION_XPATH = '//*[@id="ranking-fillters"]/div[7]/div/div/div[2]/div[{}]'
TOTAL_COUNT_XPATH = '//*[@id="_totalcountresults"]'
INDICATORS_TAB_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[1]/div/div/ul/li[2]/a'
PER_PAGE_DROPDOWN_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[3]/div[4]/div[1]/div[2]/i'
PER_PAGE_OPTION_XPATH = '//*[@id="block-tu-d8-content"]/div/article/div/div[3]/div/div[1]/div/div[3]/div[4]/div[1]/div[2]/div[2]/div[4]'

# XPath of the n-th (1-based) entry of the result list. Entries are either
# university rows or ad blocks (ads carry a "customblock" attribute).
SUBJECT_ROW_XPATH = '//*[@id="ranking-data-load_ind"]/div[{}]'

# XPath suffixes, appended to a row XPath, that locate each cell of the row:
# rank, university, location, then the five indicator cells.
SUBJECT_CELL_SUFFIXES = (
    '/div/div/div/div[1]/div/div/div/div/div[1]/div',
    '/div/div/div/div[1]/div/div/div/div/div[2]/div/div[1]/div',
    '/div/div/div/div[1]/div/div/div/div/div[2]/div/div[2]',
) + tuple(
    '/div/div/div/div[2]/div/div/div/div[{}]'.format(n) for n in range(1, 6)
)

# The "next" button moves around inside the pagination list, so it is matched
# by its class instead of a fixed index: the first <a> of any <li> from the
# third one onwards whose class is exactly "page-link next".
NEXT_PAGE_XPATH = (
    '//*[@id="alt-style-pagination"]'
    '/li[position() >= 3]/a[1][@class="page-link next"]'
)

# Characters replaced in subject names before they are used as file names.
_FILENAME_FORBIDDEN = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})


def parse_item_count(text):
    """Convert the displayed result count to an int, ignoring "," separators.

    Raises:
        ValueError: if ``text`` holds no valid integer (e.g. it is empty).
    """
    return int(text.replace(',', ''))


def pages_needed(item_total, per_page=SUBJECT_ITEMS_PER_PAGE):
    """Return how many pages of ``per_page`` items hold ``item_total`` items."""
    return int(math.ceil(item_total / per_page))


def workbook_filename(subject_name, fallback):
    """Build the .xlsx file name for a subject.

    Names without path-hostile characters are used unchanged; characters
    from ``\\ / : * ? " < > |`` become ``_``. ``fallback`` is used as the
    stem when the subject name is empty.
    """
    stem = subject_name.translate(_FILENAME_FORBIDDEN).strip()
    return (stem or fallback) + '.xlsx'


def find_next_page_link(driver):
    """Return the pagination "next" link, or None when the page has none.

    Uses ``find_elements`` so that a missing link yields an empty list
    instead of an exception (and instead of an endless index scan).
    """
    links = driver.find_elements(By.XPATH, NEXT_PAGE_XPATH)
    return links[0] if links else None


def _click_xpath(driver, xpath):
    """Locate ``xpath`` and click it through JavaScript; return the element."""
    element = driver.find_element(By.XPATH, xpath)
    driver.execute_script('arguments[0].click();', element)
    return element


def write_subject_page(driver, sheet, next_row, item_total):
    """Copy every university row of the current page into ``sheet``.

    Entries are probed one by one until the list runs out. Ad blocks are
    skipped.

    Args:
        driver: WebDriver showing a subject ranking page.
        sheet: xlsxwriter worksheet to write into.
        next_row: sheet row index for the first university written.
        item_total: total item count, used only in the progress message.

    Returns:
        The sheet row index following the last row written.

    Raises:
        NoSuchElementException: if a university row lacks an expected cell.
    """
    # Imported here because the file's import header only pulls in Edge/By.
    from selenium.common.exceptions import NoSuchElementException

    entry_index = 1
    while True:
        row_xpath = SUBJECT_ROW_XPATH.format(entry_index)
        try:
            entry = driver.find_element(By.XPATH, row_xpath)
        except NoSuchElementException:
            break  # past the last entry of this page
        entry_index += 1

        is_ad = entry.get_attribute('customblock')  # exclude ad
        # NOTE(review): per-entry pause kept from the original script;
        # its purpose is not documented - confirm before removing.
        time.sleep(0.5)
        if is_ad:
            continue

        for column, suffix in enumerate(SUBJECT_CELL_SUFFIXES):
            cell = driver.find_element(By.XPATH, row_xpath + suffix)
            sheet.write(next_row, column, cell.text)
        print(str(next_row) + '/' + str(item_total) + ' finished!')
        next_row += 1
    return next_row


def scrape_subject(driver, option_index):
    """Select one subject in the drop-down and save its ranking to a workbook.

    The workbook is named after the subject and is closed (written to disk)
    even if scraping fails part-way.

    Args:
        driver: WebDriver already showing a subject ranking page.
        option_index: 1-based index of the subject in the drop-down.
    """
    # change subject: open the drop-down, then pick the entry
    _click_xpath(driver, SUBJECT_DROPDOWN_XPATH)
    time.sleep(1)
    option = driver.find_element(By.XPATH, SUBJECT_OPTION_XPATH.format(option_index))
    subject_name = option.text  # read before clicking, as the original did
    driver.execute_script('arguments[0].click();', option)
    print('Select Subject: ' + subject_name)
    time.sleep(1)

    filename = workbook_filename(subject_name, 'subject-' + str(option_index))
    with xlsxwriter.Workbook(filename) as workbook:
        sheet = workbook.add_worksheet()

        # get item total number
        count_text = driver.find_element(By.XPATH, TOTAL_COUNT_XPATH).text
        item_total = parse_item_count(count_text)
        print('Total Item count in ' + subject_name + ': ' + count_text)

        # change tab to rank index
        _click_xpath(driver, INDICATORS_TAB_XPATH)
        time.sleep(1)
        print('Change tab to Ranking Indicators')

        # change items number in every page
        _click_xpath(driver, PER_PAGE_DROPDOWN_XPATH)
        time.sleep(1)
        _click_xpath(driver, PER_PAGE_OPTION_XPATH)
        time.sleep(1)
        print('Now there are 100 items in every page')

        # initial the table head
        for column, header in enumerate(SUBJECT_HEADERS):
            sheet.write(0, column, header)

        page_total = pages_needed(item_total)
        next_row = 1
        for page in range(page_total):
            next_row = write_subject_page(driver, sheet, next_row, item_total)

            # next page
            if page >= page_total - 1:
                break
            next_link = find_next_page_link(driver)
            if next_link is None:
                print('no next page link found after page ' + str(page + 1))
                break
            driver.execute_script('arguments[0].click();', next_link)
            time.sleep(1)
            print('chaneg to page ' + str(page + 2))
        print('finish ' + subject_name)


def scrape_subject_rankings(url=SUBJECT_URL, option_indexes=SUBJECT_OPTION_INDEXES):
    """Scrape every listed subject ranking, one workbook per subject.

    Args:
        url: subject ranking page used as the entry point.
        option_indexes: 1-based drop-down entries to visit, in order.
    """
    driver = Edge()
    try:
        driver.get(url)
        # NOTE: per the notes accompanying this script, the site may redirect
        # to the qsChina site at this point; the workaround used there is to
        # re-enter the URL by hand in the opened browser.
        time.sleep(1)
        for option_index in option_indexes:
            scrape_subject(driver, option_index)
    finally:
        # quit() ends the whole WebDriver session, not just the window.
        driver.quit()


if __name__ == '__main__':
    scrape_subject_rankings()