跟着慕课网的教程,用 Node.js 写了一个爬取慕课网课程章节信息的小爬虫
废话不多说,直接上代码,太晚了。安装的教程以及这篇的注释明天补上
爬取这个页面:https://www.imooc.com/learn/348
创建一个文件:crawler.js,内容如下:
var http = require("https")
var cheerio = require("cheerio")
var url = "https://www.imooc.com/learn/348"
/**
 * Extracts the chapter/video outline from a course page.
 *
 * Every element matching ".chapter" becomes one entry. Its title is the text
 * of the "h3" elements inside it, and its videos are the "li" children of the
 * ".video" elements inside it.
 *
 * @param {string} html - Markup of the course page, passed to cheerio.load.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *   One object per chapter, in document order. A video's id is whatever
 *   follows "video/" in its link's href (undefined when the href does not
 *   contain "video/").
 */
function filterChapters(html) {
  var $ = cheerio.load(html);
  var courseData = [];

  $(".chapter").each(function () {
    var chapter = $(this);
    var chapterData = {
      chapterTitle: chapter.find("h3").text(),
      videos: []
    };

    chapter.find(".video").children("li").each(function () {
      var video = $(this).find(".J-media-item");
      var href = video.attr("href");

      // An <li> without a media link has no href; calling split() on the
      // missing value would throw, so such entries are skipped.
      if (typeof href !== "string") {
        return;
      }

      chapterData.videos.push({
        title: video.text(),
        id: href.split("video/")[1]
      });
    });

    courseData.push(chapterData);
  });

  return courseData;
}
/**
 * Writes the course outline to stdout: each chapter title on its own line,
 * followed by one "【id】title" line per video in that chapter.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{title: string, id: string}>}>} courseData
 *   Chapters to print, in the order they should appear.
 */
function printCourseInfo(courseData) {
  // Prints a single video entry as "【id】title".
  function printVideo(entry) {
    console.log("【" + entry.id + "】" + entry.title);
  }

  courseData.forEach(function (chapter) {
    console.log(chapter.chapterTitle);
    chapter.videos.forEach(printVideo);
  });
}
// Download the course page, then parse and print its chapter/video outline.
http.get(url, function (res) {
  // A non-200 response is treated as a failure. The body is drained with
  // resume() so the response is consumed even though it is not parsed.
  if (res.statusCode !== 200) {
    res.resume();
    console.log("获取失败!", "HTTP " + res.statusCode);
    return;
  }

  // Decode at the stream level: appending raw Buffer chunks to a string
  // decodes each chunk on its own, which corrupts any multi-byte UTF-8
  // character that happens to be split across two chunks.
  res.setEncoding("utf8");

  var html = "";
  res.on("data", function (chunk) {
    html += chunk;
  });
  res.on("end", function () {
    printCourseInfo(filterChapters(html));
  });
}).on("error", function (err) {
  // Include the underlying reason instead of discarding the error object.
  console.log("获取失败!", err.message);
});
运行结果:
拜了个拜,晚安!