Puppeteer 是谷歌推出的无头浏览器框架,提供了完备的 API 用于控制浏览器。
下面以爬取某资源网站为例,简单演示 Puppeteer 的用法。

puppeteer 小项目初体验_puppeteer


puppeteer 小项目初体验_html_02


抓取页面的代码需要实现三个步骤:先打开网页,然后点击获取地址的按钮,最后获取链接地址和提取密码并返回。要打开的网页地址可以通过暴力枚举文章编号得到。

const puppeteer = require('puppeteer-core')
const cheerio = require('cheerio')
/**
 * Returns a promise that resolves (with no value) after a delay.
 *
 * @param {number} [ms=2000] - Delay in milliseconds. The default keeps the
 *   original fixed two-second pause for callers that pass no argument.
 * @returns {Promise<void>} Resolves once the timer fires; never rejects.
 */
const sleep = (ms = 2000) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Opens `url` in a headless browser, clicks the "get address" button and
 * reads the download link and extraction password from the modal markup.
 *
 * Error contract (unchanged from the previous implementation):
 *  - a failure to launch the browser or to open a new page rejects;
 *  - any failure while driving the page is logged and yields `undefined`.
 * The browser is always closed before the function settles.
 *
 * @param {string} url - Address of the article page to crawl.
 * @returns {Promise<{link: string, pass: string, title: string} | undefined>}
 *   The scraped values, or `undefined` when page interaction failed.
 */
async function crawlXXX(url) {
  const BUTTON_SELECTOR = '.data_details > div > .btn-primary';
  const LINK_SELECTOR = '.bs-example-modal-sm > .modal-sm > .modal-content > a > font';
  const PASS_SELECTOR = '.bs-example-modal-sm > .modal-sm > .modal-content > font';

  const browser = await puppeteer.launch({
    headless: true,
    executablePath: '你的chrome.exe地址',
  });

  try {
    // Inside the outer try so the browser is closed even if this rejects.
    const page = await browser.newPage();

    try {
      page.setDefaultTimeout(2 * 60 * 1000);
      // These setters return promises; await them so the settings are applied
      // before navigation and a rejection is not left unhandled.
      await page.setUserAgent('Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.130 Safari/537.36');
      await page.setJavaScriptEnabled(true);
      await page.setCacheEnabled(true);

      await page.goto(url);
      await page.waitForSelector('html');

      // Locate the button that reveals the link/password modal.
      const button = await page.$(BUTTON_SELECTOR);
      if (button === null) {
        throw new Error(`Button not found on ${url}: ${BUTTON_SELECTOR}`);
      }
      // Wait for the click to be dispatched before reading the modal.
      await button.click();

      const title = await page.title();

      // Returns immediately if the node is already in the DOM; otherwise
      // waits (up to the default timeout) for the click handler to add it.
      await page.waitForSelector(LINK_SELECTOR);
      const link = await page.$eval(LINK_SELECTOR, (el) => el.innerHTML);
      const pass = await page.$eval(PASS_SELECTOR, (el) => el.innerHTML);

      return { link, pass, title };
    } catch (err) {
      // Best-effort crawl: report the failure and signal it with `undefined`.
      console.error(err);
      return undefined;
    }
  } finally {
    try {
      await browser.close();
    } catch (closeErr) {
      // A failed shutdown must not mask the crawl result.
      console.error(closeErr);
    }
  }
}

/**
 * HTTP controller exposing the site crawlers.
 * NOTE(review): `ctx` is used like a Koa context (`ctx.request.body`,
 * `ctx.response.status/body`) — confirm against the router that mounts this.
 */
class crawlSiteController {
  /**
   * Crawls one article page and writes the result to the response.
   *
   * Expects `ctx.request.body` to carry `type` (only `1` is supported) and
   * `num` (the article number appended to the base URL).
   *
   * Responses:
   *  - 200 with `{link, pass, title}` on success;
   *  - 400 when `type` is unsupported or `num` is not a non-negative integer;
   *  - 502 when the crawler reported a failure by returning `undefined`.
   *
   * @param {object} ctx - Request/response context.
   * @returns {Promise<void>}
   */
  static async getSite(ctx) {
    const { type, num } = ctx.request.body || {};

    if (type !== 1) {
      ctx.response.status = 400;
      ctx.response.body = { error: 'unsupported type' };
      return;
    }

    // `num` is client-supplied and becomes part of the URL; accept digits
    // only so it cannot inject path segments, queries or fragments.
    if (!/^\d+$/.test(String(num))) {
      ctx.response.status = 400;
      ctx.response.body = { error: 'num must be a non-negative integer' };
      return;
    }

    const url = 'http://xxxx.xxxxxx.xxxx/article/';
    const res = await crawlXXX(url + num);

    if (res === undefined) {
      // The crawler signals a failure with `undefined`; do not report success.
      ctx.response.status = 502;
      ctx.response.body = { error: 'crawl failed' };
      return;
    }

    ctx.response.status = 200;
    ctx.response.body = res;
  }
}

module.exports = crawlSiteController;