这里使用 Node.js 下的 puppeteer-har 库(内部基于 chrome-har)来导出浏览器的 HAR 数据,经验证效果不错,比较靠谱。

创建日志配置(ultra-harlog/module/log.js)

//cnpm install --save log4js
const log4js = require('log4js');

// log4js configuration: one console appender plus three daily-rolling file
// appenders, and one category per logger name handed out by this module.
//
// log4js resolves getLogger(name) against `categories`; a name without its
// own entry falls back to `default`. With only a `default` category every
// named logger wrote to all appenders, so each file received every message.
// The explicit categories below route each logger to its own file while still
// echoing to the console.
const options = {
	appenders: {
		console: {
			type: 'console'
		},
		'puppeteer-record': {
			type: 'dateFile',
			filename: 'logs/puppeteer/log',
			pattern: '-yyyy-MM-dd.log',
			alwaysIncludePattern: true,
			encoding: 'utf-8'
		},
		'puppeteer-har-record': {
			type: 'dateFile',
			filename: 'logs/puppeteerhar/log',
			pattern: '-yyyy-MM-dd.log',
			alwaysIncludePattern: true,
			encoding: 'utf-8'
		},
		'puppeteer-harevent-record': {
			type: 'dateFile',
			filename: 'logs/puppeteerharevent/log',
			pattern: '-yyyy-MM-dd.log',
			alwaysIncludePattern: true,
			encoding: 'utf-8'
		}
	},
	categories: {
		// Fallback for any logger name not listed below: unchanged fan-out to every appender.
		default: {
			appenders: ['console', 'puppeteer-record', 'puppeteer-har-record', 'puppeteer-harevent-record'],
			level: 'all'
		},
		console: {
			appenders: ['console'],
			level: 'all'
		},
		'puppeteer-record': {
			appenders: ['console', 'puppeteer-record'],
			level: 'all'
		},
		'puppeteer-har-record': {
			appenders: ['console', 'puppeteer-har-record'],
			level: 'all'
		},
		'puppeteer-harevent-record': {
			appenders: ['console', 'puppeteer-harevent-record'],
			level: 'all'
		}
	}
};
log4js.configure(options);

/**
 * Returns the log4js logger obtained for the name 'console'.
 */
function getConsoleLogger(){
	return log4js.getLogger('console');
}

/**
 * Returns the log4js logger obtained for the name 'puppeteer-record'.
 */
function getPuppeteerRecordLogger(){
	return log4js.getLogger('puppeteer-record');
}

/**
 * Returns the log4js logger obtained for the name 'puppeteer-har-record'.
 */
function getPuppeteerHarRecordLogger(){
	return log4js.getLogger('puppeteer-har-record');
}

/**
 * Returns the log4js logger obtained for the name 'puppeteer-harevent-record'.
 */
function getPuppeteerHarEventRecordLogger(){
	return log4js.getLogger('puppeteer-harevent-record');
}

// Public API of this module: one getter per logger.
Object.assign(exports, {
	getConsoleLogger,
	getPuppeteerRecordLogger,
	getPuppeteerHarRecordLogger,
	getPuppeteerHarEventRecordLogger
});

创建抓取的代码(ultra-harlog/module/puppeteerhar.js)

const puppeteer = require('puppeteer');
const PuppeteerHar = require('puppeteer-har');
const path = require("path");

const logger=require("./log");
const grpcclient=require("./grpcclient");

// Module-level logger used by the functions below; obtained from ./log via
// getPuppeteerHarRecordLogger().
const log = logger.getPuppeteerHarRecordLogger() ;


/**
 * Launches a browser through puppeteer and resolves with the browser instance.
 *
 * @returns {Promise<object>} the value produced by puppeteer.launch()
 */
async function launchBrowser(){
  // Chromium command-line switches, see
  // https://peter.sh/experiments/chromium-command-line-switches/
  const chromiumArgs = [
    '--disk-cache-size=0',
    '--disable-cache',
    '--disable-infobars',
    '--window-size=800,600',
    '--ignore-certificate-errors',
    // NOTE(review): looks like a misspelling of '--enable-features'; kept
    // unchanged here — confirm the intended switch (and its value).
    '--enable-feaures'
  ];

  const launchOptions = {
    // For a manually downloaded Chromium, point executablePath at its binary;
    // otherwise puppeteer uses <project>/node_modules/puppeteer/.local-chromium/
    //executablePath: '/Users/huqiyang/Documents/project/z/chromium/Chromium.app/Contents/MacOS/Chromium',

    // Ignore HTTPS errors when visiting https pages.
    ignoreHTTPSErrors: true,
    // Headless mode: no browser window is opened.
    headless: true,
    args: chromiumArgs,
    // Whether to auto-open a DevTools panel per tab; true would force headless to false.
    devtools: false,
    // Launch timeout in ms; defaults to 30000, 0 disables the timeout.
    timeout: 0
    // Slow down puppeteer operations to ease debugging.
    //slowMo: 250
  };

  return puppeteer.launch(launchOptions);
}

/**
 * Opens `url` in a freshly launched browser and records the page's network
 * traffic into a HAR file at path.join(dirPath, filename).
 *
 * Errors raised while preparing the page, starting the recording or
 * navigating are logged and swallowed, so one bad URL does not abort a batch.
 * A failure to launch the browser, to stop the recording or to close the
 * browser still rejects the returned promise. The browser is closed in every
 * case once it has been launched.
 *
 * @param {string} url - Page address; "http://" is prepended when it does not
 *   already start with "http://" or "https://".
 * @param {string} dirPath - Directory the HAR file is written to.
 * @param {string} filename - HAR file name inside `dirPath`.
 * @returns {Promise<void>}
 */
async function saveHarlog(url,dirPath,filename){
  // Target path of the HAR file.
  const harFilePath = path.join(dirPath,filename) ;

  // Normalise the address without touching the caller-supplied `url`.
  const hasScheme = url.startsWith('http://') || url.startsWith('https://') ;
  const targetUrl = hasScheme ? url : "http://" + url ;

  // A launch failure propagates to the caller: there is nothing to clean up yet.
  const browser = await launchBrowser() ;

  let har = null ;
  // Set only after har.start() succeeded, so stop() is never called on a
  // recorder that was not started.
  let recording = false ;

  try{
    // Reuse the tab that exists right after launch instead of opening a new one.
    const page = (await browser.pages())[0];

    // Wait one second before recording. A plain timer is used instead of
    // page.waitFor(), which is deprecated/removed in later Puppeteer releases.
    await new Promise(resolve => setTimeout(resolve, 1000));

    har = new PuppeteerHar(page);
    await har.start({ path:harFilePath});
    recording = true ;

    // Equivalent to typing the address into the browser; timeout 0 disables
    // the navigation timeout.
    await page.goto(targetUrl,{
      timeout:0
    });

    // title() returns a Promise and must be awaited; url() is synchronous.
    log.info(await page.mainFrame().title());
    log.info(page.mainFrame().url());

    // To notify the Java side that the HAR file is ready, call
    // grpcclient.resovleHarLog({url: url, file_name: filename, file_dir: dirPath, context: ''})
    // here, wrapped in its own try/catch.
  }catch(error){
    log.error('resovle error :' + targetUrl + ";  error message:" + error) ;
  }finally{
    try{
      if(recording){
        await har.stop();
      }
    }finally{
      // Close the browser even when stopping the recording throws.
      await browser.close();
    }
  }
}
// Public API of this module.
Object.assign(exports, {
  launchBrowser,
  saveHarlog
});

创建启动文件(ultra-harlog/puppeteerhar-app.js)

const fs = require("fs");
const path = require("path");
const moment = require("moment");
const schedule = require('node-schedule');

const cvsresovler=require("./module/cvsresovle");
const mhar=require("./module/puppeteerhar");

/*
cnpm install --save moment
cnpm install --save csv
cnpm install --save node-schedule
cnpm install --save puppeteer
cnpm install --save puppeteer-har
cnpm install --save iconv-lite
cnpm install --save chrome-har

cnpm install --save grpc

*/  
/**
 * Registers the scheduled crawl job.
 *
 * Each run creates harlogs/<YYYYMMDDHHmm> next to this file, reads the URL
 * records from top300.csv and records one HAR file per URL, strictly one
 * after another.
 */
function init(){
  console.log('初始化调度器') ;
  // Six-field cron expression (second minute hour day-of-month month
  // day-of-week): fires once a day at 10:14:00.
  schedule.scheduleJob('0 14 10 * * *',()=>{
    const ftime = moment().format('YYYYMMDDHHmm');
    console.log('当前调度时间为:' + ftime) ;
    const dirPath = path.join(__dirname,'harlogs',ftime) ;
    console.log("创建目录:" + dirPath) ;

    const isExist = fs.existsSync(dirPath) && fs.lstatSync(dirPath).isDirectory() ;
    if(!isExist){
      console.log("创建文件夹" + ftime) ;
      // recursive: also creates the parent 'harlogs' directory on the first run.
      fs.mkdirSync(dirPath, { recursive: true });
    }

    // Load the URL records to process.
    const dataArr = cvsresovler.readUrlRecord(path.join(__dirname,'top300.csv')) ;
    console.log("解析出URL共计" + dataArr.length + "条") ;

    // Process the URLs sequentially: each saveHarlog call is awaited before
    // the next one starts, so only one browser runs at a time.
    (async () => {
      for(let i = 0; i < dataArr.length; i++){
        const data = dataArr[i] ;
        const rawUrl = data ? data['SITE_LINK'] : undefined ;
        const url = typeof rawUrl === 'string' ? rawUrl.trim() : '' ;
        if(!url){
          // Skip rows without a usable SITE_LINK value.
          continue ;
        }

        // Slashes and backslashes cannot appear in a file name.
        const filename = url.replace(/\//g,'-').replace(/\\/g,'-') + '.har' ;
        console.log((i+1) + "-starting to resovle url :" + url ) ;
        try{
          await mhar.saveHarlog(url,dirPath,"N" + "-" + filename) ;
        }catch(error){
          console.log(error) ;
        }
      }
    })().catch(error => console.log(error)) ;
  });
  console.log('应用程序启动完成') ;
}
//执行
//init();


/**
 * Manual test entry point: records a HAR file for a single URL.
 *
 * The URL is taken from the first command-line argument and defaults to
 * "www.baidu.com". The file is written to harlogs/<YYYYMMDDHHmm> next to
 * this script, with an "NT-" name prefix.
 *
 * @returns {Promise<void>}
 */
async function test(){
  const ftime = moment().format('YYYYMMDDHHmm');
  console.log('当前执行时间为:' + ftime) ;
  const dirPath = path.join(__dirname,'harlogs',ftime) ;
  console.log("创建目录:" + dirPath) ;

  const isExist = fs.existsSync(dirPath) && fs.lstatSync(dirPath).isDirectory() ;
  if(!isExist){
    console.log("创建文件夹" + ftime) ;
    // recursive: also creates the parent 'harlogs' directory on the first run.
    fs.mkdirSync(dirPath, { recursive: true });
  }

  // Default test URL, overridable from the command line.
  let url = "www.baidu.com" ;

  // slice() leaves process.argv untouched; the local is not named `arguments`
  // because that identifier is reserved inside functions in strict mode.
  const cliArgs = process.argv.slice(2);
  if(cliArgs.length > 0 ){
    url = cliArgs[0] ;
  }

  url = url.trim() ;
  if(url){
    // Slashes and backslashes cannot appear in a file name.
    const filename = url.replace(/\//g,'-').replace(/\\/g,'-') + '.har' ;
    console.log("starting to resovle test url :" + url ) ;
    try{
      await mhar.saveHarlog(url,dirPath,"NT" + "-" + filename) ;
    }catch(error){
      console.log(error) ;
    }
  }
}
// Run the test; report setup failures instead of leaving the promise unhandled.
test().catch(error => console.log(error)) ;

关于GRPC部分的代码,请参考我另外一篇博文

参考地址:https://michaljanaszek.com/blog/generate-har-with-puppeteer