Node.js实现网络爬虫实验
实验要求
爬取网页
——以爬取网易新闻为例(未成功)
遇到的各种问题
爬取网站时,发现不同网站的编码格式不同,需要根据编码格式修改。在新闻网页中,按shift + ctrl + I 打开开发者工具,在console中输入document.charset,得到编码格式。将老师给的原代码中 var myEncoding = "utf-8"; 部分改为 var myEncoding = "GBK";
接下来,对比老师给代码,对比中国新闻网和网易新闻的源代码修改爬虫程序。
修改变量部分花了蛮多时间,一开始还很没有头绪。
通过比较两个网页源代码的不同
发现和这两类较为容易修改,按ctrl+F搜索关键字,比如搜索title,得到如下结果
其他格式不变,只要修改og:title部分即可,有的时候名称不同,比如网易新闻的时间名称是published_time,而中国新闻网不是。
content正文内容较为难找,因此打开新闻原页面,右键打开检查,鼠标移到相应代码位置,直到蓝色方框框住正文。通过这个部分的代码修改爬虫变量。
然后就是我花了最长时间的正则表达式,中国新闻网的非常容易看懂,\d{n}即代表n位数字,https://www.163.com/dy/article/G8O53AEJ051481US.html
网易的部分需转换“G8O53AEJ051481US.html”部分,该部分是16位字母加数字,一开始我试了^\w{16}$,代码运行失败,在该处设断点,发现就是正则表达式错误。后来发现这部分不含下划线,应该用[A-Za-z0-9]来表示,随后我对照老师给的爬虫代码,发现 ^ 和 $ 不需要,以及{16}应该用不限制位数的+代替。再次运行,终于修改成功了!!!😢
继续运行,又出现新的问题(至今未解决😫)
爬虫代码分析
1.引入需要的fs包,request包,cheerio包,iconv-lite包, date-utils包
var fs = require('fs');
var myRequest = require('request');
var myCheerio = require('cheerio');
var myIconv = require('iconv-lite');
require('date-utils');
var mysql = require('./mysql.js');
2.待爬取的网站和名称以及编码方式
// Target site: display name, root domain, page charset, and seed page.
var source_name = "网易新闻";
var domain = 'https://news.163.com/';
// Charset found via document.charset in the browser devtools console.
// NOTE(review): assumes news.163.com serves GBK — confirm; a wrong charset
// makes every decoded page come out as mojibake.
var myEncoding = "GBK";
var seedURL = 'https://news.163.com/';
3.需要爬取的关键词信息,标题,日期,作者,正文内容,来源等
// Cheerio expressions, stored as strings and executed later with eval(),
// that extract each field from a 163.com news page.
var seedURL_format = "$('a')"; // every anchor on the seed page
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = " $('meta[property=\"og:title\"]').eq(0).attr(\"content\")";
var date_format = " $('meta[property=\"article:published_time\"]').eq(0).attr(\"content\")";
var author_format = "$('meta[name=\"author\"]').eq(0).attr(\"content\")";
var content_format = "$('.post_body').text()"; // article body container
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#utm_source').text()";
// Article URLs look like https://www.163.com/dy/article/G8O53AEJ051481US.html
// Bug fix: the '.' before "html" was unescaped, so it matched ANY character
// (e.g. ".../ABCXhtml" would pass). Escape it to match a literal dot.
var url_reg = /\/([A-Za-z0-9]+)\.html/;
4.表示日期
// Date matcher: (a) yy/yyyy + month + day with the SAME separator (-, / or .)
// enforced by the \3 backreference; (b) Chinese "yyyy年m月d日"; (c) strict
// zero-padded yyyy-mm-dd.
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)|(\d{4}\-\d{2}\-\d{2})/
5.防止网站屏蔽爬虫伪装成浏览器
// Spoof a desktop Chrome User-Agent so the site does not block the crawler.
var headers = {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
}
6.request模块异步fetch url
// Asynchronously fetch `url` with browser-like headers and a 10-second
// timeout. encoding:null makes the request library hand the body back as a
// raw Buffer so iconv-lite can decode it with the site's own charset later.
function request(url, callback) {
    myRequest({
        url: url,
        encoding: null, // keep raw Buffer; decoded by iconv elsewhere
        //proxy: 'http://x.x.x.x:8080',
        headers: headers,
        timeout: 10000 // give up after 10s
    }, callback);
}
7.读取种子页面,解析链接
// Start crawling from the seed page.
seedget();

// Fetch the seed page, collect every <a> link, normalize each href to an
// absolute URL, and pass each URL matching url_reg to newsGet() — unless the
// URL is already stored in the fetches table.
function seedget() {
    request(seedURL, function(err, res, body) { // fetch the seed page
        // try {
        // decode the raw Buffer with the site's declared encoding
        var html = myIconv.decode(body, myEncoding);
        //console.log(html);
        // parse the HTML with cheerio
        var $ = myCheerio.load(html, { decodeEntities: true });
        // } catch (e) { console.log('读种子页面并转码出错:' + e) };
        var seedurl_news;
        try {
            // seedURL_format is an eval'd cheerio expression, e.g. "$('a')"
            seedurl_news = eval(seedURL_format);
        } catch (e) { console.log('url列表所处的html块识别出错:' + e) };
        seedurl_news.each(function(i, e) { // walk every <a> on the seed page
            var myURL = "";
            try {
                // resolve the concrete news URL from the href
                var href = "";
                href = $(e).attr("href");
                if (href == undefined) return;
                var lower = href.toLowerCase();
                // Bug fix: the old check only looked for 'http://', so absolute
                // https:// links fell through to the else branch and were
                // wrongly prefixed with the seed URL. Accept both schemes.
                if (lower.indexOf('http://') >= 0 || lower.indexOf('https://') >= 0) myURL = href;
                // Bug fix: the trailing "开头的" note lacked a // marker and was
                // a syntax error; it is now a proper comment.
                else if (href.startsWith('//')) myURL = 'http:' + href; // protocol-relative (//) links
                else myURL = seedURL.substr(0, seedURL.lastIndexOf('/') + 1) + href; // relative links
            } catch (e) { console.log('识别种子页面中的新闻链接出错:' + e) }
            if (!url_reg.test(myURL)) return; // keep only URLs shaped like news articles
            //console.log(myURL);
            // skip URLs already recorded in the database
            var fetch_url_Sql = 'select url from fetches where url=?';
            var fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function(qerr, vals, fields) {
                if (vals.length > 0) {
                    console.log('URL duplicate!')
                } else newsGet(myURL); // crawl the news page
            });
        });
    });
};
8.读取新闻页面,并将读取出来的数据储存到fetch中
// Fetch one news page, evaluate the *_format expressions against it to
// extract title/keywords/author/content/source/description, and insert the
// record into the fetches table (fetches.url is unique, so duplicate URLs
// are rejected by the database).
function newsGet(myURL) { // read a single news page
    request(myURL, function(err, res, body) {
        //try {
        var html_news = myIconv.decode(body, myEncoding); // decode with iconv
        //console.log(html_news);
        // parse so the eval'd $-expressions below can run
        var $ = myCheerio.load(html_news, { decodeEntities: true });
        // Bug fix: `myhtml` was assigned without var, creating an implicit
        // global; it is only consumed by the commented-out fetch.html below.
        var myhtml = html_news;
        //} catch (e) { console.log('读新闻页面并转码出错:' + e);};
        console.log("转码读取成功:" + myURL);
        // dynamically evaluate the format strings and build the record
        var fetch = {};
        fetch.title = "";
        fetch.content = "";
        fetch.publish_date = (new Date()).toFormat("YYYY-MM-DD"); // default: today
        //fetch.html = myhtml;
        fetch.url = myURL;
        fetch.source_name = source_name;
        fetch.source_encoding = myEncoding; // page encoding
        fetch.crawltime = new Date();
        if (keywords_format == "") fetch.keywords = source_name; // fall back to the source name
        else fetch.keywords = eval(keywords_format);
        if (title_format == "") fetch.title = ""
        else fetch.title = eval(title_format); // title
        // if (date_format != "") fetch.publish_date = eval(date_format); // publish date
        // console.log('date: ' + fetch.publish_date);
        /*fetch.publish_date = regExp.exec(fetch.publish_date)[0];
        fetch.publish_date = fetch.publish_date.replace('年', '-')
        fetch.publish_date = fetch.publish_date.replace('月', '-')
        fetch.publish_date = fetch.publish_date.replace('日', '')
        fetch.publish_date = new Date(fetch.publish_date).toFormat("YYYY-MM-DD");*/
        if (author_format == "") fetch.author = source_name; // default author
        else fetch.author = eval(author_format);
        if (content_format == "") fetch.content = "";
        else fetch.content = eval(content_format); // article body text
        if (source_format == "") fetch.source = fetch.source_name;
        else fetch.source = eval(source_format); // original source
        if (desc_format == "") fetch.desc = fetch.title;
        else fetch.desc = eval(desc_format); // summary
        // var filename = source_name + "_" + (new Date()).toFormat("YYYY-MM-DD") +
        //     "_" + myURL.substr(myURL.lastIndexOf('/') + 1) + ".json";
        // fs.writeFileSync(filename, JSON.stringify(fetch)); // JSON-file alternative
        var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        var fetchAddSql_Params = [fetch.url, fetch.source_name, fetch.source_encoding,
            fetch.title, fetch.keywords, fetch.author, fetch.publish_date,
            fetch.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), fetch.content
        ];
        // parameterized insert; the unique url column blocks duplicate rows
        mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        });
    });
}
运行爬虫程序,先安装相应依赖包 npm i 包。然后爬取成功
用mysql查询已爬取的数据
这里出现了一个麻烦的问题,就是网站爬取成功,但是数据库里显示不出来。
用网页发送请求到后端查询
1.首先创建一个7.02.html作为网页端(前端)
<!DOCTYPE html>
<!-- 7.02.html: minimal front end. Submits a title keyword via GET back to
     the bare Node http server in 7.02.js (which serves this same file). -->
<html>
<body>
<form action="http://127.0.0.1:8080/7.02.html" method="GET">
<br> 标题:<input type="text" name="title">
<input type="submit" value="Submit">
</form>
<script>
</script>
</body>
</html>
2.用网页发送请求到后端查询 再创建一个7.02.js作为后端
// Backend for 7.02.html: a bare http server that serves the requested file
// and, when a ?title= query is present, echoes the params and looks the
// title up in the fetches table.
var http = require('http');
var fs = require('fs');
var url = require('url');
var mysql = require('./mysql.js');
http.createServer(function(request, response) {
    var pathname = url.parse(request.url).pathname;
    var params = url.parse(request.url, true).query;
    // NOTE(review): serving pathname straight from the URL permits path
    // traversal; restrict to known files before exposing this server.
    fs.readFile(pathname.substr(1), function(err, data) {
        response.writeHead(200, { 'Content-Type': 'text/html; charset=utf-8' });
        if ((params.title === undefined) && (data !== undefined))
            response.write(data.toString());
        else {
            response.write(JSON.stringify(params));
            // Bug fix: the title used to be concatenated into the SQL string —
            // an SQL injection hole. Use a parameterized LIKE query instead
            // (mysql.js already supports query(sql, params, cb)).
            var select_Sql = "select title,author,publish_date from fetches where title like ?";
            mysql.query(select_Sql, ['%' + params.title + '%'], function(qerr, vals, fields) {
                // Results are only logged; the response has already been ended
                // by the time this async callback runs.
                console.log(vals);
            });
        }
        response.end();
    });
}).listen(8080);
console.log('Server running at http://127.0.0.1:8080/');
3.用express构建网站访问mysql 同样先创建一个7.03.html作为前端
<!DOCTYPE html>
<!-- 7.03.html: front end for the express backend in 7.03.js. Submits a
     title keyword via GET to the /process_get route. -->
<html>
<body>
<form action="http://127.0.0.1:8080/process_get" method="GET">
<br> 标题:<input type="text" name="title">
<input type="submit" value="Submit">
</form>
<script>
</script>
</body>
</html>
4.用express构建网站访问mysql 再创建一个7.03.js作为后端
// Express backend: serves the query pages and exposes /process_get, which
// looks up titles in the fetches table and returns the rows as JSON.
var express = require('express');
var mysql = require('./mysql.js')
var app = express();
//app.use(express.static('public'));
app.get('/7.03.html', function(req, res) {
    res.sendFile(__dirname + "/" + "7.03.html");
})
app.get('/7.04.html', function(req, res) {
    res.sendFile(__dirname + "/" + "7.04.html");
})
app.get('/process_get', function(req, res) {
    res.writeHead(200, { 'Content-Type': 'text/html;charset=utf-8' }); // respond as utf-8
    // Bug fix: req.query.title used to be concatenated into the SQL string —
    // an SQL injection hole. Use a parameterized LIKE query instead
    // (mysql.js already supports query(sql, params, cb)).
    var fetchSql = "select url,source_name,title,author,publish_date from fetches where title like ?";
    mysql.query(fetchSql, ['%' + req.query.title + '%'], function(err, result, fields) {
        console.log(result);
        res.end(JSON.stringify(result));
    });
})
var server = app.listen(8080, function() {
    console.log("访问地址为 http://127.0.0.1:8080/7.03.html")
})
Node运行7.03.js后访问http://127.0.0.1:8080/7.03.html
输入任意查询词点击submit
用表格显示查询结果
用express脚手架来创建一个网站框架
express -e search_site
在search_site文件下cmd运行,node bin/www(这里学到一个特别简单的方法,只要打开相应文件夹,在路径一栏删去文件路径,输入cmd回车即可在该文件夹下cmd运行,其他方法还可以文件同时点击shift键+右键,但是由于只有powershell窗口,且修改成cmd特别麻烦,故没使用)
创建search.html后,运行http://127.0.0.1:3000/search.html
<!DOCTYPE html>
<!-- search.html: query the fetches table through /process_get and render the
     returned rows as an HTML table. -->
<html>
<!-- Bug fix: the original used <header> (a body-level element) where the
     document metadata section <head> belongs. -->
<head>
    <script src="https://cdn.bootcss.com/jquery/3.4.1/jquery.js"></script>
</head>
<body>
    <form>
        <br> 标题:<input type="text" name="title_text">
        <input class="form-submit" type="button" value="查询">
    </form>
    <div class="cardLayout" style="margin: 10px 0px">
        <table width="100%" id="record2"></table>
    </div>
    <script>
        $(document).ready(function() {
            $("input:button").click(function() {
                // Bug fix: the keyword was concatenated raw into the query
                // string, so characters like &, #, + or spaces corrupted the
                // request. Encode it first.
                var keyword = encodeURIComponent($("input:text").val());
                $.get('/process_get?title=' + keyword, function(data) {
                    $("#record2").empty();
                    $("#record2").append('<tr class="cardLayout"><td>url</td><td>source_name</td>' +
                        '<td>title</td><td>author</td><td>publish_date</td></tr>');
                    // NOTE(review): row values are inserted as raw HTML, so
                    // crawled content containing markup could inject script
                    // (XSS) — consider escaping before shipping this.
                    for (let list of data) {
                        let table = '<tr class="cardLayout"><td>';
                        Object.values(list).forEach(element => {
                            table += (element + '</td><td>');
                        });
                        $("#record2").append(table + '</td></tr>');
                    }
                });
            });
        });
    </script>
</body>
</html>
由于网易新闻一直无法在数据库里显示,我只好换一个再爬
观察者网
其他部分基本不变
修改的部分如下
// Target: guancha.cn (观察者网), UTF-8 encoded.
var source_name = "观察者";
var domain = 'https://www.guancha.cn/';
var myEncoding = "UTF-8";
var seedURL = 'https://www.guancha.cn/';
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"Keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
// NOTE(review): '#time fix' selects a <fix> descendant of #time — the author
// likely meant '#time.fix' or '.time.fix'; confirm against the page markup.
var date_format = "$('#time fix').text()";
var author_format = "$('meta[name=\"author\"]').eq(0).attr(\"content\")";
// NOTE(review): 'all-txt' selects an <all-txt> element — probably a missing
// leading '.' for a class selector; confirm.
var content_format = "$('all-txt').text()";
var desc_format = " $('meta[name=\"Description\"]').eq(0).attr(\"content\")";
var source_format = "$('#date-from').text()";
// Article URLs: www.guancha.cn/internation/2021_04_30_589327.shtml
// Bug fix: escaped the '.' before "shtml" (an unescaped dot matches any char).
var url_reg = /\/(\d{4})_(\d{2})_(\d{2})_(\d{6})\.shtml/; // /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7}).shtml/;
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/
这次爬取成功了,数据库里也有显示了😀但是keywords没有
然后我改不了只好又爬了新浪新闻国内新闻
// Target: news.sina.com.cn (新浪新闻), UTF-8 encoded.
var source_name = "新浪新闻";
var domain = 'https://news.sina.com.cn/';
var myEncoding = "UTF-8";
var seedURL = 'https://news.sina.com.cn/';
var seedURL_format = "$('a')";
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = " $('meta[property=\"og:title\"]').eq(0).attr(\"content\")";
var date_format = " $('meta[property=\"article:published_time\"]').eq(0).attr(\"content\")";
var author_format = "$('.show_author').text()";
var content_format = "$('.article').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('.source').text()";
// Article URLs: .../2021-04-30/doc-ikmxzfmk9873302.shtml
// Bug fix: escaped the '.' before "shtml" (an unescaped dot matches any char).
var url_reg = /\/(\d{4})-(\d{2})-(\d{2})\/doc-([A-Za-z0-9]+)\.shtml/; //2021-04-30/doc-ikmxzfmk9873302.shtml
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/
数据虽然可以,但是有部分显示为乱码或者NULL;
最后用表格显示查询数据时也一直有问题
没有找到界面,就很疑惑😥
我就借了同学的电脑运行我的爬虫,结果是可以的,查询成功
那应该是我自己的配置问题。以下是澎湃新闻的展示结果。