JavaScript 爬虫js js爬虫代码

转载

网络安全专家 2023-12-10 09:09:36

文章标签 JavaScript 爬虫js 爬虫前端 json ViewUI 文章分类 JavaScript 前端开发

为什么要用JS抓取数据？

有的网站安全性比较好，难以用程序模拟登录。这时可以先在浏览器中正常登录，再在页面里运行JS脚本：脚本会直接复用当前已登录的会话，从而绕开登录的限制。实现方法：使用Google Chrome登录目标站点的用户账号，然后在console中运行js脚本即可。实例：抓取淘宝卖家商品分类

/**
 * Crawler for a three-level category tree served by the Taobao
 * `reload_cats.htm` JSON endpoint. Intended to be pasted into the browser
 * console of a page that is already logged in.
 *
 * Flow: `init` requests the full first/second level listing; `first_r` stores
 * it and issues one follow-up request per second-level entry; each follow-up
 * response is handled by a closure from `second_r`, which fills that entry's
 * `data` array with the third-level entries.
 */
var CAT = {
    // Collected tree, filled in asynchronously. Shape:
    // [{id: '', name: '', data: [{id: '', name: '', spell: '', data: [{id: '', name: '', spell: ''}]}, ...]}, ...]
    data: [],

    /**
     * Build the endpoint URL.
     * @returns {string} the endpoint URL with a random `t` query value
     *     (presumably a cache-buster; the server's use of it is not visible here).
     */
    url: function () {
        return 'https://upload.taobao.com/auction/json/reload_cats.htm?t=' + Math.random();
    },

    /**
     * Start the crawl by requesting the top-level listing (`path=all`).
     */
    init: function () {
        CAT.ajax(CAT.url(), 'path=all', CAT.first_r);
    },

    /**
     * Handle the `path=all` response: record every first-level group with its
     * second-level entries in `CAT.data`, and request the third level for each
     * second-level entry.
     * @param {Array} data parsed JSON response; `data[0].data` is read as the
     *     list of first-level groups.
     */
    first_r: function (data) {
        // Same guard as second_r: an empty or missing payload has nothing to walk.
        if (!data || data.length < 1) {
            return;
        }
        var groups = data[0]['data'],
            i, j, children, secondLevel, entry;
        for (i = 0; i < groups.length; i++) {
            children = groups[i]['data'];
            secondLevel = [];
            for (j = 0; j < children.length; j++) {
                // Store the second-level entry now; its `data` array is the
                // container the third-level response will be pushed into later.
                entry = {
                    'id': children[j]['sid'],
                    'name': children[j]['name'],
                    'spell': children[j]['spell'],
                    'data': []
                };
                secondLevel.push(entry);
                CAT.ajax(
                    CAT.url(),
                    // Encode the id so it cannot break the form-encoded body.
                    'path=next&sid=' + encodeURIComponent(children[j]['sid']),
                    CAT.second_r(entry['data'])
                );
            }
            CAT.data.push({
                'id': groups[i]['id'],
                'name': groups[i]['name'],
                'data': secondLevel
            });
        }
    },

    /**
     * Create the response handler for one `path=next` request.
     * @param {Array} container array that receives the third-level entries
     *     (`{id, name, spell}`) found in the response.
     * @returns {function(Array): void} callback suitable for `CAT.ajax`.
     */
    second_r: function (container) {
        return function (data) {
            if (!data || data.length < 1) {
                return;
            }
            var groups = data[0]['data'],
                i, j, items, item;
            for (i = 0; i < groups.length; i++) {
                items = groups[i]['data'];
                for (j = 0; j < items.length; j++) {
                    item = items[j];
                    container.push({
                        'id': item['sid'],
                        'name': item['name'],
                        'spell': item['spell']
                    });
                }
            }
        };
    },

    /**
     * POST a form-encoded body and pass the parsed JSON response to `func`.
     * Any completed request whose status is not 200, or whose body is not
     * valid JSON, is logged and `func` is not called.
     * @param {string} url request URL.
     * @param {string} post_data body in `a=1&b=2` form.
     * @param {function(*): void} func called with the parsed response.
     */
    ajax: function (url, post_data, func) {
        var xhr = new XMLHttpRequest();
        xhr.open('POST', url, true);
        xhr.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
        // Attach the handler before send() so no state change can be missed.
        xhr.onreadystatechange = function () {
            var result;
            if (xhr.readyState !== 4) {
                return;
            }
            // The original test `(!xhr.status==200)` compared a boolean with 200
            // and was therefore never true; compare the status directly.
            if (xhr.status !== 200) {
                console.log('Ajax Return Error!');
                return;
            }
            try {
                result = JSON.parse(xhr.responseText);
            } catch (e) {
                console.log('Ajax Return Error!', e);
                return;
            }
            // Called outside the try block so errors thrown by the callback
            // itself are not mistaken for parse errors.
            func(result);
        };
        xhr.send(post_data);
    }
};
// Start the crawl. The requests are asynchronous, so CAT.data is only
// populated after the responses have arrived.
CAT.init(); 
// Once the requests have finished, the collected tree can be inspected with:
//console.log(CAT.data);

用JS代码抓取网页数据有一定的局限性：数据源必须能够通过ajax获取，并且返回的数据是json格式。

http://www.test.com跨域抓取数据例子

/**
 * Cross-origin forwarding example: fetch the child categories of one category
 * from the Taobao `reload_cats.htm` endpoint and POST every entry to a
 * collector URL on another origin.
 */
var CAT = {
    // Category id whose children are fetched; also forwarded with every entry.
    sid: '50024865',

    /**
     * Build the endpoint URL.
     * @returns {string} the endpoint URL with a random `t` query value
     *     (presumably a cache-buster; the server's use of it is not visible here).
     */
    url: function () {
        return 'https://upload.taobao.com/auction/json/reload_cats.htm?t=' + Math.random();
    },

    /**
     * Start the crawl by requesting the children of `CAT.sid`.
     */
    init: function () {
        CAT.ajax(CAT.url(), 'path=next&sid=' + encodeURIComponent(CAT.sid), CAT.first_r);
    },

    /**
     * Handle the category response: POST each entry found under
     * `data[0].data[*].data[*]` to the collector URL.
     * @param {Array} data parsed JSON response.
     */
    first_r: function (data) {
        if (!data || data.length < 1) {
            return;
        }
        var groups = data[0]['data'],
            i, j, items, item, body;
        for (i = 0; i < groups.length; i++) {
            items = groups[i]['data'];
            for (j = 0; j < items.length; j++) {
                item = items[j];
                // Values must be percent-encoded: a name containing `&`, `=`,
                // `+` or `%` would otherwise corrupt the form-encoded body.
                body = 'id=' + encodeURIComponent(item['sid']) +
                    '&name=' + encodeURIComponent(item['name']) +
                    '&spell=' + encodeURIComponent(item['spell']) +
                    '&sid=' + encodeURIComponent(CAT.sid);
                // The receiving server has to allow this cross-origin request
                // (Access-Control-Allow-Origin).
                CAT.ajax("http://www.test.com/taobao/data", body, CAT.mycallback_r);
            }
        }
    },

    /**
     * Callback for the collector response; logs the parsed JSON.
     * @param {*} data parsed JSON returned by the collector.
     */
    mycallback_r: function (data) {
        console.log(data);
    },

    /**
     * POST a form-encoded body and pass the parsed JSON response to `func`.
     * Any completed request whose status is not 200, or whose body is not
     * valid JSON, is logged and `func` is not called.
     * @param {string} url request URL.
     * @param {string} post_data body in `a=1&b=2` form.
     * @param {function(*): void} func called with the parsed response.
     */
    ajax: function (url, post_data, func) {
        var xhr = new XMLHttpRequest();
        xhr.open('POST', url, true);
        xhr.setRequestHeader("Content-type", "application/x-www-form-urlencoded");
        // Attach the handler before send() so no state change can be missed.
        xhr.onreadystatechange = function () {
            var result;
            if (xhr.readyState !== 4) {
                return;
            }
            // The original test `(!xhr.status == 200)` compared a boolean with
            // 200 and was therefore never true; compare the status directly.
            if (xhr.status !== 200) {
                console.log('Ajax Return Error!');
                return;
            }
            try {
                result = JSON.parse(xhr.responseText);
            } catch (e) {
                console.log('Ajax Return Error!', e);
                return;
            }
            func(result);
        };
        xhr.send(post_data);
    }
};
// Start the crawl: request the child categories, then forward each entry to the collector URL.
CAT.init();

原生js实现Ajax方法：

/**
 * Minimal XMLHttpRequest wrapper offering asynchronous GET and POST helpers.
 * In both helpers the callback receives the raw response text and is invoked
 * with `this` bound to the same object the `onreadystatechange` handler was
 * called on.
 */
var Ajax = {
    /**
     * Send an asynchronous GET request.
     * @param {string} url request URL (append query parameters to it directly).
     * @param {function(string): void} fn called with `responseText` once the
     *     request has completed with status 200 or 304.
     */
    get: function (url, fn) {
        // XMLHttpRequest exchanges data with the server in the background.
        var xhr = new XMLHttpRequest();
        xhr.open('GET', url, true);
        xhr.onreadystatechange = function () {
            // readyState 4 means the request has completed. The status checks
            // are parenthesised: without the parentheses `&&` binds tighter
            // than `||`, so a 304 would fire the callback at every readyState
            // change, including before the body is available.
            if (xhr.readyState === 4 && (xhr.status === 200 || xhr.status === 304)) {
                // Hand the data received from the server to the caller.
                fn.call(this, xhr.responseText);
            }
        };
        xhr.send();
    },

    /**
     * Send an asynchronous form-encoded POST request.
     * @param {string} url request URL.
     * @param {string} data body in `a=a1&b=b1` form (jQuery would build this
     *     string automatically from an object; this helper does not).
     * @param {function(string): void} fn called with `responseText` once the
     *     request has completed with status 200 or 304.
     */
    post: function (url, data, fn) {
        var xhr = new XMLHttpRequest();
        xhr.open("POST", url, true);
        // Declare the encoding of the body being sent to the server.
        xhr.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
        xhr.onreadystatechange = function () {
            if (xhr.readyState === 4 && (xhr.status === 200 || xhr.status === 304)) {
                fn.call(this, xhr.responseText);
            }
        };
        xhr.send(data);
    }
};

注释：

1. open(method, url, async) 方法需要三个参数:

　 method：发送请求所使用的方法（GET或POST）；与POST相比，GET更简单也更快，并且在大部分情况下都能用；然而，在以下情况中，请使用POST请求：

无法使用缓存文件（更新服务器上的文件或数据库）
向服务器发送大量数据（POST 没有数据量限制）
发送包含未知字符的用户输入时，POST 比 GET 更稳定也更可靠

　url：规定服务器端脚本的 URL(该文件可以是任何类型的文件，比如 .txt 和 .xml，或者服务器脚本文件，比如 .asp 和 .php （在传回响应之前，能够在服务器上执行任务）)；

　async：规定应当对请求进行异步（true）或同步（false）处理；true是在等待服务器响应时执行其他脚本，当响应就绪后对响应进行处理；false是等待服务器响应再执行。

2. send() 方法可将请求送往服务器。

3. onreadystatechange：存有处理服务器响应的函数，每当 readyState 改变时，onreadystatechange 函数就会被执行。

4. readyState：存有服务器响应的状态信息。

0: 请求未初始化（代理被创建，但尚未调用 open() 方法）
1: 服务器连接已建立（open方法已经被调用）
2: 请求已接收（send方法已经被调用，并且头部和状态已经可获得）
3: 请求处理中（下载中，responseText 属性已经包含部分数据）
4: 请求已完成，且响应已就绪（下载操作已完成）

5. responseText：获得字符串形式的响应数据。

6. setRequestHeader()：POST传数据时，用来添加 HTTP 头，然后send(data)，注意data格式；GET发送信息时直接加参数到url上就可以，比如url?a=a1&b=b1。

本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

上一篇：swiftUI拨打电话 swift to

下一篇：android selinux 查看用户权限 linux查看权限信息

提问和评论都可以，用心的回复会被更多人看到评论

发布评论

相关文章

官方博客	全部文章	热门标签	班级博客
了解我们	网站地图	意见反馈

鸿蒙开发者社区	51CTO学堂
51CTO	软考资讯

JavaScript 爬虫js js爬虫代码

JavaScript 爬虫js js爬虫代码

51CTO博客