猿人学爬虫题目第一题: 《抓取所有机票价格》,该案例非常适合js新手入门。
题目链接:http://match.yuanrenxue.com/match/1
F12打开控制台,可见debugger出现,右键选择Never pause here;
接下来F5,就可以跳过debugger了,查看数据包。
看一下请求参数:
直接点击查看Initiator进行调试。
随便选一个进来断点,选择下一页触发debug
没找到就点右侧的call stack,按顺序点一下看
点到request,可以看到有一段不能格式化的代码,最好是把这段代码全部复制到html文件中,然后自己处理一下。
从这里可以看出:
timestamp = 13位时间戳 + 100000000;
把没用的删除掉,开始自己拼oo0O0()方法。
其中window.a:
然后缺什么补什么,在控制台去找更方便一些。
执行时如果报错:
- ReferenceError: atob is not defined
atob方法:浏览器内置的 native code,用于解码使用 base-64 编码的字符串。
如果有node环境的话,可以安装一下 npm install atob
(其实也可以换一种方法处理,比如返回数据后使用python来进行base64转换)
const atob = require('atob');
// npm install atob
/**
 * First-draft Node.js port of the page's obfuscated `oo0O0` routine.
 *
 * Rebuilds a Base64 string from `window.a`, decodes it with `atob`, patches the
 * quoted `mw` value into the decoded text, evaluates the result and logs the
 * value produced by the evaluation.
 *
 * NOTE(review): `window.a` below is only a placeholder; the real payload copied
 * from the page has to be pasted in before this can produce a meaningful value.
 * NOTE(review): `window` and `document` are assigned without a declaration, so
 * they become globals in sloppy mode and the assignment throws in strict mode.
 *
 * @param {*} mw - value that is wrapped in single quotes and spliced into the decoded script
 * @returns {string} always the empty string; the evaluated value is only logged
 */
function oo0O0(mw) {
// Minimal stand-ins for the browser globals that the routine reads and writes.
window = {};
document = {};
window.b = '';
window.a = ' 太长了,这里不贴进来了'
// The four fragments below concatenate to 'fromCharCode' and 'charCodeAt'.
document.e = 'fromC';
document.g = 'harCode';
document.f = 'charCo';
document.h = 'deAt';
window.c = 5;
// Shift every character of window.a down by (index + window.c) and append it to window.b.
for (var i = 0, len = window.a.length; i < len; i++) {
window.b += String[document.e + document.g](window.a[i][document.f + document.h]() - i - window.c)
}
// Encrypted string table; entries are decoded on demand by J.
var U = ['W5r5W6VdIHZcT8kU', 'WQ8CWRaxWQirAW=='];
// J(index, key): returns the decoded form of U[index], caching the result per index.
var J = function (o, E) {
// Coerce a hex-string index such as '0x0' to a number.
o = o - 0x0;
var N = U[o];
// One-time setup: the helpers are created on the first call only.
if (J['bSSGte'] === undefined) {
// Base64 decoder whose alphabet lists the lowercase letters before the uppercase ones.
var Y = function (w) {
var m = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/=',
T = String(w)['replace'](/=+$/, '');
var A = '';
for (var C = 0x0, b, W, l = 0x0; W = T['charAt'](l++); ~W && (b = C % 0x4 ? b * 0x40 + W : W, C++ % 0x4) ? A += String['fromCharCode'](0xff & b >> (-0x2 * C & 0x6)) : 0x0) {
W = m['indexOf'](W)
}
return A
};
// RC4-style decryption of the Base64-decoded text `w` with key string `m`.
var t = function (w, m) {
var T = [], A = 0x0, C, b = '', W = '';
w = Y(w);
// Percent-encode every char code, then let decodeURIComponent rebuild the string.
for (var R = 0x0, v = w['length']; R < v; R++) {
W += '%' + ('00' + w['charCodeAt'](R)['toString'](0x10))['slice'](-0x2)
}
w = decodeURIComponent(W);
var l;
// Initialise the 256-entry state table to the identity permutation.
for (l = 0x0; l < 0x100; l++) {
T[l] = l
}
// Key scheduling: shuffle the table using the key characters.
for (l = 0x0; l < 0x100; l++) {
A = (A + T[l] + m['charCodeAt'](l % m['length'])) % 0x100, C = T[l], T[l] = T[A], T[A] = C
}
l = 0x0, A = 0x0;
// Keystream generation: XOR each input character with the next keystream value.
for (var L = 0x0; L < w['length']; L++) {
l = (l + 0x1) % 0x100, A = (A + T[l]) % 0x100, C = T[l], T[l] = T[A], T[A] = C, b += String['fromCharCode'](w['charCodeAt'](L) ^ T[(T[l] + T[A]) % 0x100])
}
return b
};
// Store the decryptor, an empty result cache and the "initialised" flag on J itself.
J['luAabU'] = t, J['qlVPZg'] = {}, J['bSSGte'] = !![]
}
var H = J['qlVPZg'][o];
// Cache miss: decrypt with key E and remember the result; cache hit: reuse the stored value.
return H === undefined ? (J['TUDBIJ'] === undefined && (J['TUDBIJ'] = !![]), N = J['luAabU'](N, E), J['qlVPZg'][o] = N) : N = H, N
};
// Base64-decode window.b, call the method named by J('0x0', ...) with the token J('0x1', ...)
// and the single-quoted mw, then evaluate the resulting source text.
// NOTE(review): presumably a string-replace call; the decoded names are not visible here.
// NOTE(review): eval runs code recovered from the remote page; only use with a payload you trust.
console.log(eval(atob(window['b'])[J('0x0', ']dQW')](J('0x1', 'GTu!'), '\x27' + mw + '\x27')));
return ''
}
// Sample invocation: oo0O0 logs the evaluated value itself, then this logs the empty string it returns.
console.log(oo0O0(1611212930238))
到这里并没有结束,返回最开始,可以看到 m 其实是加密之后再加上window.f,
然后以字符串形式和 timestamp/1000进行拼接,得到最终的参数m。
打印了一下 window.f,发现不是固定参数,
全局中也没有搜索出来
这说明 window.f 可能是被更改了字符或者编码,或者用了别的写法代替
试了好几个地方,才找到window.f
这里打印了下 base64转码后的代码,发现是一段js,格式化之后删除无用的内容。
所以再看这个图,整体的生成流程就已经掌握了。
另外我修改后生成了两遍md5的值,后来发现 oo0O0()并没有返回值 = =,是我自己加上了。
并且oo0O0()和window.f相同。
测试: 把 node 生成的m拿出来,直接使用python requests 请求,成功返回数据。
完整js代码:
const atob = require('atob');
// npm install atob
/**
 * Node.js port of the page's obfuscated `oo0O0` routine.
 *
 * Rebuilds a Base64 string from `window.a`, decodes it with `atob`, patches the
 * quoted `mw` value into the decoded text, evaluates the result and returns the
 * value produced by the evaluation.
 *
 * NOTE(review): `window.a` below is only a placeholder; the real payload copied
 * from the page has to be pasted in before this can produce a meaningful value.
 * NOTE(review): `window` and `document` are deliberately left as undeclared
 * (global) assignments because the evaluated payload is not visible here and
 * may look them up globally; this requires sloppy-mode (CommonJS) execution.
 *
 * @param {*} mw - value that is wrapped in single quotes and spliced into the decoded script
 * @returns {*} whatever the evaluated script yields
 */
function oo0O0(mw) {
    // Minimal stand-ins for the browser globals that the routine reads and writes.
    window = {};
    document = {};
    window.b = '';
    window.a = "!!!!!!!!!A太长了,此处省略了!!!!!!!!!!!!!!!!!"
    // The four fragments below concatenate to 'fromCharCode' and 'charCodeAt'.
    document.e = 'fromC';
    document.g = 'harCode';
    document.f = 'charCo';
    document.h = 'deAt';
    window.c = 5;
    // Shift every character of window.a down by (index + window.c) and append it to window.b.
    for (var i = 0, len = window.a.length; i < len; i++) {
        window.b += String[document.e + document.g](window.a[i][document.f + document.h]() - i - window.c)
    }
    // Encrypted string table; entries are decoded on demand by J.
    var U = ['W5r5W6VdIHZcT8kU', 'WQ8CWRaxWQirAW=='];
    // J(index, key): returns the decoded form of U[index], caching the result per index.
    var J = function (o, E) {
        // Coerce a hex-string index such as '0x0' to a number.
        o = o - 0x0;
        var N = U[o];
        // One-time setup: the helpers are created on the first call only.
        if (J['bSSGte'] === undefined) {
            // Base64 decoder whose alphabet lists the lowercase letters before the uppercase ones.
            var Y = function (w) {
                var m = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789+/=',
                    T = String(w)['replace'](/=+$/, '');
                var A = '';
                for (var C = 0x0, b, W, l = 0x0; W = T['charAt'](l++); ~W && (b = C % 0x4 ? b * 0x40 + W : W, C++ % 0x4) ? A += String['fromCharCode'](0xff & b >> (-0x2 * C & 0x6)) : 0x0) {
                    W = m['indexOf'](W)
                }
                return A
            };
            // RC4-style decryption of the Base64-decoded text `w` with key string `m`.
            var t = function (w, m) {
                var T = [], A = 0x0, C, b = '', W = '';
                w = Y(w);
                // Percent-encode every char code, then let decodeURIComponent rebuild the string.
                for (var R = 0x0, v = w['length']; R < v; R++) {
                    W += '%' + ('00' + w['charCodeAt'](R)['toString'](0x10))['slice'](-0x2)
                }
                w = decodeURIComponent(W);
                var l;
                // Initialise the 256-entry state table to the identity permutation.
                for (l = 0x0; l < 0x100; l++) {
                    T[l] = l
                }
                // Key scheduling: shuffle the table using the key characters.
                for (l = 0x0; l < 0x100; l++) {
                    A = (A + T[l] + m['charCodeAt'](l % m['length'])) % 0x100, C = T[l], T[l] = T[A], T[A] = C
                }
                l = 0x0, A = 0x0;
                // Keystream generation: XOR each input character with the next keystream value.
                for (var L = 0x0; L < w['length']; L++) {
                    l = (l + 0x1) % 0x100, A = (A + T[l]) % 0x100, C = T[l], T[l] = T[A], T[A] = C, b += String['fromCharCode'](w['charCodeAt'](L) ^ T[(T[l] + T[A]) % 0x100])
                }
                return b
            };
            // Store the decryptor, an empty result cache and the "initialised" flag on J itself.
            J['luAabU'] = t, J['qlVPZg'] = {}, J['bSSGte'] = !![]
        }
        var H = J['qlVPZg'][o];
        // Cache miss: decrypt with key E and remember the result; cache hit: reuse the stored value.
        return H === undefined ? (J['TUDBIJ'] === undefined && (J['TUDBIJ'] = !![]), N = J['luAabU'](N, E), J['qlVPZg'][o] = N) : N = H, N
    };
    // Base64-decode window.b, call the method named by J('0x0', ...) with the token J('0x1', ...)
    // and the single-quoted mw, then evaluate the resulting source text.
    // NOTE(review): presumably a string-replace call; the decoded names are not visible here.
    // NOTE(review): eval runs code recovered from the remote page; only use with a payload you trust.
    // The value is returned directly so that no undeclared `result` global is created.
    return eval(atob(window['b'])[J('0x0', ']dQW')](J('0x1', 'GTu!'), '\x27' + mw + '\x27'));
}
/**
 * Builds the request parameter `m`.
 *
 * Takes the current time via `Date.parse(new Date())` (a millisecond value with
 * whole-second precision), shifts it forward by 100000000 ms, feeds the shifted
 * value to `oo0O0` as a string, and joins that output with the shifted time in
 * seconds using the `丨` separator. The finished parameter is also logged.
 *
 * @returns {string} `<oo0O0 output>丨<shifted timestamp / 1000>`
 */
function getM() {
    const shiftedMillis = Date.parse(new Date()) + 100000000;
    const digest = oo0O0(String(shiftedMillis));
    const shiftedSeconds = shiftedMillis / 1000;
    const param = digest + '丨' + shiftedSeconds;
    console.log(param);
    return param;
}
调用 getM() 方法即可得到最终的参数 m。