发布于  更新于 

加速乐反爬虫技术初探

通过requests库无法直接爬取,返回521状态码

1
2
3
4
5
6
7
8
>>> import requests
>>> req = requests.get('https://www.seebug.org/vuldb/ssvid-92666')
>>> req.headers
{'Date': 'Wed, 07 Jun 2017 08:44:51 GMT', 'Transfer-Encoding': 'chunked', 'Connection': 'keep-alive', 'Set-Cookie': '__jsluid=62187a5fa0b416b1dbf26f4fd7ff649c; max-age=31536000; path=/; HttpOnly', 'Server': 'nginx'}
>>> req.status_code
521
>>> req.content
'<script>var x="document@charAt@while@catch@7@Jun@parseInt@Path@1496825590@window@10@3@DOMContentLoaded@String@join@5CMN@fromCharCode@0@09@36@f@l@href@captcha@53@toString@2@X@cd@var@658@if@dc@cookie@3D@false@GMT@i@__phantomas@1500@e@challenge@for@Ek@function@_phantom@replace@1y@07@try@addEventListener@Expires@26l3@__jsl_clearance@Wed@17@location@setTimeout@q@1@length@eval@x@return@else@attachEvent@onreadystatechange".replace(/@*$/,"").split("@"),y="11 m=1g(){3(a.1h||a.1a){};11 10,14=\'1p=9.12|i|\';11 l=[1g(25){26 25},1g(25){26 25;},1g(25){26 24(\'e.h(\'+25+\')\')},1g(25){1e(11 19=i;19<25.23;19++){25[19]=7(25[19]).q(k)};26 25.f(\'\')}];10=[\'1f\',[[5]+(-~(((-~{% image 10[19] %}};10=10.f(\'\');14+=10;20(\'1s.n=1s.n.1i(/[\\\\?|&]o-1d/,\\\\\'\\\\\')\',1b);1.15=(14+\';1n=1q, 1k-6-1r j:p:b 18;8=/;\');};13((1g(){1l{26 !!a.1m;}4(1c){26 17;}})()){1.1m(\'d\',m,17);}27{1.28(\'29\',m);}",z=0,f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return c},g=y.match(/\\b\\w+\\b/g).sort(function(x,y){return f(x)-f(y)}).pop();while(f(g,++z)-x.length){};eval(y.replace(/\\b\\w+\\b/g, function(y){return x[f(y,z)-1]}));</script>\x00\x00\n'

通过浏览器访问,第一次访问时候明显有几秒延迟,之后页面正常打开

分析其521状态返回的js脚本

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
// Self-unpacking loader returned with the 521 response.
// x is the word dictionary: an '@'-separated list (trailing '@' characters are
// stripped first) split into an array.
var x = "document@charAt@while@catch@7@Jun@parseInt@Path@1496825590@window@10@3@DOMContentLoaded@String@join@5CMN@fromCharCode@0@09@36@f@l@href@captcha@53@toString@2@X@cd@var@658@if@dc@cookie@3D@false@GMT@i@__phantomas@1500@e@challenge@for@Ek@function@_phantom@replace@1y@07@try@addEventListener@Expires@26l3@__jsl_clearance@Wed@17@location@setTimeout@q@1@length@eval@x@return@else@attachEvent@onreadystatechange".replace(/@*$/, "").split("@"),
// y is the packed program text: every \b\w+\b token in it is an encoded,
// 1-based index into x (see the final replace below).
// NOTE(review): the `{% image 10[19] %}` fragment looks like blog-template
// residue (a Markdown image conversion) that replaced part of the original
// payload — confirm against a fresh server response before running this.
y = "11 m=1g(){3(a.1h||a.1a){};11 10,14='1p=9.12|i|';11 l=[1g(25){26 25},1g(25){26 25;},1g(25){26 24('e.h('+25+')')},1g(25){1e(11 19=i;19<25.23;19++){25[19]=7(25[19]).q(k)};26 25.f('')}];10=['1f',[[5]+(-~(((-~{% image 10[19] %}};10=10.f('');14+=10;20('1s.n=1s.n.1i(/[\\?|&]o-1d/,\\'\\')',1b);1.15=(14+';1n=1q, 1k-6-1r j:p:b 18;8=/;');};13((1g(){1l{26 !!a.1m;}4(1c){26 17;}})()){1.1m('d',m,17);}27{1.28('29',m);}",
// z is the numeric base used to decode tokens; it is discovered by the while
// loop further down.
z = 0,
// f(token, base): decodes a token string into a number, reading it as digits
// in the given base (99 when base is omitted or falsy).
f = function(x, y) {
var a = 0,
b = 0,
c = 0;
x = x.split("");
y = y || 99;
// Per character: char codes 65..90 ('A'-'Z') satisfy |code - 77.5| < 13 and
// become digit values 36..61 (code - 29); every other character goes through
// parseInt(ch, 36). The accumulator is shifted by the base each step.
while ((a = x.shift()) && (b = a.charCodeAt(0) - 77.5)) c = (Math.abs(b) < 13 ? (b + 48.5) : parseInt(a, 36)) + y * c;
return c
},
// g is the token of y with the largest decoded value under the default base
// (ascending sort by f, then pop).
g = y.match(/\b\w+\b/g).sort(function(x, y) {
return f(x) - f(y)
}).pop();
// Increment z until the largest token decodes to exactly x.length, i.e. until
// the base maps the highest index onto the last dictionary entry.
while (f(g, ++z) - x.length) {};
// Substitute every token with its dictionary word and execute the result.
eval(y.replace(/\b\w+\b/g, function(y) {
return x[f(y, z) - 1]
}));

脚本经过一系列的转换解码之后执行eval,我们将eval替换为console.log,可以看到其实际执行了如下代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
// Decoded challenge program (what the loader passes to eval).
// l builds the __jsl_clearance cookie value, stores it in document.cookie and
// schedules a reload of the current URL.
var l = function() {
// Spins forever when either PhantomJS-style marker is present on window.
while (window._phantom || window.__phantomas) {};
// dc starts with the cookie name plus a fixed prefix; cd will hold the
// remaining fragments.
var cd, dc = '__jsl_clearance=1496825590.658|0|';
// Fragment transformers, selected by index below:
//   f[0], f[1]: return the argument unchanged
//   f[2]: evaluates String.fromCharCode(<argument>)
//   f[3]: converts each element with parseInt(...).toString(36) and joins them
var f = [function(x) {
return x
}, function(x) {
return x;
}, function(x) {
return eval('String.fromCharCode(' + x + ')')
}, function(x) {
for (var i = 0; i < x.length; i++) {
x[i] = parseInt(x[i]).toString(36)
};
return x.join('')
}];
// Obfuscated fragments of the cookie value: string literals mixed with
// expressions built from !, ~, <<, ^ and array-to-string coercion.
cd = ['Ek', [
[7] + (-~(((-~![] << -~![])) * [(-~![] << -~![])]) + [] + [])
],
[3 - ~(((-~![] << -~![])) * [(-~![] << -~![])])],
[
[-~![] - ~![]] + [3 - ~(((-~![] << -~![])) * [(-~![] << -~![])])]
],
[
[3 - ~(((-~![] << -~![])) * [(-~![] << -~![])])] + (2 + 2 + [
[], (+!+[])
][~~![]]), [(-~![] - ~![] ^ -~[])] + [7]
], '2', [
[
[2] * ((-~![] - ~![] ^ -~[]))
] + [
[2] * ((-~![] - ~![] ^ -~[]))
]
], 'X', (-~(((-~![] << -~![])) * [(-~![] << -~![])]) + [] + []) + [{} + [] + []][0].charAt((-~[] << 3)), '26l3', [
[-~![] - ~![]] + [
[2] * ((-~![] - ~![] ^ -~[]))
]
], 'q', [
[3 - ~(((-~![] << -~![])) * [(-~![] << -~![])])] + [~~[]]
], '7', [((-~![] << -~![])) * [(-~![] << -~![])] + ([-~-~~~[]] + ~~[] >> -~-~~~[])], '5CMN', [
[
[2] * ((-~![] - ~![] ^ -~[]))
] + [7]
], '1y', [~~[]], '%3D'
];
// Apply to each fragment the transformer whose index is listed at the same
// position in the lookup table.
for (var i = 0; i < cd.length; i++) {
cd[i] = f[[1, 2, 0, 3, 2, 1, 2, 1, 0, 1, 3, 1, 2, 1, 0, 1, 2, 1, 0, 1][i]](cd[i])
};
cd = cd.join('');
dc += cd;
// After 1500 ms, navigate to the current URL with the first
// "?captcha-challenge" / "&captcha-challenge" match removed.
setTimeout('location.href=location.href.replace(/[\?|&]captcha-challenge/,\'\')', 1500);
// Store the assembled value as a cookie with a fixed expiry and Path=/.
document.cookie = (dc + ';Expires=Wed, 07-Jun-17 09:53:10 GMT;Path=/;');
};
// Register l: DOMContentLoaded via addEventListener when window has it,
// otherwise onreadystatechange via attachEvent.
if ((function() {
try {
return !!window.addEventListener;
} catch (e) {
return false;
}
})()) {
document.addEventListener('DOMContentLoaded', l, false);
} else {
document.attachEvent('onreadystatechange', l);
}

进一步分析while (window._phantom || window.__phantomas) {}; 当发现是_phantom或者__phantomas后就直接进入死循环了。

而最终l函数进一步解码,得到dc变量,并设置__jsl_clearance这个cookie。

另外也找到了之所以第一次访问seebug的时候会出现1.5秒的延迟的原因

setTimeout('location.href=location.href.replace(/[\?|&]captcha-challenge/,\'\')', 1500);

分析反爬逻辑

通过控制变量法,尝试清除每一个seebug的cookies,发现当没有__jsl_clearance或__jsluid这两个cookies的时候就会出现521状态码

进一步测试,__jsl_clearance和__jsluid两个cookies如果不是成对的话,也会出现521状态码。

python如何爬取

爬取转换为如何获得成对的__jsl_clearance和__jsluid

  • __jsluid可直接从头中获得

  • __jsl_clearance可通过将eval替换为eval_s获得

1
2
3
4
5
6
7
8
9
/**
 * Replacement for the loader's final `eval`: instead of running the decoded
 * challenge program as-is, it keeps only the statements that compute `dc`
 * and prints the resulting clearance value.
 *
 * Steps performed on the decoded program text:
 *   1. remove the `var l=function(){while(window._phantom||window.__phantomas){};`
 *      header (any whitespace layout), so the body runs as plain statements;
 *   2. remove the first `__jsl_clearance=` so `dc` holds only the value;
 *   3. cut everything from the first `setTimeout` onwards (the reload and the
 *      document.cookie assignment, which need a browser);
 *   4. append `console.log(dc);` and evaluate.
 *
 * @param {string} str - decoded challenge program text.
 * @returns {undefined} the value is written with console.log.
 * @throws {Error} when `str` contains no `setTimeout` marker; without this
 *   check `substring(0, -1)` would silently produce an empty program and the
 *   appended `console.log(dc)` would fail with an unrelated ReferenceError.
 */
var eval_s = function (str) {
    // Whitespace-tolerant form of the header so that both the minified text
    // and a beautified copy of it are recognised.
    var header = /var\s+l\s*=\s*function\s*\(\s*\)\s*\{\s*while\s*\(\s*window\._phantom\s*\|\|\s*window\.__phantomas\s*\)\s*\{\s*\}\s*;/;
    var program = str.replace(header, '');
    program = program.replace('__jsl_clearance=', '');

    var cut = program.indexOf('setTimeout');
    if (cut === -1) {
        throw new Error('eval_s: "setTimeout" marker not found in decoded script');
    }
    program = program.substring(0, cut) + "console.log(dc);";

    // SECURITY: this executes code supplied by the remote server. Only run it
    // in a throwaway/sandboxed interpreter.
    eval(program);
};

// Packed loader with its final call changed from eval(...) to eval_s(...):
// the word dictionary x and packed text y are decoded exactly as before, but
// the resulting program text is handed to eval_s instead of being executed.
// NOTE(review): the `{% image 10[19] %}` fragment inside y looks like
// blog-template residue, not part of the original payload — confirm.
var x="document@charAt@while@catch@7@Jun@parseInt@Path@1496825590@window@10@3@DOMContentLoaded@String@join@5CMN@fromCharCode@0@09@36@f@l@href@captcha@53@toString@2@X@cd@var@658@if@dc@cookie@3D@false@GMT@i@__phantomas@1500@e@challenge@for@Ek@function@_phantom@replace@1y@07@try@addEventListener@Expires@26l3@__jsl_clearance@Wed@17@location@setTimeout@q@1@length@eval@x@return@else@attachEvent@onreadystatechange".replace(/@*$/,"").split("@"),y="11 m=1g(){3(a.1h||a.1a){};11 10,14='1p=9.12|i|';11 l=[1g(25){26 25},1g(25){26 25;},1g(25){26 24('e.h('+25+')')},1g(25){1e(11 19=i;19<25.23;19++){25[19]=7(25[19]).q(k)};26 25.f('')}];10=['1f',[[5]+(-~(((-~{% image 10[19] %}};10=10.f('');14+=10;20('1s.n=1s.n.1i(/[\\?|&]o-1d/,\\'\\')',1b);1.15=(14+';1n=1q, 1k-6-1r j:p:b 18;8=/;');};13((1g(){1l{26 !!a.1m;}4(1c){26 17;}})()){1.1m('d',m,17);}27{1.28('29',m);}",z=0,f=function(x,y){var a=0,b=0,c=0;x=x.split("");y=y||99;while((a=x.shift())&&(b=a.charCodeAt(0)-77.5))c=(Math.abs(b)<13?(b+48.5):parseInt(a,36))+y*c;return c},g=y.match(/\b\w+\b/g).sort(function(x,y){return f(x)-f(y)}).pop();while(f(g,++z)-x.length){};eval_s(y.replace(/\b\w+\b/g, function(y){return x[f(y,z)-1]}));