每个网站通常都会遇到很多非搜索引擎的爬虫,这些爬虫大部分都是用于内容采集或是初学者所写,它们和搜索引擎的爬虫不一样,没有频率控制,往往会消耗大量服务器资源,导致带宽白白浪费了。
其实nginx可以非常容易地根据user-agent过滤请求,我们只需要在需要过滤的url入口位置加上一个简单的正则表达式,就可以过滤掉不符合要求的爬虫请求:
location / {
    # Reject requests whose User-Agent matches one of the listed scripting
    # clients (case-insensitive regex match) with a 503 response.
    if ($http_user_agent ~* "python|curl|java|wget|httpclient|okhttp") {
        return 503;
    }

    # other regular configuration
    ...
}
注意:变量$http_user_agent是一个可以直接在location中引用的nginx变量。~*表示不区分大小写的正则匹配,通过python就可以过滤掉80%的python爬虫。
nginx中禁止(屏蔽)网络爬虫
# Virtual host that answers 403 to requests whose User-Agent matches a list of
# well-known crawlers and proxies every other request to localhost:8080.
server {
    listen 80;
    server_name www.xxx.com;

    #charset koi8-r;
    #access_log logs/host.access.log main;

    #location / {
    #    root html;
    #    index index.html index.htm;
    #}

    # Case-insensitive (~*) regex match against the User-Agent header.
    # The pattern MUST be quoted: several alternatives contain spaces, and an
    # unquoted pattern is split into separate arguments on whitespace, so the
    # condition no longer parses as a single regular expression.
    if ($http_user_agent ~* "qihoobot|baiduspider|googlebot|googlebot-mobile|googlebot-image|mediapartners-google|adsbot-google|feedfetcher-google|yahoo! slurp|yahoo! slurp china|youdaobot|sosospider|sogou spider|sogou web spider|msnbot|ia_archiver|tomato bot") {
        return 403;
    }

    # Regex location matching every request URI; everything is handed to the
    # backend listening on localhost:8080.
    location ~ ^/(.*)$ {
        proxy_pass http://localhost:8080;
        proxy_redirect off;

        # Pass the requested host and the client address chain to the backend.
        proxy_set_header host $host;
        proxy_set_header x-real-ip $remote_addr;
        proxy_set_header x-forwarded-for $proxy_add_x_forwarded_for;

        # Request body limits.
        client_max_body_size 10m;
        client_body_buffer_size 128k;

        # Timeouts for talking to the backend.
        proxy_connect_timeout 90;
        proxy_send_timeout 90;
        proxy_read_timeout 90;

        # Response buffering.
        proxy_buffer_size 4k;
        proxy_buffers 4 32k;
        proxy_busy_buffers_size 64k;
        proxy_temp_file_write_size 64k;
    }

    #error_page 404 /404.html;

    # redirect server error pages to the static page /50x.html
    #
    error_page 500 502 503 504 /50x.html;
    location = /50x.html {
        root html;
    }

    # proxy the php scripts to apache listening on 127.0.0.1:80
    #
    #location ~ \.php$ {
    #    proxy_pass http://127.0.0.1;
    #}

    # pass the php scripts to fastcgi server listening on 127.0.0.1:9000
    #
    #location ~ \.php$ {
    #    root html;
    #    fastcgi_pass 127.0.0.1:9000;
    #    fastcgi_index index.php;
    #    fastcgi_param script_filename /scripts$fastcgi_script_name;
    #    include fastcgi_params;
    #}

    # deny access to .htaccess files, if apache's document root
    # concurs with nginx's one
    #
    #location ~ /\.ht {
    #    deny all;
    #}
}
可以用 curl 测试一下
# -A sets the User-Agent header sent with the request (lowercase -a is curl's
# --append upload flag and would leave "qihoobot" to be treated as a URL);
# -i includes the response headers in the output so the status line is visible.
curl -i -A "qihoobot" www.xxx.com
哪个域名注册商免实名的epic服务器离线进不了游戏怎么办 epic服务器离线进不了游戏如何解决Nginx内存池初始化配置技术讲解360浏览器保存网页账号密码的操作方法gpu云物理服务器价格程序放在云服务器性能测试商标申请对图样有要求吗阿里巴巴云服务器怎么添加库存