我使用Node.js – async & request模块来抓取超过1亿的网站,几分钟后我不断遇到错误ESOCKETTIMEDOUT & ETIMEDOUT。
我重新启动脚本后,它又可以工作了。这似乎不是连接限制的问题,因为我仍然可以做resolve4, resolveNs, resolveMx和curl没有延迟。
你觉得代码有什么问题吗?或任何建议吗?我希望将async.queue()并发性至少提升到1000。谢谢你!
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 |
// Crawler: pulls stale domains from MySQL in batches, resolves them over
// DNS, and fetches each homepage through an async.queue of workers.
var request = require('request'),
    async = require('async'),
    mysql = require('mysql'),
    dns = require('dns'),
    url = require('url'),
    cheerio = require('cheerio'),
    iconv = require('iconv-lite'),
    charset = require('charset'),
    config = require('./spy.config'),
    pool = mysql.createPool(config.db);

iconv.skipDecodeWarning = true;

// Worker queue: each task is {id, domain}; concurrency 200.
var queue = async.queue(function (task, cb) {
    dns.resolve4('www.' + task.domain, function (err, addresses) {
        if (err) {
            //
            // Do something
            //
            setImmediate(function () { cb() });
            return;
        }
        request({
            url: 'http://www.' + task.domain,
            method: 'GET',
            encoding: 'binary',
            followRedirect: true,
            // FIX: the original object literal passed `pool` twice
            // (`pool: false` and `pool: {maxSockets: 1000}`); duplicate keys
            // mean only the last one survives, so `pool: false` was dead code.
            pool: { maxSockets: 1000 },
            timeout: 15000 // 15 sec
        }, function (error, response, body) {
            //console.info(task);
            if (!error) {
                // If ok, do something
            } else {
                // If not ok, do these
                console.log(error);
                // It keeps erroring here after few minutes; resolve4,
                // resolveNs, resolveMx still work at this point.
                // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
                // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }
                var ns = [], ip = [], mx = [];
                async.parallel([
                    function (callback) {
                        // Resolves the domain's name server records
                        dns.resolveNs(task.domain, function (err, addresses) {
                            if (!err) { ns = addresses; }
                            callback();
                        });
                    },
                    function (callback) {
                        // Resolves the domain's IPV4 addresses
                        dns.resolve4(task.domain, function (err, addresses) {
                            if (!err) { ip = addresses; }
                            callback();
                        });
                    },
                    function (callback) {
                        // Resolves the domain's MX records
                        dns.resolveMx(task.domain, function (err, addresses) {
                            if (!err) {
                                addresses.forEach(function (a) { mx.push(a.exchange); });
                            }
                            callback();
                        });
                    }
                ], function (err) {
                    // FIX: the original called `next(err)` here, but no `next`
                    // exists in scope — a non-null err would have thrown a
                    // ReferenceError. Log it instead.
                    if (err) { console.log(err); return; }
                    // do something
                });
            }
            // NOTE(review): cb() fires without waiting for async.parallel
            // above to finish — the task is marked done while the extra DNS
            // lookups may still be in flight. Kept as in the original.
            setImmediate(function () { cb() });
        });
    });
}, 200);

// When the queue is emptied we want to check if we're done.
queue.drain = function () {
    setImmediate(function () { checkDone() });
};

function consoleLog(msg) {
    //console.info(msg);
}

// Refills the queue once it has fully drained.
function checkDone() {
    if (queue.length() == 0) {
        setImmediate(function () { crawlQueue() });
    } else {
        console.log("checkDone() not zero");
    }
}

// Fire-and-forget SQL statement (connection errors are silently ignored).
function query(sql) {
    pool.getConnection(function (err, connection) {
        if (!err) {
            //console.log(sql);
            connection.query(sql, function (err, results) {
                connection.release();
            });
        }
    });
}

// Loads the next batch of domains not updated within 30 days (2592000 s)
// and pushes them onto the worker queue; exits the process when none remain.
function crawlQueue() {
    pool.getConnection(function (err, connection) {
        if (err) {
            setImmediate(function () { crawlQueue() });
            return;
        }
        // FIX: the original SQL was missing the WHERE keyword
        // ("SELECT * FROM domain last_update < ..."), a syntax error that
        // made every batch query fail and fall into the retry branch forever.
        var sql = "SELECT * FROM domain WHERE last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";
        connection.query(sql, function (err, results) {
            if (!err) {
                if (results.length) {
                    for (var i = 0, len = results.length; i < len; ++i) {
                        queue.push({ "id": results[i]['id'], "domain": results[i]['domain'] });
                    }
                } else {
                    process.exit();
                }
                connection.release();
            } else {
                connection.release();
                setImmediate(function () { crawlQueue() });
            }
        });
    });
}

setImmediate(function () { crawlQueue() });
And the system limits are pretty high.
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 |
Limit Soft Limit Hard Limit Units Max cpu time unlimited unlimited seconds Max file size unlimited unlimited bytes Max data size unlimited unlimited bytes Max stack size 8388608 unlimited bytes Max core file size 0 unlimited bytes Max resident set unlimited unlimited bytes Max processes 257645 257645 processes Max open files 500000 500000 files Max locked memory 65536 65536 bytes Max address space unlimited unlimited bytes Max file locks unlimited unlimited locks Max pending signals 257645 257645 signals Max msgqueue size 819200 819200 bytes Max nice priority 0 0 Max realtime priority 0 0 Max realtime timeout unlimited unlimited us |
sysctl
0 1 |
net.ipv4.ip_local_port_range = 10000 61000 |
———- ———- ———- ———- ———-
默认情况下,Node 只有 4 个 libuv 工作线程来执行 DNS 查询。如果你的 DNS 查询耗时较长,请求就会在 DNS 阶段阻塞,这正是出现 ESOCKETTIMEDOUT 或 ETIMEDOUT 错误的原因。
尝试增加你的uv线程池大小:
0 1 2 |
export UV_THREADPOOL_SIZE=128
node ...
或index.js(或你的入口点):
0 1 2 3 4 5 6 |
#!/usr/bin/env node process.env.UV_THREADPOOL_SIZE = 128; function main() { ... } |
———- ———- ———- ———- ———-
在阅读相关文章之后,我最初通过在 request 的选项中设置 "agent: false" 来解决这个问题。
2017年10月31日上述最初的回应似乎并没有完全解决问题。我们找到的最终解决方案是在代理中使用keepAlive选项。例如:
0 1 2 3 4 5 6 7 8 9 |
// Shared HTTPS agent with keep-alive enabled, so sockets are reused across
// requests instead of a brand-new connection being opened for each one.
var pool = new https.Agent({ keepAlive: true });

// Builds the request options for a JSON GET routed through the
// keep-alive agent above.
function getJsonOptions(targetUrl) {
    var options = {
        url: targetUrl,
        method: 'GET',
        agent: pool,
        json: true
    };
    return options;
}
Node的默认池似乎默认为keepAlive=false,这会导致在每个请求上创建一个新连接。如果在短时间内创建了太多的连接,就会出现上述错误。我的猜测是一个或多个路由器沿着路径到服务阻塞连接请求,可能在怀疑拒绝服务攻击。无论如何,上面的代码示例完全解决了我们的问题。
———- ———- ———- ———- ———-
相关知识:http://nodejs.cn/api/http.html#http_class_http_agent
原文:https://stackoverflow.com/questions/24320578/node-js-get-request-etimedout-esockettimedout