问题描述
我正在使用 Node.js 的 async 和 request 模块抓取 1 亿多个网站,但运行几分钟后总是遇到 ESOCKETTIMEDOUT 和 ETIMEDOUT 错误。
I'm using Node.js with the async and request modules to crawl 100+ million websites, and I keep running into ESOCKETTIMEDOUT and ETIMEDOUT errors after a few minutes.
重新启动脚本后它又能正常工作。这似乎不是连接数限制的问题,因为我仍然可以毫无延迟地执行 resolve4、resolveNs、resolveMx 以及 curl。
It works again after I restart the script. It doesn't seem to be connection limit issue because I can still do resolve4, resolveNs, resolveMx and also curl
without delay.
你看到代码有什么问题吗?或任何建议?我想将async.queue()并发推高到至少1000.谢谢。
Do you see any issue with the code? or any advice? I'd like to push up the async.queue() concurrency to at least a 1000. Thank you.
var request = require('request'),
async = require('async'),
mysql = require('mysql'),
dns = require('dns'),
url = require('url'),
cheerio = require('cheerio'),
iconv = require('iconv-lite'),
charset = require('charset'),
config = require('./spy.config'),
pool = mysql.createPool(config.db);
iconv.skipDecodeWarning = true;
// One agent pool shared by every outgoing request. `request` keeps the agents
// it creates on the object passed as `pool`, so this must be a single
// long-lived object; a fresh literal per call would give each request its own
// agent. The maxSockets value is carried over from the original options.
var requestPool = { maxSockets: 1000 };

/**
 * Crawl queue: processes up to 200 tasks concurrently.
 *
 * Each task is an object `{ id, domain }`. The worker first checks that
 * `www.<domain>` resolves, then fetches `http://www.<domain>`. When the fetch
 * fails it collects the NS, A and MX records of the bare domain.
 *
 * The task callback is invoked exactly once, and only after all follow-up
 * work for the task has finished, so the queue's concurrency limit reflects
 * the work that is really in flight.
 */
var queue = async.queue(function (task, cb) {
  // Report completion on a later tick rather than from inside the I/O
  // callback that finished the task.
  function done() {
    setImmediate(function () {
      cb();
    });
  }

  dns.resolve4('www.' + task.domain, function (err) {
    if (err) {
      //
      // Do something
      //
      return done();
    }

    request({
      url: 'http://www.' + task.domain,
      method: 'GET',
      encoding: 'binary',
      followRedirect: true,
      pool: requestPool,
      timeout: 15000 // 15 sec
    }, function (error, response, body) {
      if (!error) {
        // If ok, do something
        return done();
      }

      // If not ok, do these
      // Failures seen here include:
      // { [Error: ETIMEDOUT] code: 'ETIMEDOUT' }
      // { [Error: ESOCKETTIMEDOUT] code: 'ESOCKETTIMEDOUT' }
      console.log(error);

      var ns = [],
        ip = [],
        mx = [];

      // The three lookups are independent; each one ignores its own failure
      // and leaves the matching array empty.
      async.parallel([
        function (callback) {
          // Resolves the domain's name server records
          dns.resolveNs(task.domain, function (err, addresses) {
            if (!err) {
              ns = addresses;
            }
            callback();
          });
        },
        function (callback) {
          // Resolves the domain's IPV4 addresses
          dns.resolve4(task.domain, function (err, addresses) {
            if (!err) {
              ip = addresses;
            }
            callback();
          });
        },
        function (callback) {
          // Resolves the domain's MX records
          dns.resolveMx(task.domain, function (err, addresses) {
            if (!err) {
              addresses.forEach(function (a) {
                mx.push(a.exchange);
              });
            }
            callback();
          });
        }
      ], function (err) {
        // None of the lookups above forwards an error, so this is purely
        // defensive.
        if (err) {
          console.log(err);
        }
        // do something
        done();
      });
    });
  });
}, 200);
// Once the queue reports that it has drained, decide on a later tick whether
// another batch has to be loaded.
queue.drain = function onQueueDrained() {
  setImmediate(checkDone);
};
/**
 * Debug logging hook. Currently a no-op: the console.info call is switched off.
 *
 * @param {*} msg - Value that would be logged if the hook were enabled.
 * @returns {undefined}
 */
function consoleLog(msg) {
  // Logging disabled; restore `console.info(msg);` here when debugging.
}
/**
 * Schedules the next batch load when the queue has nothing left waiting.
 *
 * If queue.length() is not zero, nothing is scheduled and a diagnostic line
 * is printed instead.
 */
function checkDone() {
  if (queue.length() !== 0) {
    console.log("checkDone() not zero");
    return;
  }

  // Load the next batch on a later tick instead of inside the caller's stack.
  setImmediate(function () {
    crawlQueue();
  });
}
/**
 * Runs a fire-and-forget SQL statement on a connection from the pool.
 *
 * The result set is discarded. Failures to obtain a connection or to run the
 * statement are logged instead of being dropped silently; nothing is reported
 * back to the caller.
 *
 * @param {string} sql - Statement handed unchanged to connection.query().
 */
function query(sql) {
  pool.getConnection(function (err, connection) {
    if (err) {
      console.log(err);
      return;
    }

    connection.query(sql, function (queryErr) {
      // Always hand the connection back, whether or not the statement worked.
      connection.release();
      if (queryErr) {
        console.log(queryErr);
      }
    });
  });
}
/**
 * Loads the next batch of stale domains and pushes them onto the crawl queue.
 *
 * Selects up to 500 rows from `domain` whose `last_update` is more than
 * 2592000 seconds (30 days) old and pushes one `{ id, domain }` task per row.
 * When the query returns no rows the process exits. On a connection or query
 * error the error is logged and the load is retried after a short delay.
 */
function crawlQueue() {
  // Delay between retries so a database outage does not turn into a busy loop.
  var retryDelayMs = 1000;

  function retryLater(err) {
    console.log(err);
    setTimeout(crawlQueue, retryDelayMs);
  }

  pool.getConnection(function (err, connection) {
    if (err) {
      return retryLater(err);
    }

    var sql = "SELECT * FROM domain WHERE last_update < (UNIX_TIMESTAMP() - 2592000) LIMIT 500";
    connection.query(sql, function (err, results) {
      // The rows are fully buffered at this point, so the connection can go
      // back to the pool before they are processed.
      connection.release();

      if (err) {
        return retryLater(err);
      }

      if (!results.length) {
        // Nothing left to crawl.
        process.exit();
      } else {
        results.forEach(function (row) {
          queue.push({ "id": row.id, "domain": row.domain });
        });
      }
    });
  });
}
// Kick off the first batch load on the next pass of the event loop.
setImmediate(crawlQueue);
并且系统限制相当高。
Limit Soft Limit Hard Limit Units
Max cpu time unlimited unlimited seconds
Max file size unlimited unlimited bytes
Max data size unlimited unlimited bytes
Max stack size 8388608 unlimited bytes
Max core file size 0 unlimited bytes
Max resident set unlimited unlimited bytes
Max processes 257645 257645 processes
Max open files 500000 500000 files
Max locked memory 65536 65536 bytes
Max address space unlimited unlimited bytes
Max file locks unlimited unlimited locks
Max pending signals 257645 257645 signals
Max msgqueue size 819200 819200 bytes
Max nice priority 0 0
Max realtime priority 0 0
Max realtime timeout unlimited unlimited us
sysctl
net.ipv4.ip_local_port_range = 10000 61000
推荐答案
默认情况下,Node 有 4 个工作线程来解析 DNS 查询。如果您的 DNS 查询需要很长时间,请求将阻塞在 DNS 阶段,其症状正是 ESOCKETTIMEDOUT 或 ETIMEDOUT。
By default, Node has 4 worker threads (the libuv thread pool) to resolve DNS queries made through dns.lookup(), which is what HTTP requests use. If your DNS queries take a long time, requests will block in the DNS phase, and the symptom is exactly ESOCKETTIMEDOUT or ETIMEDOUT.
尝试增加你的uv线程池大小:
Try increasing your uv thread pool size:
# Export the variable so that the node process started below inherits it.
export UV_THREADPOOL_SIZE=128
node ...
或者在 index.js(或您的入口文件)中:
or in index.js
(or wherever your entry point is):
#!/usr/bin/env node
// Environment variables are strings, so assign one explicitly instead of
// relying on the implicit conversion of a number.
// NOTE(review): this presumably has to run before anything uses the thread
// pool for the first time — keep it at the very top of the entry point.
process.env.UV_THREADPOOL_SIZE = '128';

/**
 * Application entry point (placeholder).
 */
function main() {
  // ... application code goes here ...
}
编辑:。
这篇关于Node.js GET请求ETIMEDOUT& ESOCKETTIMEDOUT的文章就介绍到这了,希望我们推荐的答案对大家有所帮助,也希望大家多多支持!