I'm writing a simple HTTP crawler, but I'm having trouble with the code at the bottom. I request 50 URLs but only get 20+ responses back. I generated several files, each 150KB in size, to test the crawler. So I assume the 20+ responses are limited by bandwidth? BUT: how do I tell the Erlang snippet not to quit until the last file has been fetched? The test data server is online, so please try the code, and any hints are welcome :)

-module(crawler).
-define(BASE_URL, "http://46.4.117.69/").
-export([start/0, send_reqs/0, do_send_req/1]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  lists:foreach(fun do_spawn/1, Ids).

do_spawn(Id) ->
  proc_lib:spawn_link(?MODULE, do_send_req, [Id]).

do_send_req(Id) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
    {ok, Status, _H, B} ->
      io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]);
    Err ->
      io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err])
  end.

This is the output:
Requesting ID 1 ...
Requesting ID 2 ...
Requesting ID 3 ...
Requesting ID 4 ...
Requesting ID 5 ...
Requesting ID 6 ...
Requesting ID 7 ...
Requesting ID 8 ...
Requesting ID 9 ...
Requesting ID 10 ...
Requesting ID 11 ...
Requesting ID 12 ...
Requesting ID 13 ...
Requesting ID 14 ...
Requesting ID 15 ...
Requesting ID 16 ...
Requesting ID 17 ...
Requesting ID 18 ...
Requesting ID 19 ...
Requesting ID 20 ...
Requesting ID 21 ...
Requesting ID 22 ...
Requesting ID 23 ...
Requesting ID 24 ...
Requesting ID 25 ...
Requesting ID 26 ...
Requesting ID 27 ...
Requesting ID 28 ...
Requesting ID 29 ...
Requesting ID 30 ...
Requesting ID 31 ...
Requesting ID 32 ...
Requesting ID 33 ...
Requesting ID 34 ...
Requesting ID 35 ...
Requesting ID 36 ...
Requesting ID 37 ...
Requesting ID 38 ...
Requesting ID 39 ...
Requesting ID 40 ...
Requesting ID 41 ...
Requesting ID 42 ...
Requesting ID 43 ...
Requesting ID 44 ...
Requesting ID 45 ...
Requesting ID 46 ...
Requesting ID 47 ...
Requesting ID 48 ...
Requesting ID 49 ...
Requesting ID 50 ...
OK -- ID: 49 -- Status: "200" -- Content length: 150000
OK -- ID: 47 -- Status: "200" -- Content length: 150000
OK -- ID: 50 -- Status: "200" -- Content length: 150000
OK -- ID: 17 -- Status: "200" -- Content length: 150000
OK -- ID: 48 -- Status: "200" -- Content length: 150000
OK -- ID: 45 -- Status: "200" -- Content length: 150000
OK -- ID: 46 -- Status: "200" -- Content length: 150000
OK -- ID: 10 -- Status: "200" -- Content length: 150000
OK -- ID: 09 -- Status: "200" -- Content length: 150000
OK -- ID: 19 -- Status: "200" -- Content length: 150000
OK -- ID: 13 -- Status: "200" -- Content length: 150000
OK -- ID: 21 -- Status: "200" -- Content length: 150000
OK -- ID: 16 -- Status: "200" -- Content length: 150000
OK -- ID: 27 -- Status: "200" -- Content length: 150000
OK -- ID: 03 -- Status: "200" -- Content length: 150000
OK -- ID: 23 -- Status: "200" -- Content length: 150000
OK -- ID: 29 -- Status: "200" -- Content length: 150000
OK -- ID: 14 -- Status: "200" -- Content length: 150000
OK -- ID: 18 -- Status: "200" -- Content length: 150000
OK -- ID: 01 -- Status: "200" -- Content length: 150000
OK -- ID: 30 -- Status: "200" -- Content length: 150000
OK -- ID: 40 -- Status: "200" -- Content length: 150000
OK -- ID: 05 -- Status: "200" -- Content length: 150000
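
A plausible explanation for the truncated output, assuming the snippet is run as a one-shot script rather than from a long-lived shell: start/0 returns as soon as proc_lib:spawn/3 has created the sender process, so whatever launched the VM is free to shut it down while most of the 50 workers are still downloading. For example - the flags below are purely illustrative:

%% hypothetical one-shot invocation:
%% $ erl -noshell -s crawler start -s init stop
%% init:stop/0 does not wait for the spawned workers, so the VM can
%% halt after only some of the responses have been printed.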

UPDATE:
Thanks to Stem for the hint. I combined your code with mine:
-module(crawler).
-define(BASE_URL, "http://46.4.117.69/").
-export([start/0, send_reqs/0, do_send_req/2]).

start() ->
  ibrowse:start(),
  proc_lib:spawn(?MODULE, send_reqs, []).

to_url(Id) ->
  ?BASE_URL ++ integer_to_list(Id).

fetch_ids() ->
  lists:seq(1, 50).

send_reqs() ->
  spawn_workers(fetch_ids()).

spawn_workers(Ids) ->
  %% collect reference to each worker
  Refs = [ do_spawn(Id) || Id <- Ids ],
  %% wait for response from each worker
  wait_workers(Refs).

wait_workers(Refs) ->
  lists:foreach(fun receive_by_ref/1, Refs).

receive_by_ref(Ref) ->
  %% receive message only from worker with specific reference
  receive
    {Ref, done} ->
      done
  end.

do_spawn(Id) ->
  Ref = make_ref(),
  proc_lib:spawn_link(?MODULE, do_send_req, [Id, {self(), Ref}]),
  Ref.

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
    {ok, Status, _H, B} ->
      io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
      %% send message that work is done
      Pid ! {Ref, done};
    Err ->
      io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
      %% repeat request if there was error while fetching a page,
      do_send_req(Id, {Pid, Ref})
      %% or - if you don't want to repeat request, put there:
      %% Pid ! {Ref, done}
  end.

Running the crawler works fine for a bunch of files, but then the code fails to fetch some files in their entirety (each file is 150000 bytes) - the crawler fetches some files only partially, see the following web server log :(
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /10 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /1 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /3 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /8 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /39 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /7 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /6 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /2 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /5 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /50 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /9 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /44 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /38 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /47 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /49 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /43 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /37 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /46 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /48 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:00 +0200] "GET /36 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /42 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /41 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /45 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /17 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /35 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /16 HTTP/1.1" 200 150000 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /15 HTTP/1.1" 200 17020 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /21 HTTP/1.1" 200 120360 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /40 HTTP/1.1" 200 117600 "-" "-"
82.114.62.14 - - [13/Sep/2012:15:17:01 +0200] "GET /34 HTTP/1.1" 200 60660 "-" "-"

Any hints are welcome. I have no clue what's going wrong there :(
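
One hedged guess about the partial transfers: each call passes a 10000 ms timeout to ibrowse:send_req/6, and with 50 concurrent downloads of 150000 bytes competing for the same link, some of them may simply not finish within 10 seconds. ibrowse then gives up mid-transfer (the server logs however many bytes it managed to send) and the Err branch retries from scratch. A minimal sketch of two mitigations - batching the workers so only a few requests are in flight at once, and raising the timeout - where ?CHUNK and the 60000 ms value are arbitrary placeholders:

-define(CHUNK, 10).

%% drop-in replacement for spawn_workers/1 from the update above:
%% process the ids in batches of at most ?CHUNK, waiting for each
%% batch to finish before starting the next one
spawn_workers(Ids) ->
  in_chunks(Ids).

in_chunks([]) ->
  ok;
in_chunks(Ids) ->
  {Chunk, Rest} = lists:split(erlang:min(?CHUNK, length(Ids)), Ids),
  Refs = [ do_spawn(Id) || Id <- Chunk ],
  wait_workers(Refs),
  in_chunks(Rest).

%% and in do_send_req/2, a more generous timeout, e.g.:
%% Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 60000)),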

Best Answer

So, if I understand correctly - you don't want to return control from the function spawn_workers until every worker has finished (and fetched a page)? If so, you can change your code like this:

spawn_workers(Ids) ->
  %% collect reference to each worker
  Refs = [ do_spawn(Id) || Id <- Ids ],
  %% wait for response from each worker
  wait_workers(Refs).

wait_workers(Refs) ->
  lists:foreach(fun receive_by_ref/1, Refs).

receive_by_ref(Ref) ->
  %% receive message only from worker with specific reference
  receive
    {Ref, done} ->
        done
  end.

do_spawn(Id) ->
  Ref = make_ref(),
  proc_lib:spawn_link(?MODULE, do_send_req, [Id, {self(), Ref}]),
  Ref.

do_send_req(Id, {Pid, Ref}) ->
  io:format("Requesting ID ~p ... ~n", [Id]),
  Result = (catch ibrowse:send_req(to_url(Id), [], get, [], [], 10000)),
  case Result of
      {ok, Status, _H, B} ->
        io:format("OK -- ID: ~2..0w -- Status: ~p -- Content length: ~p~n", [Id, Status, length(B)]),
        %% send message that work is done
        Pid ! {Ref, done};
      Err ->
        io:format("ERROR -- ID: ~p -- Error: ~p~n", [Id, Err]),
        %% repeat request if there was error while fetching a page,
        do_send_req(Id, {Pid, Ref})
        %% or - if you don't want to repeat request, put there:
        %% Pid ! {Ref, done}
  end.
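
One caveat worth noting about this pattern: receive_by_ref/1 blocks forever if a worker dies before sending its {Ref, done} message (and since the workers are spawn_linked, a crash would also take the waiting process down with it). A variant sketch using erlang:spawn_monitor/3 instead, which delivers a 'DOWN' message no matter how the worker terminates - this assumes the original do_send_req/1 (the version that does not report back) is exported:

do_spawn(Id) ->
  %% spawn_monitor returns {Pid, MonitorRef}; the monitor guarantees
  %% exactly one 'DOWN' message per worker, whether it exits normally,
  %% crashes, or is killed
  {_Pid, Ref} = erlang:spawn_monitor(?MODULE, do_send_req, [Id]),
  Ref.

receive_by_ref(Ref) ->
  receive
    {'DOWN', Ref, process, _Pid, _Reason} ->
      done
  end.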

Edit:
I've noticed that your entry point (the function start) returns control without waiting for all workers to finish their tasks (since you call spawn there). If you want to wait there as well - just do the similar trick:
start() ->
  ibrowse:start(),
  Ref = make_ref(),
  proc_lib:spawn(?MODULE, send_reqs, [self(), Ref]),
  receive_by_ref(Ref).

send_reqs(Pid, Ref) ->
  spawn_workers(fetch_ids()),
  Pid ! {Ref, done}.
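
Note that send_reqs/2 now replaces send_reqs/0, so the export list has to be updated accordingly. With that change, crawler:start() only returns once every worker has reported back; a quick check from the shell might look like this (output abbreviated, the interleaving differs between runs):

1> c(crawler).
{ok,crawler}
2> crawler:start().
Requesting ID 1 ...
...
OK -- ID: 50 -- Status: "200" -- Content length: 150000
done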
