Skip to content

Commit e9135ac

Browse files
committed
Replace replicator open_doc_revs kaboom with something more descriptive
Use something more descriptive, especially since it can bubble up to the user level when the replication job crashes. In order to also log the error causing the failure, move the exit call to the main open_doc_revs body and remove the `retries=0` clause. However, avoid changing the exit value shape and keep it as an atom for now. There is a chance it might break error handling in the upper API layers in case we match on the error reason "shape". To test everything end-to-end, modify the test setup to create replication jobs with additional options, when previously the only choice was between continuous or one-shot.
1 parent 407e618 commit e9135ac

File tree

2 files changed

+65
-40
lines changed

2 files changed

+65
-40
lines changed

src/couch_replicator/src/couch_replicator_api_wrap.erl

Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -277,14 +277,6 @@ bulk_get_zip({Id, Rev, _}, {[_ | _] = Props}) ->
277277
end.
278278

279279
-spec open_doc_revs(#httpdb{}, binary(), list(), list(), function(), any()) -> no_return().
280-
open_doc_revs(#httpdb{retries = 0} = HttpDb, Id, Revs, Options, _Fun, _Acc) ->
281-
Path = encode_doc_id(Id),
282-
QS = options_to_query_args(HttpDb, Path, [revs, {open_revs, Revs} | Options]),
283-
Url = couch_util:url_strip_password(
284-
couch_replicator_httpc:full_url(HttpDb, [{path, Path}, {qs, QS}])
285-
),
286-
couch_log:error("Replication crashing because GET ~s failed", [Url]),
287-
exit(kaboom);
288280
open_doc_revs(#httpdb{} = HttpDb, Id, Revs, Options, Fun, Acc) ->
289281
Path = encode_doc_id(Id),
290282
QS = options_to_query_args(HttpDb, Path, [revs, {open_revs, Revs} | Options]),
@@ -369,17 +361,20 @@ open_doc_revs(#httpdb{} = HttpDb, Id, Revs, Options, Fun, Acc) ->
369361
couch_replicator_httpc:full_url(HttpDb, [{path, Path}, {qs, QS}])
370362
),
371363
#httpdb{retries = Retries, wait = Wait0} = HttpDb,
372-
Wait = 2 * erlang:min(Wait0 * 2, ?MAX_WAIT),
373-
couch_log:notice(
374-
"Retrying GET to ~s in ~p seconds due to error ~w",
375-
[Url, Wait / 1000, error_reason(Else)]
376-
),
377-
ok = timer:sleep(Wait),
378-
RetryDb = HttpDb#httpdb{
379-
retries = Retries - 1,
380-
wait = Wait
381-
},
382-
open_doc_revs(RetryDb, Id, Revs, Options, Fun, Acc)
364+
NewRetries = Retries - 1,
365+
case NewRetries > 0 of
366+
true ->
367+
Wait = 2 * erlang:min(Wait0 * 2, ?MAX_WAIT),
368+
LogRetryMsg = "Retrying GET to ~s in ~p seconds due to error ~w",
369+
couch_log:notice(LogRetryMsg, [Url, Wait / 1000, error_reason(Else)]),
370+
ok = timer:sleep(Wait),
371+
RetryDb = HttpDb#httpdb{retries = NewRetries, wait = Wait},
372+
open_doc_revs(RetryDb, Id, Revs, Options, Fun, Acc);
373+
false ->
374+
LogFailMsg = "Replication crashing because GET ~s failed. Error ~p",
375+
couch_log:error(LogFailMsg, [Url, error_reason(Else)]),
376+
exit(open_doc_revs_failed)
377+
end
383378
end.
384379

385380
error_reason({http_request_failed, "GET", _Url, {error, timeout}}) ->

src/couch_replicator/test/eunit/couch_replicator_error_reporting_tests.erl

Lines changed: 51 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ error_reporting_test_() ->
3434
?TDEF_FE(t_skip_doc_put_invalid_attachment_name),
3535
?TDEF_FE(t_fail_revs_diff),
3636
?TDEF_FE(t_fail_bulk_get, 15),
37+
?TDEF_FE(t_fail_open_docs_get, 15),
3738
?TDEF_FE(t_fail_changes_queue),
3839
?TDEF_FE(t_fail_changes_manager),
3940
?TDEF_FE(t_fail_changes_reader_proc),
@@ -120,7 +121,7 @@ t_skip_doc_put_401_errors({_Ctx, {Source, Target}}) ->
120121
populate_db(Source, 6, 6, _WithAttachments = true),
121122
ErrBody = [<<"{\"error\":\"unauthorized\", \"reason\":\"vdu\"}">>],
122123
mock_fail_req(put, "/6", {ok, "401", [], ErrBody}),
123-
{ok, RepId} = replicate(Source, Target, false),
124+
{ok, RepId} = replicate(Source, Target, #{continuous => false}),
124125
{ok, Listener} = rep_result_listener(RepId),
125126
Res = wait_rep_result(RepId),
126127
% Replication job should succeed
@@ -140,7 +141,7 @@ t_skip_doc_put_403_errors({_Ctx, {Source, Target}}) ->
140141
populate_db(Source, 6, 6, _WithAttachments = true),
141142
ErrBody = [<<"{\"error\":\"forbidden\", \"reason\":\"vdu\"}">>],
142143
mock_fail_req(put, "/6", {ok, "403", [], ErrBody}),
143-
{ok, RepId} = replicate(Source, Target, false),
144+
{ok, RepId} = replicate(Source, Target, #{continuous => false}),
144145
{ok, Listener} = rep_result_listener(RepId),
145146
Res = wait_rep_result(RepId),
146147
% Replication job should succeed
@@ -160,7 +161,7 @@ t_skip_doc_put_413_errors({_Ctx, {Source, Target}}) ->
160161
populate_db(Source, 6, 6, _WithAttachments = true),
161162
ErrBody = [<<"{\"error\":\"too_large\", \"reason\":\"too_large\"}">>],
162163
mock_fail_req(put, "/6", {ok, "413", [], ErrBody}),
163-
{ok, RepId} = replicate(Source, Target, false),
164+
{ok, RepId} = replicate(Source, Target, #{continuous => false}),
164165
{ok, Listener} = rep_result_listener(RepId),
165166
Res = wait_rep_result(RepId),
166167
% Replication job should succeed
@@ -180,7 +181,7 @@ t_skip_doc_put_415_errors({_Ctx, {Source, Target}}) ->
180181
populate_db(Source, 6, 6, _WithAttachments = true),
181182
ErrBody = [<<"{\"error\":\"unsupported_media_type\", \"reason\":\"bad_media\"}">>],
182183
mock_fail_req(put, "/6", {ok, "415", [], ErrBody}),
183-
{ok, RepId} = replicate(Source, Target, false),
184+
{ok, RepId} = replicate(Source, Target, #{continuous => false}),
184185
{ok, Listener} = rep_result_listener(RepId),
185186
Res = wait_rep_result(RepId),
186187
% Replication job should succeed
@@ -202,7 +203,7 @@ t_skip_doc_put_invalid_attachment_name({_Ctx, {Source, Target}}) ->
202203
<<"{\"error\":\"bad_request\", \"reason\":\"Attachment name '_foo' starts with prohibited character '_'\"}">>
203204
],
204205
mock_fail_req(put, "/6", {ok, "400", [], ErrBody}),
205-
{ok, RepId} = replicate(Source, Target, false),
206+
{ok, RepId} = replicate(Source, Target, #{continuous => false}),
206207
{ok, Listener} = rep_result_listener(RepId),
207208
Res = wait_rep_result(RepId),
208209
% Replication job should succeed
@@ -256,6 +257,33 @@ t_fail_bulk_get({_Ctx, {Source, Target}}) ->
256257
% Check that there was a falback to a plain GET
257258
?assertEqual(1, meck:num_calls(couch_replicator_api_wrap, open_doc_revs, 6)).
258259

260+
t_fail_open_docs_get({_Ctx, {Source, Target}}) ->
261+
populate_db(Source, 1, 5),
262+
Opts = #{
263+
% We're testing the case of individual doc rev GETs
264+
use_bulk_get => false,
265+
% Perform at least one retry before giving up (for extra coverage)
266+
retries_per_request => 2
267+
},
268+
{ok, RepId} = replicate(Source, Target, Opts),
269+
wait_target_in_sync(Source, Target),
270+
271+
{ok, Listener} = rep_result_listener(RepId),
272+
% Break open_doc_revs on the server side and see what happens
273+
meck:new(fabric_doc_open_revs, [passthrough]),
274+
meck:expect(fabric_doc_open_revs, go, fun
275+
(Src, <<"6">>, _, _) when Src =:= Source ->
276+
% This is a random error, no particular reason for a 404
277+
meck:exception(throw, not_found);
278+
(ArgDb, ArgDocId, ArgRevs, ArgOpts) ->
279+
meck:passthrough([ArgDb, ArgDocId, ArgRevs, ArgOpts])
280+
end),
281+
populate_db(Source, 6, 6),
282+
{error, Result} = wait_rep_result(RepId),
283+
?assertMatch({worker_died, _, {process_died, _, open_doc_revs_failed}}, Result),
284+
?assert(meck:num_calls(fabric_doc_open_revs, go, 4) >= 2),
285+
couch_replicator_notifier:stop(Listener).
286+
259287
t_fail_changes_queue({_Ctx, {Source, Target}}) ->
260288
populate_db(Source, 1, 5),
261289
{ok, RepId} = replicate(Source, Target),
@@ -311,7 +339,7 @@ t_dont_start_duplicate_job({_Ctx, {Source, Target}}) ->
311339
meck:new(couch_replicator_pg, [passthrough]),
312340
Pid = pid_from_another_node(),
313341
meck:expect(couch_replicator_pg, should_start, fun(_, _) -> {no, Pid} end),
314-
Rep = make_rep(Source, Target, true),
342+
Rep = make_rep(Source, Target, #{continuous => true}),
315343
ExpectErr = {error, {already_started, Pid}},
316344
?assertEqual(ExpectErr, couch_replicator_scheduler_job:start_link(Rep)).
317345

@@ -444,26 +472,28 @@ wait_target_in_sync_loop(DocCount, TargetName, RetriesLeft) ->
444472
end.
445473

446474
replicate(Source, Target) ->
447-
replicate(Source, Target, true).
475+
replicate(Source, Target, #{}).
448476

449-
replicate(Source, Target, Continuous) ->
450-
Rep = make_rep(Source, Target, Continuous),
477+
replicate(Source, Target, #{} = Opts) ->
478+
Rep = make_rep(Source, Target, Opts),
451479
ok = couch_replicator_scheduler:add_job(Rep),
452480
couch_replicator_scheduler:reschedule(),
453481
{ok, Rep#rep.id}.
454482

455-
make_rep(Source, Target, Continuous) ->
456-
RepObject =
457-
{[
458-
{<<"source">>, url(Source)},
459-
{<<"target">>, url(Target)},
460-
{<<"continuous">>, Continuous},
461-
{<<"worker_processes">>, 1},
462-
{<<"retries_per_request">>, 1},
463-
% Low connection timeout so _changes feed gets restarted quicker
464-
{<<"connection_timeout">>, 3000}
465-
]},
466-
{ok, Rep} = couch_replicator_parse:parse_rep_doc(RepObject, ?ADMIN_USER),
483+
make_rep(Source, Target, #{} = OverrideOpts) ->
484+
Opts0 = #{
485+
source => url(Source),
486+
target => url(Target),
487+
continuous => true,
488+
worker_processes => 1,
489+
retries_per_request => 1,
490+
% Low connection timeout so _changes feed gets restarted quicker
491+
connection_timeout => 3000
492+
},
493+
RepMap = maps:merge(Opts0, OverrideOpts),
494+
% parse_rep_doc accepts {[...]} ejson format
495+
RepEJson = couch_util:json_decode(couch_util:json_encode(RepMap)),
496+
{ok, Rep} = couch_replicator_parse:parse_rep_doc(RepEJson, ?ADMIN_USER),
467497
Rep.
468498

469499
url(DbName) ->

0 commit comments

Comments
 (0)