Skip to content

Commit 4621fe7

Browse files
committed
mirrored_supervisor: Catch timeout from Khepri in handle_info/2
[Why] The code assumed that the transaction would always succeed. It was kind of the case with Mnesia because it would throw an exception if it failed. Khepri returns an error instead. The code has to handle it. In particular, we see timeouts in CI and before this patch, they caused a crash because the list comprehension was asked to work on a tuple. [How] We now retry a few times for 10 seconds.
1 parent 913bd9f commit 4621fe7

File tree

1 file changed

+17
-1
lines changed

1 file changed

+17
-1
lines changed

deps/rabbit/src/mirrored_supervisor.erl

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -345,7 +345,7 @@ handle_info({'DOWN', _Ref, process, Pid, _Reason},
345345
child_order = ChildOrder}) ->
346346
%% No guarantee pg will have received the DOWN before us.
347347
R = case lists:sort(pg:get_members(Group)) -- [Pid] of
348-
[O | _] -> ChildSpecs = update_all(O, Pid),
348+
[O | _] -> ChildSpecs = retry_update_all(O, Pid),
349349
[start(Delegate, ChildSpec)
350350
|| ChildSpec <- restore_child_order(ChildSpecs,
351351
ChildOrder)];
@@ -428,6 +428,22 @@ check_stop(Group, Delegate, Id) ->
428428

429429
%% Extracts the first element of a six-element tuple. Anything that is not a
%% six-element tuple fails with function_clause.
%% NOTE(review): the placeholder names assume the argument is an OTP
%% supervisor child spec tuple -- confirm against the callers.
id({ChildId, _StartFunc, _Restart, _Shutdown, _Type, _Modules}) ->
    ChildId.
430430

431+
%% Calls update_all/2 and retries while it returns `{error, timeout}'.
%%
%% Returns the list produced by update_all/2. Callers iterate over the
%% result, so this function never hands an error tuple back: if the retry
%% budget is exhausted and the final attempt still fails, it raises
%% `{update_all_failed, O, Pid, Error}' instead.
retry_update_all(O, Pid) ->
    %% Retry budget in milliseconds.
    retry_update_all(O, Pid, 10000).

%% TimeLeft is decremented only by the sleep interval, not by the time spent
%% inside update_all/2, so the wall-clock duration can exceed the budget.
%% With a 10000 ms budget and 200 ms pauses this makes up to 50 attempts
%% here, plus one final attempt in the last clause.
retry_update_all(O, Pid, TimeLeft) when TimeLeft > 0 ->
    case update_all(O, Pid) of
        List when is_list(List) ->
            List;
        {error, timeout} ->
            Sleep = 200,
            timer:sleep(Sleep),
            retry_update_all(O, Pid, TimeLeft - Sleep)
    end;
retry_update_all(O, Pid, _TimeLeft) ->
    %% Budget exhausted: make one last attempt. Fail loudly with a
    %% descriptive reason rather than returning an error tuple that the
    %% caller would try to treat as a list.
    case update_all(O, Pid) of
        List when is_list(List) ->
            List;
        {error, _} = Error ->
            error({update_all_failed, O, Pid, Error})
    end.
446+
431447
%% Thin wrapper: forwards both arguments, unchanged and in the same order, to
%% rabbit_db_msup:update_all/2 and returns its result as-is.
update_all(NewOverall, PrevOverall) ->
    rabbit_db_msup:update_all(NewOverall, PrevOverall).
433449

0 commit comments

Comments
 (0)