From 1d0a45b12ce7861b289323e8eecbae50a922bf81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Wed, 27 Nov 2024 13:31:37 +0100 Subject: [PATCH 1/2] rabbit_db: Fix `rabbit_db_msup:update_all/2` spec [Why] It can return an error. (cherry picked from commit 913bd9fa4274ba537cfe7fd6188f2c9aec3cec20) --- deps/rabbit/src/rabbit_db_msup.erl | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/deps/rabbit/src/rabbit_db_msup.erl b/deps/rabbit/src/rabbit_db_msup.erl index a4bbb68d7e40..fd5d31b369ff 100644 --- a/deps/rabbit/src/rabbit_db_msup.erl +++ b/deps/rabbit/src/rabbit_db_msup.erl @@ -226,9 +226,10 @@ find_mirror_in_khepri(Group, Id) -> %% update_all(). %% ------------------------------------------------------------------- --spec update_all(Overall, Overall) -> [ChildSpec] when +-spec update_all(Overall, Overall) -> [ChildSpec] | Error when Overall :: pid(), - ChildSpec :: supervisor2:child_spec(). + ChildSpec :: supervisor2:child_spec(), + Error :: {error, any()}. update_all(Overall, OldOverall) -> rabbit_khepri:handle_fallback( From 5a329746a8ac48df586e94e1109ae5dba19e621e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jean-S=C3=A9bastien=20P=C3=A9dron?= Date: Wed, 27 Nov 2024 13:32:29 +0100 Subject: [PATCH 2/2] mirrored_supervisor: Catch timeout from Khepri in `hanlde_info/2` [Why] The code assumed that the transaction would always succeed. It was kind of the case with Mnesia because it would throw an exception if it failed. Khepri returns an error instead. The code has to handle it. In particular, we see timeouts in CI and before this patch, they caused a crash because the list comprehension was asked to work on a tuple. [How] We now retry a few times for 10 seconds. (cherry picked from commit 4621fe7730889168b133029a02a7da1a2b50aa6f) --- deps/rabbit/src/mirrored_supervisor.erl | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/deps/rabbit/src/mirrored_supervisor.erl b/deps/rabbit/src/mirrored_supervisor.erl index 1aa33413cbf5..357186dc9339 100644 --- a/deps/rabbit/src/mirrored_supervisor.erl +++ b/deps/rabbit/src/mirrored_supervisor.erl @@ -345,7 +345,7 @@ handle_info({'DOWN', _Ref, process, Pid, _Reason}, child_order = ChildOrder}) -> %% No guarantee pg will have received the DOWN before us. R = case lists:sort(pg:get_members(Group)) -- [Pid] of - [O | _] -> ChildSpecs = update_all(O, Pid), + [O | _] -> ChildSpecs = retry_update_all(O, Pid), [start(Delegate, ChildSpec) || ChildSpec <- restore_child_order(ChildSpecs, ChildOrder)]; @@ -428,6 +428,22 @@ check_stop(Group, Delegate, Id) -> id({Id, _, _, _, _, _}) -> Id. +retry_update_all(O, Pid) -> + retry_update_all(O, Pid, 10000). + +retry_update_all(O, Pid, TimeLeft) when TimeLeft > 0 -> + case update_all(O, Pid) of + List when is_list(List) -> + List; + {error, timeout} -> + Sleep = 200, + TimeLeft1 = TimeLeft - Sleep, + timer:sleep(Sleep), + retry_update_all(O, Pid, TimeLeft1) + end; +retry_update_all(O, Pid, _TimeLeft) -> + update_all(O, Pid). + update_all(Overall, OldOverall) -> rabbit_db_msup:update_all(Overall, OldOverall).