Skip to content

Commit 4bc4132

Browse files
committed
Fix supervisor restart when child fails to match OTP
Fixes a function clause exception that would crash the supervisor if a child fails to restart by matching the same behavior as OTP, and continue to try restarting the child until success, or maximum `intensity` is reached within the allowed `period`. Closed #1957 Signed-off-by: Winford <[email protected]>
1 parent 0a2f074 commit 4bc4132

File tree

3 files changed

+235
-6
lines changed

3 files changed

+235
-6
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ instead `badarg`.
9494
- Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
9595
- Added missing support for supervisor `one_for_all` strategy.
9696
- Supervisor now honors period and intensity options.
97+
- Fix supervisor crash if a `one_for_one` child fails to restart.
9798

9899
## [0.6.7] - Unreleased
99100

libs/estdlib/src/supervisor.erl

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -311,7 +311,28 @@ handle_info({restart_many_children, [#child{pid = Pid} = Child | Children]}, Sta
311311
{noreply, State#state{children = NewChildren},
312312
{timeout, 0, {restart_many_children, Children}}};
313313
{error, Reason} ->
314-
handle_child_exit(Pid, Reason, State)
314+
handle_child_exit(Pid, {restart, Reason}, State)
315+
end;
316+
handle_info({try_again_restart, Id}, State) ->
317+
case lists:keyfind(Id, #child.id, State#state.children) of
318+
false ->
319+
{noreply, State};
320+
Child ->
321+
case add_restart(State) of
322+
{ok, State1} ->
323+
case try_start(Child) of
324+
{ok, NewPid, _Result} ->
325+
UpdatedChildren = lists:keyreplace(
326+
Id, Child#child.id, State1#state.children, Child#child{pid = NewPid}
327+
),
328+
{noreply, State1#state{children = UpdatedChildren}};
329+
{error, {_, _}} ->
330+
{noreply, State1, {timeout, 0, {try_again_restart, Id}}}
331+
end;
332+
{shutdown, State1} ->
333+
RemainingChildren = lists:keydelete(Id, #child.id, State1#state.children),
334+
{stop, shutdown, State1#state{children = RemainingChildren}}
335+
end
315336
end;
316337
handle_info(_Msg, State) ->
317338
%TODO: log unexpected message
@@ -351,7 +372,7 @@ handle_child_exit(Pid, Reason, State) ->
351372
RemainingChildren = lists:keydelete(
352373
Pid, #child.pid, State1#state.children
353374
),
354-
{shutdown, State1#state{children = RemainingChildren}}
375+
{stop, shutdown, State1#state{children = RemainingChildren}}
355376
end;
356377
false ->
357378
Children = lists:keydelete(Pid, #child.pid, State#state.children),
@@ -368,7 +389,13 @@ handle_restart_strategy(
368389
Children = lists:keyreplace(
369390
Id, #child.id, State#state.children, NewChild
370391
),
371-
{noreply, State#state{children = Children}}
392+
{noreply, State#state{children = Children}};
393+
{error, _} ->
394+
NewChild = Child#child{pid = {restarting, Child#child.pid}},
395+
Children = lists:keyreplace(
396+
Id, #child.id, State#state.children, NewChild
397+
),
398+
{noreply, State#state{children = Children}, {timeout, 0, {try_again_restart, Id}}}
372399
end;
373400
handle_restart_strategy(
374401
#child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State

tests/libs/estdlib/test_supervisor.erl

Lines changed: 204 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ test() ->
3737
ok = test_count_children(),
3838
ok = test_one_for_all(),
3939
ok = test_crash_limits(),
40+
ok = try_again_restart(),
4041
ok.
4142

4243
test_basic_supervisor() ->
@@ -220,7 +221,51 @@ child_start({trap_exit, Parent}) ->
220221
ok -> ok
221222
end
222223
end),
223-
{ok, Pid}.
224+
{ok, Pid};
225+
child_start({get_permission, Arbitrator, Parent}) ->
226+
Arbitrator ! {can_start, self()},
227+
CanStart =
228+
receive
229+
{do_start, Start} -> Start
230+
after 2000 ->
231+
{timeout, arbitrator}
232+
end,
233+
case CanStart of
234+
true ->
235+
Pid = spawn_link(fun() ->
236+
receive
237+
stop -> exit(normal)
238+
end
239+
end),
240+
register(crashy, Pid),
241+
Parent ! Pid,
242+
{ok, Pid};
243+
false ->
244+
{error, start_denied};
245+
Error ->
246+
{error, Error}
247+
end.
248+
249+
arbitrator_start(Deny) when is_integer(Deny) ->
250+
receive
251+
{can_start, From} ->
252+
From ! {do_start, true}
253+
end,
254+
arbitrator(Deny).
255+
256+
arbitrator(Deny) ->
257+
Allow =
258+
if
259+
Deny =< 0 -> true;
260+
true -> false
261+
end,
262+
receive
263+
{can_start, From} ->
264+
From ! {do_start, Allow},
265+
arbitrator(Deny - 1);
266+
shutdown ->
267+
ok
268+
end.
224269

225270
test_ping_pong(SupPid) ->
226271
Pid1 = get_and_test_server(),
@@ -334,7 +379,39 @@ init({test_crash_limits, Intensity, Period, Parent}) ->
334379
modules => [ping_pong_server]
335380
}
336381
],
337-
{ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}.
382+
{ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}};
383+
init({test_try_again, Arbitrator, Parent}) ->
384+
ChildSpec = [
385+
#{
386+
id => finicky_child,
387+
start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]},
388+
restart => permanent,
389+
shutdown => brutal_kill,
390+
type => worker,
391+
modules => [?MODULE]
392+
}
393+
],
394+
{ok, {#{strategy => one_for_one, intensity => 5, period => 10}, ChildSpec}};
395+
init({test_retry_one_for_all, Arbitrator, Parent}) ->
396+
ChildSpec = [
397+
#{
398+
id => ping,
399+
start => {ping_pong_server, start_link, [Parent]},
400+
restart => permanent,
401+
shutdown => brutal_kill,
402+
type => worker,
403+
modules => [ping_pong_server]
404+
},
405+
#{
406+
id => crashy_child,
407+
start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]},
408+
restart => permanent,
409+
shutdown => brutal_kill,
410+
type => worker,
411+
modules => [?MODULE]
412+
}
413+
],
414+
{ok, {#{strategy => one_for_all, intensity => 5, period => 10}, ChildSpec}}.
338415

339416
test_supervisor_order() ->
340417
{ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -436,7 +513,7 @@ test_one_for_all() ->
436513
end
437514
end,
438515

439-
demonitor(MonRef),
516+
demonitor(MonRef, [flush]),
440517

441518
% Collect startup message from restarted transient ping_pong_server child
442519
_Restart_2 = get_and_test_server(),
@@ -521,3 +598,127 @@ get_ping_pong_pid() ->
521598
{ping_pong_server_ready, Pid} -> Pid
522599
after 2000 -> throw(timeout)
523600
end.
601+
602+
try_again_restart() ->
603+
process_flag(trap_exit, true),
604+
605+
%% Intensity is 5, use the arbitrator to prevent the child from restarting
606+
%% 4 times. This should not exit the supervisor due to intensity.
607+
Arbitrator1 = erlang:spawn(fun() -> arbitrator_start(4) end),
608+
{ok, SupPid1} = supervisor:start_link(
609+
{local, try_again_test1}, ?MODULE, {test_try_again, Arbitrator1, self()}
610+
),
611+
ChildPid = wait_child_pid("ChildPid"),
612+
613+
ChildPid ! stop,
614+
ChildPid1 = wait_child_pid("ChildPid1"),
615+
616+
ChildPid1 ! stop,
617+
Arbitrator1 ! shutdown,
618+
exit(SupPid1, normal),
619+
ok =
620+
receive
621+
{'EXIT', SupPid1, normal} ->
622+
ok
623+
after 2000 ->
624+
error({supervisor_not_stopped, normal})
625+
end,
626+
627+
%% Prevent 5 restart attempts allow on the 6th, this should cause the supervisor
628+
%% to shutdown on the 6th attempt, which happens before period expires and we are
629+
%% already at max restart intensity.
630+
Arbitrator2 = erlang:spawn(fun() -> arbitrator_start(5) end),
631+
{ok, SupPid2} = supervisor:start_link(
632+
{local, test_try_again2}, ?MODULE, {test_try_again, Arbitrator2, self()}
633+
),
634+
ChildPid2 = wait_child_pid("ChildPid2"),
635+
636+
ChildPid2 ! stop,
637+
ok =
638+
receive
639+
{'EXIT', SupPid2, shutdown} ->
640+
ok
641+
after 2000 ->
642+
error({supervisor_not_stopped, restart_try_again_exceeded})
643+
end,
644+
Arbitrator2 ! shutdown,
645+
646+
%% Test one_for_all
647+
%% child 2 uses arbitrator to deny 4 restart attempts, succeeding on the 5th.
648+
Arbitrator3 = erlang:spawn(fun() -> arbitrator_start(4) end),
649+
{ok, SupPid3} = supervisor:start_link(
650+
{local, try_again_test3}, ?MODULE, {test_retry_one_for_all, Arbitrator3, self()}
651+
),
652+
653+
{Ping1, _Crashy1} = wait_ping_server_and_child_pid(),
654+
655+
%% this will require 6 restarts (1 to restart ping + 4 denied attempts for
656+
%% crashy and succeed on the 5th)
657+
gen_server:call(Ping1, {stop, normal}),
658+
659+
%% Crashy will restart without error since the deny count was reached after
660+
%% first time it was stopped
661+
{Ping2, _Crashy2} = wait_ping_server_and_child_pid(),
662+
663+
%% this will surely exceed the limit
664+
%crashy ! stop,
665+
ok = gen_server:call(Ping2, {stop, normal}),
666+
667+
%% ping_pong_server has 2000ms timeout, we need to wait longer.
668+
ok =
669+
receive
670+
{'EXIT', SupPid3, shutdown} ->
671+
ok
672+
after 5000 ->
673+
error({supervisor_not_stopped, one_for_all_restarts_exceeded})
674+
end,
675+
Arbitrator3 ! shutdown,
676+
677+
process_flag(trap_exit, false),
678+
ok.
679+
680+
wait_child_pid(Name) ->
681+
receive
682+
Pid when is_pid(Pid) ->
683+
Pid;
684+
{'EXIT', _, shutdown} ->
685+
error({unexpected, supervisor_shutdown});
686+
{'EXIT', _, _} ->
687+
wait_child_pid(Name)
688+
after 1000 ->
689+
error({timeout, no_child_pid, Name})
690+
end.
691+
692+
%% In the case where we have one_for_all, process all `ping_pong_server_ready`
693+
%% messages until we get the crashy `pid()` message which means the crashy
694+
%% process eventually started. Last `ping_pong_server_ready` will be received.
695+
wait_ping_server_and_child_pid() ->
696+
wait_ping_server_and_child_pid0(undefined, undefined).
697+
698+
wait_ping_server_and_child_pid0(PingPongPid, ChildPid) ->
699+
Timeout =
700+
if
701+
is_pid(PingPongPid) andalso is_pid(ChildPid) -> 0;
702+
true -> 2000
703+
end,
704+
receive
705+
{ping_pong_server_ready, NewPingPongPid} ->
706+
wait_ping_server_and_child_pid0(NewPingPongPid, ChildPid);
707+
NewChildPid when is_pid(NewChildPid) ->
708+
wait_ping_server_and_child_pid0(PingPongPid, NewChildPid);
709+
{'EXIT', _, shutdown} ->
710+
error({unexpected, supervisor_shutdown});
711+
{'EXIT', _, _} = Exit ->
712+
wait_ping_server_and_child_pid0(PingPongPid, ChildPid)
713+
after Timeout ->
714+
if
715+
is_pid(PingPongPid) andalso is_pid(ChildPid) ->
716+
{PingPongPid, ChildPid};
717+
is_pid(PingPongPid) ->
718+
error({timeout, no_child_pid, crashy});
719+
is_pid(ChildPid) ->
720+
error({timeout, no_child_pid, ping_pong_server});
721+
true ->
722+
error({timeout, no_child_pid, either})
723+
end
724+
end.

0 commit comments

Comments
 (0)