Skip to content

Commit 7b2435d

Browse files
committed
Fix supervisor restart when child fails to match OTP
Fixes a function clause exception that would crash the supervisor if a child fails to restart by matching the same behavior as OTP, and continue to try restarting the child until success, or maximum `intensity` is reached within the allowed `period`. Closed #1957 Signed-off-by: Winford <[email protected]>
1 parent 990520a commit 7b2435d

File tree

3 files changed

+195
-14
lines changed

3 files changed

+195
-14
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,7 @@ instead `badarg`.
9191
- Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
9292
- Added missing support for supervisor `one_for_all` strategy.
9393
- Supervisor now honors period and intensity options.
94+
- Fix supervisor crash if a child fails to restart.
9495

9596
## [0.6.7] - Unreleased
9697

libs/estdlib/src/supervisor.erl

Lines changed: 61 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -235,7 +235,12 @@ restart_child(Pid, Reason, #state{restart_strategy = one_for_one} = State) ->
235235
Children = lists:keyreplace(
236236
Pid, #child.pid, State1#state.children, NewChild
237237
),
238-
{ok, State1#state{children = Children}}
238+
{ok, State1#state{children = Children}};
239+
{error, _} ->
240+
erlang:send_after(
241+
50, self(), {try_again_restart, Child#child.id}
242+
),
243+
{ok, State1}
239244
end;
240245
{shutdown, State1} ->
241246
RemainingChildren = lists:keydelete(
@@ -271,8 +276,9 @@ restart_child(Pid, Reason, #state{restart_strategy = one_for_all} = State) ->
271276
case restart_many_children(Child, Siblings) of
272277
{ok, NewChildren} ->
273278
{ok, State1#state{children = NewChildren}};
274-
_Error ->
275-
{shutdown, State1}
279+
{ok, NewChildren, RetryID} ->
280+
erlang:send_after(50, self(), {try_again_restart, RetryID}),
281+
{ok, State1#state{children = NewChildren}}
276282
end;
277283
{shutdown, State1} ->
278284
RemainingChildren = lists:keydelete(
@@ -380,6 +386,26 @@ handle_info({ensure_killed, Pid}, State) ->
380386
exit(Pid, kill),
381387
{noreply, State}
382388
end;
389+
handle_info({try_again_restart, Id}, State) ->
390+
Child = lists:keyfind(Id, #child.id, State#state.children),
391+
case add_restart(State) of
392+
{ok, State1} ->
393+
case try_start(Child) of
394+
{ok, NewPid, _Result} ->
395+
UpdatedChildren = lists:keyreplace(
396+
Id, Child#child.id, State#state.children, Child#child{pid = NewPid}
397+
),
398+
{noreply, State#state{children = UpdatedChildren}};
399+
{error, {_, _}} ->
400+
erlang:send_after(50, self(), {try_again_restart, Id}),
401+
{noreply, State1}
402+
end;
403+
{shutdown, State1} ->
404+
RemainingChildren = lists:keydelete(
405+
Id, #child.id, State1#state.children
406+
),
407+
{stop, shutdown, State1#state{children = RemainingChildren}}
408+
end;
383409
handle_info(_Msg, State) ->
384410
%TODO: log unexpected message
385411
{noreply, State}.
@@ -488,7 +514,7 @@ trim_expired_restarts(_Threshold, Restarts) ->
488514
restart_many_children(#child{pid = Pid} = Child, Children) ->
489515
Siblings = lists:keydelete(Pid, #child.pid, Children),
490516
{ok, Children1} = terminate_many_children(Siblings, [Child#child{pid = {restarting, Pid}}]),
491-
do_restart_children(Children1, []).
517+
do_restart_children(Children1, [], []).
492518

493519
terminate_many_children([], NewChildren) ->
494520
{ok, lists:reverse(NewChildren)};
@@ -510,26 +536,49 @@ terminate_many_children([Child | Children], NewChildren) ->
510536
])
511537
end.
512538

513-
do_restart_children([], NewChildren) ->
539+
do_restart_children([], NewChildren, []) ->
514540
{ok, lists:reverse(NewChildren)};
515-
do_restart_children([#child{pid = Pid} = Child | Children], NewChildren) ->
541+
do_restart_children([], NewChildren, [RetryChild | T] = RetryChildren) ->
542+
if
543+
length(T) =:= 0 ->
544+
{ok, {lists:reverse(NewChildren), RetryChild#child.id}};
545+
true ->
546+
ok = differed_try_again(RetryChildren),
547+
{ok, lists:reverse(NewChildren)}
548+
end;
549+
do_restart_children([#child{pid = Pid} = Child | Children], NewChildren, RetryChildren) ->
516550
case Pid of
517551
{restarting, _} ->
518552
case try_start(Child) of
519553
{ok, Pid1, {ok, Pid1}} ->
520-
do_restart_children(Children, [Child#child{pid = Pid1} | NewChildren]);
554+
do_restart_children(
555+
Children, [Child#child{pid = Pid1} | NewChildren], RetryChildren
556+
);
521557
{ok, Pid1, {ok, Pid1, _Result}} ->
522-
do_restart_children(Children, [Child#child{pid = Pid1} | NewChildren]);
558+
do_restart_children(
559+
Children, [Child#child{pid = Pid1} | NewChildren], RetryChildren
560+
);
523561
{ok, undefined, {ok, undefined}} ->
524-
do_restart_children(Children, [Child#child{pid = undefined} | NewChildren]);
525-
{error, _} = Error ->
526-
Error
562+
do_restart_children(
563+
Children, [Child#child{pid = undefined} | NewChildren], RetryChildren
564+
);
565+
{error, _} ->
566+
do_restart_children(Children, NewChildren, [Child | RetryChildren])
527567
end;
528568
_ ->
529569
% retain previous ignore children without starting them
530-
do_restart_children(Children, [Child | NewChildren])
570+
do_restart_children(Children, [Child | NewChildren], RetryChildren)
531571
end.
532572

573+
%% Schedules "try again" restarts at 50ms intervals when multiple children have failed to restart
574+
%% on the first attempt. This is an accumulated (reverse start order) list, so the children that
575+
%% should start last get the longest delay before sending the try_again_restart request.
576+
differed_try_again([]) ->
577+
ok;
578+
differed_try_again([Child | Children] = RetryChildren) ->
579+
erlang:send_after(50 * length(RetryChildren), self(), {try_again_restart, Child#child.id}),
580+
differed_try_again(Children).
581+
533582
child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
534583
Child =
535584
case Pid of

tests/libs/estdlib/test_supervisor.erl

Lines changed: 133 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ test() ->
3737
ok = test_count_children(),
3838
ok = test_one_for_all(),
3939
ok = test_crash_limits(),
40+
ok = try_again_restart(),
4041
ok.
4142

4243
test_basic_supervisor() ->
@@ -220,7 +221,50 @@ child_start({trap_exit, Parent}) ->
220221
ok -> ok
221222
end
222223
end),
223-
{ok, Pid}.
224+
{ok, Pid};
225+
child_start({get_permission, Arbitrator, Parent}) ->
226+
Arbitrator ! {can_start, self()},
227+
CanStart =
228+
receive
229+
{do_start, Start} -> Start
230+
after 2000 ->
231+
{timeout, arbitrator}
232+
end,
233+
case CanStart of
234+
true ->
235+
Pid = spawn_link(fun() ->
236+
receive
237+
ok -> ok
238+
end
239+
end),
240+
Parent ! Pid,
241+
{ok, Pid};
242+
false ->
243+
{error, start_denied};
244+
Error ->
245+
{error, Error}
246+
end.
247+
248+
arbitrator_start(Deny) when is_integer(Deny) ->
249+
receive
250+
{can_start, From} ->
251+
From ! {do_start, true}
252+
end,
253+
arbitrator(Deny).
254+
255+
arbitrator(Deny) ->
256+
Allow =
257+
if
258+
Deny =< 0 -> true;
259+
true -> false
260+
end,
261+
receive
262+
{can_start, From} ->
263+
From ! {do_start, Allow},
264+
arbitrator(Deny - 1);
265+
shutdown ->
266+
ok
267+
end.
224268

225269
test_ping_pong(SupPid) ->
226270
Pid1 = get_and_test_server(),
@@ -334,7 +378,19 @@ init({test_crash_limits, Intensity, Period, Parent}) ->
334378
modules => [ping_pong_server]
335379
}
336380
],
337-
{ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}.
381+
{ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}};
382+
init({test_try_again, Arbitrator, Parent}) ->
383+
ChildSpec = [
384+
#{
385+
id => finicky_child,
386+
start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]},
387+
restart => permanent,
388+
shutdown => brutal_kill,
389+
type => worker,
390+
modules => [?MODULE]
391+
}
392+
],
393+
{ok, {#{strategy => one_for_one, intensity => 5, period => 10}, ChildSpec}}.
338394

339395
test_supervisor_order() ->
340396
{ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -521,3 +577,78 @@ get_ping_pong_pid() ->
521577
{ping_pong_server_ready, Pid} -> Pid
522578
after 2000 -> throw(timeout)
523579
end.
580+
581+
try_again_restart() ->
582+
process_flag(trap_exit, true),
583+
timer:sleep(100),
584+
Arbitrator = erlang:spawn(fun() -> arbitrator_start(4) end),
585+
ok =
586+
case is_pid(Arbitrator) of
587+
true -> ok;
588+
false -> {error, no_arbitrator}
589+
end,
590+
{ok, SupPid} = supervisor:start_link(
591+
{local, try_again_test}, ?MODULE, {test_try_again, Arbitrator, self()}
592+
),
593+
ok =
594+
case is_pid(SupPid) of
595+
true -> ok;
596+
false -> {error, no_supervisor}
597+
end,
598+
ChildPid =
599+
receive
600+
Pid0 when is_pid(Pid0) ->
601+
Pid0
602+
after 2000 ->
603+
error({timeout, no_child_start})
604+
end,
605+
ok =
606+
case is_pid(ChildPid) of
607+
true -> ok;
608+
false -> ChildPid
609+
end,
610+
ChildPid ! ok,
611+
ChildPid1 =
612+
receive
613+
Pid1 when is_pid(Pid1) ->
614+
Pid1
615+
after 2000 ->
616+
error({timeout, no_child_restart})
617+
end,
618+
ChildPid1 ! ok,
619+
Arbitrator ! shutdown,
620+
exit(SupPid, normal),
621+
ok =
622+
receive
623+
{'EXIT', SupPid, normal} ->
624+
ok
625+
after 2000 ->
626+
error({supervisor_not_stopped, normal})
627+
end,
628+
629+
Arbitrator2 = erlang:spawn(fun() -> arbitrator_start(5) end),
630+
{ok, SupPid2} = supervisor:start_link(
631+
{local, test_try_again}, ?MODULE, {test_try_again, Arbitrator2, self()}
632+
),
633+
ok =
634+
case is_pid(SupPid2) of
635+
true -> ok;
636+
false -> {error, no_supervisor2}
637+
end,
638+
ChildPid2 =
639+
receive
640+
Pid2 when is_pid(Pid2) ->
641+
Pid2
642+
after 2000 ->
643+
error({timeout, no_child2_start})
644+
end,
645+
ChildPid2 ! ok,
646+
ok =
647+
receive
648+
{'EXIT', SupPid2, shutdown} ->
649+
ok
650+
after 2000 ->
651+
error({supervisor_not_stopped, restart_try_again_exceeded})
652+
end,
653+
process_flag(trap_exit, false),
654+
ok.

0 commit comments

Comments
 (0)