Skip to content

Commit 707f71b

Browse files
committed
Fix supervisor restart when child fails to match OTP
Fixes a function clause exception that would crash the supervisor if a child fails to restart by matching the same behavior as OTP, and continue to try restarting the child until success, or maximum `intensity` is reached within the allowed `period`. Closed #1957 Signed-off-by: Winford <[email protected]>
1 parent 511b037 commit 707f71b

File tree

3 files changed

+218
-7
lines changed

3 files changed

+218
-7
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ instead `badarg`.
9393
- Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
9494
- Added missing support for supervisor `one_for_all` strategy.
9595
- Supervisor now honors period and intensity options.
96+
- Fix supervisor crash if a `one_for_one` child fails to restart.
9697

9798
## [0.6.7] - Unreleased
9899

libs/estdlib/src/supervisor.erl

Lines changed: 56 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -285,11 +285,13 @@ handle_cast(_Msg, State) ->
285285

286286
handle_info({'EXIT', Pid, Reason}, State) ->
287287
case handle_child_exit(Pid, Reason, State) of
288+
{ok, State1} ->
289+
{noreply, State1};
288290
{ok, State1, restart_all_children} ->
289291
Children = State1#state.children,
290292
{noreply, State1, {timeout, 0, {restart_many_children, Children}}};
291-
{ok, State1} ->
292-
{noreply, State1};
293+
{ok, State1, {try_again_restart, ChildId}} ->
294+
{noreply, State1, {timeout, 0, {try_again_restart, ChildId}}};
293295
{shutdown, State1} ->
294296
{stop, shutdown, State1}
295297
end;
@@ -303,7 +305,7 @@ handle_info({ensure_killed, Pid}, State) ->
303305
end;
304306
handle_info({restart_many_children, []}, State) ->
305307
{noreply, State};
306-
handle_info({restart_many_children, [#child{id = Id} = Child | Children] = ChildSpecs}, State) ->
308+
handle_info({restart_many_children, [#child{id = Id} = Child | Children] = _ChildSpecs}, State) ->
307309
case try_start(Child) of
308310
{ok, NewPid, _Result} ->
309311
NewChild = Child#child{pid = NewPid},
@@ -313,7 +315,50 @@ handle_info({restart_many_children, [#child{id = Id} = Child | Children] = Child
313315
{noreply, State#state{children = NewChildren},
314316
{timeout, 0, {restart_many_children, Children}}};
315317
{error, _Reason} ->
316-
{noreply, State, {timeout, 0, {restart_many_children, ChildSpecs}}}
318+
{noreply, State, {timeout, 0, {try_again_restart_continue, Child, Children}}}
319+
end;
320+
handle_info({try_again_restart, Id}, State) ->
321+
case lists:keyfind(Id, #child.id, State#state.children) of
322+
false ->
323+
{ok, State};
324+
Child ->
325+
case add_restart(State) of
326+
{ok, State1} ->
327+
case try_start(Child) of
328+
{ok, NewPid, _Result} ->
329+
UpdatedChildren = lists:keyreplace(
330+
Id, Child#child.id, State1#state.children, Child#child{pid = NewPid}
331+
),
332+
{noreply, State1#state{children = UpdatedChildren}};
333+
{error, {_, _}} ->
334+
{noreply, State1, {timeout, 0, {try_again_restart, Id}}}
335+
end;
336+
{shutdown, State1} ->
337+
RemainingChildren = lists:keydelete(Id, #child.id, State#state.children),
338+
{stop, shutdown, State1#state{children = RemainingChildren}}
339+
end
340+
end;
341+
handle_info({try_again_restart_continue, Id, Children}, State) ->
342+
case lists:keyfind(Id, #child.id, State#state.children) of
343+
false ->
344+
{ok, State};
345+
Child ->
346+
case add_restart(State) of
347+
{ok, State1} ->
348+
case try_start(Child) of
349+
{ok, NewPid, _Result} ->
350+
UpdatedChildren = lists:keyreplace(
351+
Id, Child#child.id, State1#state.children, Child#child{pid = NewPid}
352+
),
353+
{noreply, State1#state{children = UpdatedChildren},
354+
{timeout, 0, {restart_many_children, Children}}};
355+
{error, {_, _}} ->
356+
{noreply, State1, {timeout, 0, {try_again_restart, Id}}}
357+
end;
358+
{shutdown, State1} ->
359+
RemainingChildren = lists:keydelete(Id, #child.id, State#state.children),
360+
{stop, shutdown, State1#state{children = RemainingChildren}}
361+
end
317362
end;
318363
handle_info(_Msg, State) ->
319364
%TODO: log unexpected message
@@ -370,7 +415,13 @@ handle_restart_strategy(
370415
Children = lists:keyreplace(
371416
Id, #child.id, State#state.children, NewChild
372417
),
373-
{ok, State#state{children = Children}}
418+
{ok, State#state{children = Children}};
419+
{error, _} ->
420+
NewChild = Child#child{pid = {restarting, Child#child.pid}},
421+
Children = lists:keyreplace(
422+
Id, #child.id, State#state.children, NewChild
423+
),
424+
{ok, State#state{children = Children}, {try_again_restart, Id}}
374425
end;
375426
handle_restart_strategy(
376427
#child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State

tests/libs/estdlib/test_supervisor.erl

Lines changed: 161 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ test() ->
3737
ok = test_count_children(),
3838
ok = test_one_for_all(),
3939
ok = test_crash_limits(),
40+
ok = try_again_restart(),
4041
ok.
4142

4243
test_basic_supervisor() ->
@@ -220,7 +221,50 @@ child_start({trap_exit, Parent}) ->
220221
ok -> ok
221222
end
222223
end),
223-
{ok, Pid}.
224+
{ok, Pid};
225+
child_start({get_permission, Arbitrator, Parent}) ->
226+
Arbitrator ! {can_start, self()},
227+
CanStart =
228+
receive
229+
{do_start, Start} -> Start
230+
after 2000 ->
231+
{timeout, arbitrator}
232+
end,
233+
case CanStart of
234+
true ->
235+
Pid = spawn_link(fun() ->
236+
receive
237+
stop -> exit(normal)
238+
end
239+
end),
240+
Parent ! Pid,
241+
{ok, Pid};
242+
false ->
243+
{error, start_denied};
244+
Error ->
245+
{error, Error}
246+
end.
247+
248+
arbitrator_start(Deny) when is_integer(Deny) ->
249+
receive
250+
{can_start, From} ->
251+
From ! {do_start, true}
252+
end,
253+
arbitrator(Deny).
254+
255+
arbitrator(Deny) ->
256+
Allow =
257+
if
258+
Deny =< 0 -> true;
259+
true -> false
260+
end,
261+
receive
262+
{can_start, From} ->
263+
From ! {do_start, Allow},
264+
arbitrator(Deny - 1);
265+
shutdown ->
266+
ok
267+
end.
224268

225269
test_ping_pong(SupPid) ->
226270
Pid1 = get_and_test_server(),
@@ -334,7 +378,39 @@ init({test_crash_limits, Intensity, Period, Parent}) ->
334378
modules => [ping_pong_server]
335379
}
336380
],
337-
{ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}.
381+
{ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}};
382+
init({test_try_again, Arbitrator, Parent}) ->
383+
ChildSpec = [
384+
#{
385+
id => finicky_child,
386+
start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]},
387+
restart => permanent,
388+
shutdown => brutal_kill,
389+
type => worker,
390+
modules => [?MODULE]
391+
}
392+
],
393+
{ok, {#{strategy => one_for_one, intensity => 5, period => 10}, ChildSpec}};
394+
init({test_retry_one_for_all, Arbitrator, Parent}) ->
395+
ChildSpec = [
396+
#{
397+
id => ping,
398+
start => {ping_pong_server, start_link, [Parent]},
399+
restart => permanent,
400+
shutdown => 2000,
401+
type => worker,
402+
modules => [?MODULE]
403+
},
404+
#{
405+
id => crashy_child,
406+
start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]},
407+
restart => permanent,
408+
shutdown => brutal_kill,
409+
type => worker,
410+
modules => [?MODULE]
411+
}
412+
],
413+
{ok, {#{strategy => one_for_all, intensity => 5, period => 10}, ChildSpec}}.
338414

339415
test_supervisor_order() ->
340416
{ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -521,3 +597,86 @@ get_ping_pong_pid() ->
521597
{ping_pong_server_ready, Pid} -> Pid
522598
after 2000 -> throw(timeout)
523599
end.
600+
601+
try_again_restart() ->
602+
process_flag(trap_exit, true),
603+
604+
%% Intensity is 5, use the arbitrator to prevent the child from restarting
605+
%% 4 times. This should not exit the supervisor due to intensity.
606+
Arbitrator1 = erlang:spawn(fun() -> arbitrator_start(4) end),
607+
{ok, SupPid1} = supervisor:start_link(
608+
{local, try_again_test1}, ?MODULE, {test_try_again, Arbitrator1, self()}
609+
),
610+
ChildPid = wait_child_pid(),
611+
612+
ChildPid ! stop,
613+
ChildPid1 = wait_child_pid(),
614+
615+
ChildPid1 ! stop,
616+
Arbitrator1 ! shutdown,
617+
exit(SupPid1, normal),
618+
ok =
619+
receive
620+
{'EXIT', SupPid1, normal} ->
621+
ok
622+
after 2000 ->
623+
error({supervisor_not_stopped, normal})
624+
end,
625+
626+
%% Prevent 5 restart attempts allow on the 6th, this should cause the supervisor
627+
%% to shutdown on the 6th attempt, which happens before period expires and we are
628+
%% already at max restart intensity.
629+
Arbitrator2 = erlang:spawn(fun() -> arbitrator_start(5) end),
630+
{ok, SupPid2} = supervisor:start_link(
631+
{local, test_try_again2}, ?MODULE, {test_try_again, Arbitrator2, self()}
632+
),
633+
ChildPid2 = wait_child_pid(),
634+
635+
ChildPid2 ! stop,
636+
ok =
637+
receive
638+
{'EXIT', SupPid2, shutdown} ->
639+
ok
640+
after 2000 ->
641+
error({supervisor_not_stopped, restart_try_again_exceeded})
642+
end,
643+
Arbitrator2 ! shutdown,
644+
645+
%% Test one_for_all, child 2 uses arbitrator to deny 4 restart attempts, Ping1 exiting
646+
%% a single time after should cause a supervisor shutdown.
647+
Arbitrator3 = erlang:spawn(fun() -> arbitrator_start(4) end),
648+
{ok, SupPid3} = supervisor:start_link(
649+
{local, try_again_test3}, ?MODULE, {test_retry_one_for_all, Arbitrator3, self()}
650+
),
651+
652+
Ping1 = get_and_test_server(),
653+
654+
_Crashy1 = wait_child_pid(),
655+
656+
gen_server:cast(Ping1, {crash, test}),
657+
658+
_Ping2 = get_and_test_server(),
659+
Crashy2 = wait_child_pid(),
660+
%% this will exit the child triggering a one_for_all restart.
661+
Crashy2 ! stop,
662+
663+
%% ping_pong_server has 2000ms timeout, we need to wait longer.
664+
ok =
665+
receive
666+
{'EXIT', SupPid3, shutdown} ->
667+
ok
668+
after 5000 ->
669+
error({supervisor_not_stopped, one_for_all_restarts_exceeded})
670+
end,
671+
Arbitrator3 ! shutdown,
672+
673+
process_flag(trap_exit, false),
674+
ok.
675+
676+
wait_child_pid() ->
677+
receive
678+
Pid when is_pid(Pid) ->
679+
Pid
680+
after 1000 ->
681+
error({timeout, no_child_pid})
682+
end.

0 commit comments

Comments
 (0)