
Commit d081272

Fix supervisor restart when a child fails, to match OTP

Fixes a function clause exception that would crash the supervisor when a child fails to restart. The supervisor now matches OTP behavior: it keeps trying to restart the child until the restart succeeds or the maximum `intensity` is reached within the allowed `period`.

Closes #1957

Signed-off-by: Winford <[email protected]>
1 parent 9c4f7f2 commit d081272
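
For illustration only: a child whose start function can return `{error, Reason}` on a restart attempt is enough to reach the code path fixed here. The module below is a hypothetical sketch (not part of this commit); with this fix, a `one_for_one` supervisor keeps retrying such a child instead of crashing, until the start succeeds or `intensity` restarts accumulate within `period`.

    %% Hypothetical flaky worker: fails to start while a dependency is missing.
    -module(flaky_worker).
    -export([start_link/0]).

    start_link() ->
        case whereis(required_service) of
            undefined ->
                %% Restart attempt fails; the supervisor retries it.
                {error, dependency_not_ready};
            _Pid ->
                {ok, spawn_link(fun() -> receive stop -> ok end end)}
        end.

A matching (also hypothetical) child spec would be `#{id => flaky, start => {flaky_worker, start_link, []}, restart => permanent, shutdown => brutal_kill, type => worker, modules => [flaky_worker]}` under `#{strategy => one_for_one, intensity => 5, period => 10}`.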

File tree

3 files changed: +247 -6 lines changed

- CHANGELOG.md
- libs/estdlib/src/supervisor.erl
- tests/libs/estdlib/test_supervisor.erl

CHANGELOG.md
Lines changed: 1 addition & 0 deletions

@@ -93,6 +93,7 @@ instead `badarg`.
 - Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
 - Added missing support for supervisor `one_for_all` strategy.
 - Supervisor now honors period and intensity options.
+- Fix supervisor crash if a `one_for_one` child fails to restart.
 
 ## [0.6.7] - Unreleased

libs/estdlib/src/supervisor.erl
Lines changed: 60 additions & 4 deletions

@@ -292,6 +292,8 @@ handle_info({'EXIT', Pid, Reason}, State) ->
         {ok, State1, restart_all_children} ->
             Children = State1#state.children,
             {noreply, State1, {timeout, 0, {restart_many_children, Children}}};
+        {ok, State1, {try_again_restart, ChildId}} ->
+            {noreply, State1, {timeout, 0, {try_again_restart, ChildId}}};
         {shutdown, State1} ->
             {stop, shutdown, State1}
     end;
@@ -305,17 +307,65 @@ handle_info({ensure_killed, Pid}, State) ->
     end;
 handle_info({restart_many_children, []}, State) ->
     {noreply, State};
-handle_info({restart_many_children, [#child{id = Id} = Child | Children] = ChildSpecs}, State) ->
+handle_info({restart_many_children, [#child{id = Id} = Child | Siblings] = _ChildSpecs}, State) ->
     case try_start(Child) of
         {ok, NewPid, _Result} ->
             NewChild = Child#child{pid = NewPid},
             NewChildren = lists:keyreplace(
                 Id, #child.id, State#state.children, NewChild
             ),
             {noreply, State#state{children = NewChildren},
-                {timeout, 0, {restart_many_children, Children}}};
+                {timeout, 0, {restart_many_children, Siblings}}};
         {error, _Reason} ->
-            {noreply, State, {timeout, 0, {restart_many_children, ChildSpecs}}}
+            case Siblings of
+                [] ->
+                    {noreply, State, {timeout, 0, {try_again_restart, Id}}};
+                Siblings ->
+                    {noreply, State, {timeout, 0, {try_again_restart_continue, Child, Siblings}}}
+            end
+    end;
+handle_info({try_again_restart, Id}, State) ->
+    case lists:keyfind(Id, #child.id, State#state.children) of
+        false ->
+            {noreply, State};
+        Child ->
+            case add_restart(State) of
+                {ok, State1} ->
+                    case try_start(Child) of
+                        {ok, NewPid, _Result} ->
+                            UpdatedChildren = lists:keyreplace(
+                                Id, #child.id, State1#state.children, Child#child{pid = NewPid}
+                            ),
+                            {noreply, State1#state{children = UpdatedChildren}};
+                        {error, {_, _}} ->
+                            {noreply, State1, {timeout, 0, {try_again_restart, Id}}}
+                    end;
+                {shutdown, State1} ->
+                    RemainingChildren = lists:keydelete(Id, #child.id, State1#state.children),
+                    {stop, shutdown, State1#state{children = RemainingChildren}}
+            end
+    end;
+handle_info({try_again_restart_continue, #child{id = Id} = Child, Siblings}, State) ->
+    case lists:keyfind(Id, #child.id, State#state.children) of
+        false ->
+            {noreply, State, {timeout, 0, {restart_many_children, Siblings}}};
+        Child ->
+            case add_restart(State) of
+                {ok, State1} ->
+                    case try_start(Child) of
+                        {ok, NewPid, _Result} ->
+                            UpdatedChildren = lists:keyreplace(
+                                Child#child.id, #child.id, State1#state.children, Child#child{pid = NewPid}
+                            ),
+                            {noreply, State1#state{children = UpdatedChildren},
+                                {timeout, 0, {restart_many_children, Siblings}}};
+                        {error, {_, _}} ->
+                            {noreply, State1, {timeout, 0, {try_again_restart_continue, Child, Siblings}}}
+                    end;
+                {shutdown, State1} ->
+                    RemainingChildren = lists:keydelete(Child#child.id, #child.id, State1#state.children),
+                    {stop, shutdown, State1#state{children = RemainingChildren}}
+            end
     end;
 handle_info(_Msg, State) ->
     %TODO: log unexpected message
@@ -372,7 +422,13 @@ handle_restart_strategy(
             Children = lists:keyreplace(
                 Id, #child.id, State#state.children, NewChild
             ),
-            {ok, State#state{children = Children}}
+            {ok, State#state{children = Children}};
+        {error, _} ->
+            NewChild = Child#child{pid = {restarting, Child#child.pid}},
+            Children = lists:keyreplace(
+                Id, #child.id, State#state.children, NewChild
+            ),
+            {ok, State#state{children = Children}, {try_again_restart, Id}}
     end;
 handle_restart_strategy(
     #child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State
tests/libs/estdlib/test_supervisor.erl
Lines changed: 186 additions & 2 deletions

@@ -37,6 +37,7 @@ test() ->
     ok = test_count_children(),
     ok = test_one_for_all(),
     ok = test_crash_limits(),
+    ok = try_again_restart(),
     ok.
 
 test_basic_supervisor() ->
@@ -220,7 +221,51 @@ child_start({trap_exit, Parent}) ->
             ok -> ok
         end
     end),
-    {ok, Pid}.
+    {ok, Pid};
+child_start({get_permission, Arbitrator, Parent}) ->
+    Arbitrator ! {can_start, self()},
+    CanStart =
+        receive
+            {do_start, Start} -> Start
+        after 2000 ->
+            {timeout, arbitrator}
+        end,
+    case CanStart of
+        true ->
+            Pid = spawn_link(fun() ->
+                receive
+                    stop -> exit(normal)
+                end
+            end),
+            register(crashy, Pid),
+            Parent ! Pid,
+            {ok, Pid};
+        false ->
+            {error, start_denied};
+        Error ->
+            {error, Error}
+    end.
+
+arbitrator_start(Deny) when is_integer(Deny) ->
+    receive
+        {can_start, From} ->
+            From ! {do_start, true}
+    end,
+    arbitrator(Deny).
+
+arbitrator(Deny) ->
+    Allow =
+        if
+            Deny =< 0 -> true;
+            true -> false
+        end,
+    receive
+        {can_start, From} ->
+            From ! {do_start, Allow},
+            arbitrator(Deny - 1);
+        shutdown ->
+            ok
+    end.
 
 test_ping_pong(SupPid) ->
     Pid1 = get_and_test_server(),
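
The arbitrator above is a plain process implementing a tiny request/response protocol: each `{can_start, From}` message is answered with `{do_start, true | false}`, and a denial makes `child_start/1` return `{error, start_denied}`, which is exactly the failing-restart case the supervisor fix targets. As a standalone illustration of that protocol (shell-style, reusing `arbitrator_start/1` from the diff above):

    %% Allow the initial request, then deny the next two before allowing again.
    Arb = erlang:spawn(fun() -> arbitrator_start(2) end),
    Arb ! {can_start, self()},
    true = receive {do_start, Allowed} -> Allowed end,  % first request is granted
    Arb ! shutdown.
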
@@ -334,7 +379,39 @@ init({test_crash_limits, Intensity, Period, Parent}) ->
             modules => [ping_pong_server]
         }
     ],
-    {ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}.
+    {ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}};
+init({test_try_again, Arbitrator, Parent}) ->
+    ChildSpec = [
+        #{
+            id => finicky_child,
+            start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]},
+            restart => permanent,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [?MODULE]
+        }
+    ],
+    {ok, {#{strategy => one_for_one, intensity => 5, period => 10}, ChildSpec}};
+init({test_retry_one_for_all, Arbitrator, Parent}) ->
+    ChildSpec = [
+        #{
+            id => ping,
+            start => {ping_pong_server, start_link, [Parent]},
+            restart => permanent,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [ping_pong_server]
+        },
+        #{
+            id => crashy_child,
+            start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]},
+            restart => permanent,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [?MODULE]
+        }
+    ],
+    {ok, {#{strategy => one_for_all, intensity => 5, period => 10}, ChildSpec}}.
 
 test_supervisor_order() ->
     {ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -521,3 +598,110 @@ get_ping_pong_pid() ->
         {ping_pong_server_ready, Pid} -> Pid
     after 2000 -> throw(timeout)
     end.
+
+try_again_restart() ->
+    process_flag(trap_exit, true),
+
+    %% Intensity is 5; use the arbitrator to prevent the child from restarting
+    %% 4 times. This should not cause the supervisor to exit due to intensity.
+    Arbitrator1 = erlang:spawn(fun() -> arbitrator_start(4) end),
+    {ok, SupPid1} = supervisor:start_link(
+        {local, try_again_test1}, ?MODULE, {test_try_again, Arbitrator1, self()}
+    ),
+    ChildPid = wait_child_pid("ChildPid"),
+
+    ChildPid ! stop,
+    ChildPid1 = wait_child_pid("ChildPid1"),
+
+    ChildPid1 ! stop,
+    Arbitrator1 ! shutdown,
+    exit(SupPid1, normal),
+    ok =
+        receive
+            {'EXIT', SupPid1, normal} ->
+                ok
+        after 2000 ->
+            error({supervisor_not_stopped, normal})
+        end,
+
+    %% Deny 5 restart attempts and allow the 6th. This should cause the supervisor
+    %% to shut down on the 6th attempt, which happens before the period expires while
+    %% we are already at the maximum restart intensity.
+    Arbitrator2 = erlang:spawn(fun() -> arbitrator_start(5) end),
+    {ok, SupPid2} = supervisor:start_link(
+        {local, test_try_again2}, ?MODULE, {test_try_again, Arbitrator2, self()}
+    ),
+    ChildPid2 = wait_child_pid("ChildPid2"),
+
+    ChildPid2 ! stop,
+    ok =
+        receive
+            {'EXIT', SupPid2, shutdown} ->
+                ok
+        after 2000 ->
+            error({supervisor_not_stopped, restart_try_again_exceeded})
+        end,
+    Arbitrator2 ! shutdown,
+
+    %% Test one_for_all:
+    %% child 2 uses the arbitrator to deny 3 restart attempts, succeeding on the 4th.
+    Arbitrator3 = erlang:spawn(fun() -> arbitrator_start(4) end),
+    {ok, SupPid3} = supervisor:start_link(
+        {local, try_again_test3}, ?MODULE, {test_retry_one_for_all, Arbitrator3, self()}
+    ),
+
+    Ping1 = wait_ping_server(),
+    _Crashy1 = wait_child_pid("Crashy1"),
+
+    %% This will take 5 restarts (1 to restart ping + 3 denied attempts for
+    %% crashy, succeeding on the 4th).
+    gen_server:call(Ping1, {stop, normal}),
+
+    Ping2 = wait_ping_server(),
+    %% Crashy will restart without error since the deny count was reached after
+    %% the first time it was stopped.
+    _Crashy2 = wait_child_pid("Crashy2"),
+
+    %% This will surely exceed the limit.
+    %crashy ! stop,
+    gen_server:call(Ping2, {stop, normal}),
+
+    %% ping_pong_server has a 2000 ms timeout, so we need to wait longer.
+    ok =
+        receive
+            {'EXIT', SupPid3, shutdown} ->
+                ok
+        after 5000 ->
+            error({supervisor_not_stopped, one_for_all_restarts_exceeded})
+        end,
+    Arbitrator3 ! shutdown,
+
+    process_flag(trap_exit, false),
+    ok.
+
+wait_ping_server() ->
+    receive
+        {ping_pong_server_ready, Pid} ->
+            Pid;
+        {'EXIT', _, shutdown} ->
+            error({unexpected, supervisor_shutdown});
+        {'EXIT', _, _} = Exit ->
+            %% TODO: remove before final commit, just for debugging what might have
+            %% happened to the ping_pong gen_server on OTP... \o/
+            io:format("~n>>> TEST wait_ping_server/0 caught exit: ~w~n", [Exit]),
+            wait_ping_server()
+    after 2000 ->
+        error({timeout, no_child_pid, ping_pong_server})
+    end.
+
+wait_child_pid(Name) ->
+    receive
+        Pid when is_pid(Pid) ->
+            Pid;
+        {'EXIT', _, shutdown} ->
+            error({unexpected, supervisor_shutdown});
+        {'EXIT', _, _} ->
+            wait_child_pid(Name)
+    after 1000 ->
+        error({timeout, no_child_pid, Name})
+    end.
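
The new scenario is wired into the suite's entry point, `test_supervisor:test/0`. Assuming the module has been compiled and is on the code path (the AtomVM test harness that normally drives it is not shown here), it can also be run directly from an Erlang shell:

    %% Runs all test_supervisor checks, including the new try_again_restart/0 case.
    ok = test_supervisor:test().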
