Commit 511b037

Add support for supervisor intensity and period
Adds support for honoring the `intensity` and `period` supervisor options, preventing endless crash-and-restart loops caused by misbehaving children. Adds a test to ensure the settings work as expected on AtomVM and match OTP behavior.

Closes #1915

Signed-off-by: Winford <[email protected]>
1 parent ad05bf7 commit 511b037
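
In practice these are the standard OTP supervisor flags. A supervisor callback that opts into the limits honored by this change might look like the following sketch (the child module my_worker is a placeholder for illustration, not part of the commit):

init(_Args) ->
    %% Allow at most 3 restarts within any 5-second window; a fourth crash
    %% inside the window causes the supervisor itself to shut down.
    SupFlags = #{strategy => one_for_one, intensity => 3, period => 5},
    ChildSpecs = [
        #{
            id => my_worker,
            start => {my_worker, start_link, []},
            restart => permanent,
            shutdown => 5000,
            type => worker,
            modules => [my_worker]
        }
    ],
    {ok, {SupFlags, ChildSpecs}}.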

File tree

3 files changed: +167 -7 lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -92,6 +92,7 @@ instead `badarg`.
 - Fix a memory leak in distribution when a BEAM node would monitor a process by name.
 - Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
 - Added missing support for supervisor `one_for_all` strategy.
+- Supervisor now honors period and intensity options.

 ## [0.6.7] - Unreleased

libs/estdlib/src/supervisor.erl

Lines changed: 84 additions & 6 deletions
@@ -92,7 +92,21 @@
     modules = [] :: [module()] | dynamic
 }).
 %% note: the list of children should always be kept in order, with first to start at the head.
--record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}).
+-record(state, {
+    restart_strategy = one_for_one :: strategy(),
+    intensity = 3 :: non_neg_integer(),
+    period = 5 :: pos_integer(),
+    restart_count = 0 :: non_neg_integer(),
+    restarts = [] :: [integer()],
+    children = [] :: [#child{}]
+}).
+
+%% Used to trim stale restarts when the 'intensity' value is large.
+%% The number of restarts before triggering a purge of restarts older
+%% than 'period', so stale restarts do not continue to consume ram for
+%% the sake of MCUs with limited memory. In the future a function
+%% could be used to set a sane default for the platform (OTP uses 1000).
+-define(STALE_RESTART_LIMIT, 100).

 start_link(Module, Args) ->
     gen_server:start_link(?MODULE, {Module, Args}, []).
@@ -121,12 +135,23 @@ count_children(Supervisor) ->
 init({Mod, Args}) ->
     erlang:process_flag(trap_exit, true),
     case Mod:init(Args) of
-        {ok, {{Strategy, _Intensity, _Period}, StartSpec}} ->
-            State = init_state(StartSpec, #state{restart_strategy = Strategy}),
+        {ok, {{Strategy, Intensity, Period}, StartSpec}} ->
+            State = init_state(StartSpec, #state{
+                restart_strategy = Strategy,
+                intensity = Intensity,
+                period = Period
+            }),
             NewChildren = start_children(State#state.children, []),
             {ok, State#state{children = NewChildren}};
-        {ok, {#{strategy := Strategy}, StartSpec}} ->
-            State = init_state(StartSpec, #state{restart_strategy = Strategy}),
+        {ok, {#{} = SupSpec, StartSpec}} ->
+            Strategy = maps:get(strategy, SupSpec, one_for_one),
+            Intensity = maps:get(intensity, SupSpec, 3),
+            Period = maps:get(period, SupSpec, 5),
+            State = init_state(StartSpec, #state{
+                restart_strategy = Strategy,
+                intensity = Intensity,
+                period = Period
+            }),
             NewChildren = start_children(State#state.children, []),
             {ok, State#state{children = NewChildren}};
         Error ->
@@ -321,7 +346,15 @@ handle_child_exit(Pid, Reason, State) ->
         #child{} = Child ->
             case should_restart(Reason, Child#child.restart) of
                 true ->
-                    handle_restart_strategy(Child, State);
+                    case add_restart(State) of
+                        {ok, State1} ->
+                            handle_restart_strategy(Child, State1);
+                        {shutdown, State1} ->
+                            RemainingChildren = lists:keydelete(
+                                Pid, #child.pid, State1#state.children
+                            ),
+                            {shutdown, State1#state{children = RemainingChildren}}
+                    end;
                 false ->
                     Children = lists:keydelete(Pid, #child.pid, State#state.children),
                     {ok, State#state{children = Children}}
@@ -360,6 +393,8 @@ should_restart(Reason, transient) ->

 loop_terminate([#child{pid = undefined} | Tail], AccRemaining) ->
     loop_terminate(Tail, AccRemaining);
+loop_terminate([#child{pid = {restarting, _}} | Tail], AccRemaining) ->
+    loop_terminate(Tail, AccRemaining);
 loop_terminate([#child{pid = Pid} = Child | Tail], AccRemaining) when is_pid(Pid) ->
     do_terminate(Child),
     loop_terminate(Tail, [Pid | AccRemaining]);
@@ -454,6 +489,49 @@ wait_child_exit(#child{pid = Pid, shutdown = Timeout} = Child) ->
             {timeout, {no_exit, Child}}
     end.

+add_restart(
+    #state{
+        intensity = Intensity, period = Period, restart_count = RestartCount, restarts = Restarts
+    } = State
+) ->
+    Now = erlang:monotonic_time(millisecond),
+    Threshold = Now - Period * 1000,
+    case can_restart(Intensity, Threshold, Restarts, RestartCount) of
+        {true, RestartCount1, Restarts1} ->
+            {ok, State#state{
+                restarts = Restarts1 ++ [Now], restart_count = RestartCount1 + 1
+            }};
+        {false, RestartCount1, Restarts1} ->
+            % TODO: log supervisor shutdown due to maximum intensity exceeded
+            {shutdown, State#state{
+                restarts = Restarts1, restart_count = RestartCount1
+            }}
+    end.
+
+can_restart(0, _, _, _) ->
+    {false, 0, []};
+can_restart(_, _, _, 0) ->
+    {true, 0, []};
+can_restart(Intensity, Threshold, Restarts, RestartCount) when
+    RestartCount >= ?STALE_RESTART_LIMIT
+->
+    {NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)),
+    can_restart(Intensity, Threshold, Restarts1, NewCount);
+can_restart(Intensity, Threshold, [Restart | _] = Restarts, RestartCount) when
+    RestartCount >= Intensity andalso Restart < Threshold
+->
+    {NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)),
+    can_restart(Intensity, Threshold, Restarts1, NewCount);
+can_restart(Intensity, _, Restarts, RestartCount) when RestartCount >= Intensity ->
+    {false, RestartCount, Restarts};
+can_restart(Intensity, _, Restarts, RestartCount) when RestartCount < Intensity ->
+    {true, RestartCount, Restarts}.
+
+trim_expired_restarts(Threshold, [Restart | Restarts]) when Restart < Threshold ->
+    trim_expired_restarts(Threshold, Restarts);
+trim_expired_restarts(_Threshold, Restarts) ->
+    {length(Restarts), Restarts}.
+
 child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
     Child =
         case Pid of
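
The core of the change is the sliding-window accounting in add_restart/1, can_restart/4 and trim_expired_restarts/2 above: each restart is stamped with a monotonic timestamp; when the count reaches the intensity (or the STALE_RESTART_LIMIT cap), timestamps older than the period are pruned, and the restart is refused only if intensity timestamps still fall inside the window. The standalone sketch below restates that idea in compact form; the module and function names are hypothetical and not part of the commit:

%% Illustrative only: a simplified restatement of the window check performed by
%% add_restart/1 above. Restarts is a list of monotonic millisecond timestamps.
-module(restart_window_demo).
-export([allow/3]).

allow(Intensity, PeriodSeconds, Restarts) ->
    Now = erlang:monotonic_time(millisecond),
    Threshold = Now - PeriodSeconds * 1000,
    %% Drop timestamps that have fallen outside the window.
    Recent = [T || T <- Restarts, T >= Threshold],
    case length(Recent) < Intensity of
        %% Still under the limit: record this restart and allow it.
        true -> {ok, Recent ++ [Now]};
        %% Limit reached within the period: the supervisor should give up.
        false -> {shutdown, Recent}
    end.

With the settings used by the new test (intensity 2, period 5), the third crash inside any 5-second window yields shutdown, matching OTP's behavior.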

tests/libs/estdlib/test_supervisor.erl

Lines changed: 82 additions & 1 deletion
@@ -36,6 +36,7 @@ test() ->
     ok = test_which_children(),
     ok = test_count_children(),
     ok = test_one_for_all(),
+    ok = test_crash_limits(),
     ok.

 test_basic_supervisor() ->
@@ -321,7 +322,19 @@ init({test_one_for_all, Parent}) ->
             modules => [notify_init_server]
         }
     ],
-    {ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}.
+    {ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}};
+init({test_crash_limits, Intensity, Period, Parent}) ->
+    ChildSpec = [
+        #{
+            id => test_child,
+            start => {ping_pong_server, start_link, [Parent]},
+            restart => transient,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [ping_pong_server]
+        }
+    ],
+    {ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}.

 test_supervisor_order() ->
     {ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -440,3 +453,71 @@ test_one_for_all() ->
     unlink(SupPid),
     exit(SupPid, shutdown),
     ok.
+
+test_crash_limits() ->
+    %% Trap exits so this test doesn't shutdown with the supervisor
+    process_flag(trap_exit, true),
+    Intensity = 2,
+    Period = 5,
+    {ok, SupPid} = supervisor:start_link(
+        {local, test_crash_limits}, ?MODULE, {test_crash_limits, Intensity, Period, self()}
+    ),
+    Pid1 = get_ping_pong_pid(),
+    gen_server:call(Pid1, {stop, test_crash1}),
+    Pid2 = get_ping_pong_pid(),
+    gen_server:cast(Pid2, {crash, test_crash2}),
+    Pid3 = get_ping_pong_pid(),
+    %% Wait until period expires so we can test that stale shutdowns are purged from the shutdown list
+    timer:sleep(6000),
+
+    gen_server:call(Pid3, {stop, test_crash3}),
+    Pid4 = get_ping_pong_pid(),
+    gen_server:cast(Pid4, {crash, test_crash4}),
+    Pid5 = get_ping_pong_pid(),
+
+    %% The next crash will reach the restart threshold and shutdown the supervisor
+    gen_server:call(Pid5, {stop, test_crash5}),
+
+    %% Test supervisor has exited
+    ok =
+        receive
+            {'EXIT', SupPid, shutdown} ->
+                ok
+        after 2000 ->
+            error({supervisor_not_stopped, reached_max_restart_intensity})
+        end,
+    process_flag(trap_exit, false),
+
+    %% Test child crashed and was not restarted
+    ok =
+        try gen_server:call(Pid5, ping) of
+            pong -> error(not_stopped, Pid5)
+        catch
+            exit:{noproc, _MFA} -> ok
+        end,
+    ok =
+        try get_ping_pong_pid() of
+            Pid6 when is_pid(Pid6) ->
+                error({child_restarted, reached_max_restart_intensity})
+        catch
+            throw:timeout ->
+                ok
+        end,
+
+    ok =
+        try erlang:process_info(SupPid, links) of
+            {links, Links} when is_list(Links) ->
+                error({not_stopped, reached_max_restart_intensity});
+            undefined ->
+                ok
+        catch
+            error:badarg ->
+                ok
+        end,
+    ok.
+
+get_ping_pong_pid() ->
+    receive
+        {ping_pong_server_ready, Pid} -> Pid
+    after 2000 -> throw(timeout)
+    end.
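
The test leans on the ping_pong_server helper that already exists in the test suite; this commit does not touch it. For readers without the tree at hand, the contract the test assumes can be sketched roughly as below. This is a hypothetical stand-in, not the actual helper: a gen_server that announces itself to its parent on init, replies pong to ping, stops on a {stop, Reason} call, and crashes on a {crash, Reason} cast.

%% Sketch of a gen_server compatible with the calls made in test_crash_limits/0.
%% NOT the real tests/libs/estdlib helper, only an illustration of the assumed contract.
-module(ping_pong_server_sketch).
-behaviour(gen_server).
-export([start_link/1]).
-export([init/1, handle_call/3, handle_cast/2]).

start_link(Parent) ->
    gen_server:start_link(?MODULE, Parent, []).

init(Parent) ->
    %% Tell the parent (the test process) which pid to talk to; the supervisor
    %% re-runs init on every restart, so the parent learns each new pid.
    Parent ! {ping_pong_server_ready, self()},
    {ok, Parent}.

handle_call(ping, _From, State) ->
    {reply, pong, State};
handle_call({stop, Reason}, _From, State) ->
    %% Reply, then terminate with the given (abnormal) reason.
    {stop, Reason, ok, State}.

handle_cast({crash, Reason}, _State) ->
    %% Crash the server outright.
    error(Reason).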
