Skip to content

Commit ec0c6cc

Browse files
committed
Add support for supervisor intensity and period
Adds support for honoring the intensity and period supervisor options, preventing endless crash-and-restart loops caused by misbehaving children. Adds a test to ensure the settings work as expected on AtomVM and match OTP behavior. Closes #1915 Signed-off-by: Winford <[email protected]>
1 parent a7bb66e commit ec0c6cc

File tree

3 files changed

+165
-7
lines changed

3 files changed

+165
-7
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ instead `badarg`.
9393
- Fix a memory leak in distribution when a BEAM node would monitor a process by name.
9494
- Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
9595
- Added missing support for supervisor `one_for_all` strategy.
96+
- Supervisor now honors period and intensity options.
9697

9798
## [0.6.7] - Unreleased
9899

libs/estdlib/src/supervisor.erl

Lines changed: 82 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,21 @@
9292
modules = [] :: [module()] | dynamic
9393
}).
9494
%% note: the list of children should always be kept in order, with last to start at the head.
95-
-record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}).
95+
-record(state, {
96+
restart_strategy = one_for_one :: strategy(),
97+
intensity = 1 :: non_neg_integer(),
98+
period = 5 :: pos_integer(),
99+
restart_count = 0 :: non_neg_integer(),
100+
restarts = [] :: [integer()],
101+
children = [] :: [#child{}]
102+
}).
103+
104+
%% Used to trim stale restarts when the 'intensity' value is large.
105+
%% The number of restarts before triggering a purge of restarts older
106+
%% than 'period', so stale restarts do not continue to consume ram for
107+
%% the sake of MCUs with limited memory. In the future a function
108+
%% could be used to set a sane default for the platform (OTP uses 1000).
109+
-define(STALE_RESTART_LIMIT, 100).
96110

97111
start_link(Module, Args) ->
98112
gen_server:start_link(?MODULE, {Module, Args}, []).
@@ -121,12 +135,23 @@ count_children(Supervisor) ->
121135
init({Mod, Args}) ->
122136
erlang:process_flag(trap_exit, true),
123137
case Mod:init(Args) of
124-
{ok, {{Strategy, _Intensity, _Period}, StartSpec}} ->
125-
State = init_state(StartSpec, #state{restart_strategy = Strategy}),
138+
{ok, {{Strategy, Intensity, Period}, StartSpec}} ->
139+
State = init_state(StartSpec, #state{
140+
restart_strategy = Strategy,
141+
intensity = Intensity,
142+
period = Period
143+
}),
126144
NewChildren = start_children(State#state.children, []),
127145
{ok, State#state{children = NewChildren}};
128-
{ok, {#{strategy := Strategy}, StartSpec}} ->
129-
State = init_state(StartSpec, #state{restart_strategy = Strategy}),
146+
{ok, {#{} = SupSpec, StartSpec}} ->
147+
Strategy = maps:get(strategy, SupSpec, one_for_one),
148+
Intensity = maps:get(intensity, SupSpec, 3),
149+
Period = maps:get(period, SupSpec, 5),
150+
State = init_state(StartSpec, #state{
151+
restart_strategy = Strategy,
152+
intensity = Intensity,
153+
period = Period
154+
}),
130155
NewChildren = start_children(State#state.children, []),
131156
{ok, State#state{children = NewChildren}};
132157
Error ->
@@ -322,7 +347,15 @@ handle_child_exit(Pid, Reason, State) ->
322347
#child{} = Child ->
323348
case should_restart(Reason, Child#child.restart) of
324349
true ->
325-
handle_restart_strategy(Child, State);
350+
case add_restart(State) of
351+
{ok, State1} ->
352+
handle_restart_strategy(Child, State1);
353+
{shutdown, State1} ->
354+
RemainingChildren = lists:keydelete(
355+
Pid, #child.pid, State1#state.children
356+
),
357+
{shutdown, State1#state{children = RemainingChildren}}
358+
end;
326359
false ->
327360
Children = lists:keydelete(Pid, #child.pid, State#state.children),
328361
{noreply, State#state{children = Children}}
@@ -370,6 +403,8 @@ should_restart(Reason, transient) ->
370403

371404
loop_terminate([#child{pid = undefined} | Tail], AccRemaining) ->
372405
loop_terminate(Tail, AccRemaining);
406+
loop_terminate([#child{pid = {restarting, _}} | Tail], AccRemaining) ->
407+
loop_terminate(Tail, AccRemaining);
373408
loop_terminate([#child{pid = Pid} = Child | Tail], AccRemaining) when is_pid(Pid) ->
374409
do_terminate(Child),
375410
loop_terminate(Tail, [Pid | AccRemaining]);
@@ -454,6 +489,47 @@ do_terminate_one_for_all([Child | Children], StopPids) ->
454489
do_terminate_one_for_all(Children, StopPids)
455490
end.
456491

492+
%% @private
%% Record one restart attempt against the supervisor's intensity/period
%% budget. Returns {ok, State1} with the restart logged when another
%% restart is still allowed, or {shutdown, State} when the maximum
%% restart intensity has been exceeded.
add_restart(#state{} = State) ->
    #state{
        intensity = MaxRestarts,
        period = PeriodSeconds,
        restart_count = Count,
        restarts = History
    } = State,
    Now = erlang:monotonic_time(millisecond),
    %% Restarts recorded before this timestamp fall outside the period window.
    WindowStart = Now - PeriodSeconds * 1000,
    case can_restart(MaxRestarts, WindowStart, History, Count) of
        {true, TrimmedCount, TrimmedHistory} ->
            %% History is kept in chronological order, newest at the tail.
            NewState = State#state{
                restarts = TrimmedHistory ++ [Now],
                restart_count = TrimmedCount + 1
            },
            {ok, NewState};
        {false, _TrimmedCount, _TrimmedHistory} ->
            % TODO: log supervisor shutdown due to maximum intensity exceeded
            {shutdown, State}
    end.
508+
509+
%% @private
%% Decide whether one more restart fits within Intensity restarts per
%% period. Threshold is the oldest monotonic timestamp (ms) still inside
%% the period window. Returns {Allowed, NewRestartCount, NewRestarts},
%% with expired entries trimmed from the history before deciding.
%%
%% Bug fix: the original recursed back into can_restart/4 after trimming.
%% When the trim removed nothing (>= ?STALE_RESTART_LIMIT restarts all
%% still inside the period) the same clause matched again with identical
%% arguments, looping forever. We now decide directly on the post-trim
%% count, which always terminates and yields the same answer on every
%% previously-terminating path.
can_restart(0, _, _, _) ->
    %% intensity 0: restarts are never permitted
    {false, 0, []};
can_restart(_, _, _, 0) ->
    %% nothing recorded yet: always permitted
    {true, 0, []};
can_restart(Intensity, Threshold, Restarts, RestartCount) when
    RestartCount >= ?STALE_RESTART_LIMIT
->
    %% Bookkeeping bound reached: purge stale entries once, then decide.
    decide_after_trim(Intensity, trim_expired_restarts(Threshold, lists:sort(Restarts)));
can_restart(Intensity, Threshold, [Restart | _] = Restarts, RestartCount) when
    RestartCount >= Intensity andalso Restart < Threshold
->
    %% Budget apparently exhausted but the oldest entry is stale: purge
    %% expired entries and decide on what remains inside the window.
    decide_after_trim(Intensity, trim_expired_restarts(Threshold, lists:sort(Restarts)));
can_restart(Intensity, _, Restarts, RestartCount) when RestartCount >= Intensity ->
    {false, RestartCount, Restarts};
can_restart(Intensity, _, Restarts, RestartCount) when RestartCount < Intensity ->
    {true, RestartCount, Restarts}.

%% @private
%% Compare the post-trim restart count against the allowed intensity.
%% Called only after a trim pass so it never needs to trim again.
decide_after_trim(Intensity, {NewCount, Restarts1}) when NewCount >= Intensity ->
    {false, NewCount, Restarts1};
decide_after_trim(_Intensity, {NewCount, Restarts1}) ->
    {true, NewCount, Restarts1}.
527+
528+
%% @private
%% Drop leading restart timestamps older than Threshold from an
%% ascending-sorted list and return {RemainingCount, RemainingRestarts}.
trim_expired_restarts(Threshold, Restarts) ->
    Live = lists:dropwhile(fun(Timestamp) -> Timestamp < Threshold end, Restarts),
    {length(Live), Live}.
532+
457533
child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
458534
Child =
459535
case Pid of

tests/libs/estdlib/test_supervisor.erl

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ test() ->
3636
ok = test_which_children(),
3737
ok = test_count_children(),
3838
ok = test_one_for_all(),
39+
ok = test_crash_limits(),
3940
ok.
4041

4142
test_basic_supervisor() ->
@@ -321,7 +322,19 @@ init({test_one_for_all, Parent}) ->
321322
modules => [notify_init_server]
322323
}
323324
],
324-
{ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}.
325+
{ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}};
326+
init({test_crash_limits, Intensity, Period, Parent}) ->
327+
ChildSpec = [
328+
#{
329+
id => test_child,
330+
start => {ping_pong_server, start_link, [Parent]},
331+
restart => transient,
332+
shutdown => brutal_kill,
333+
type => worker,
334+
modules => [ping_pong_server]
335+
}
336+
],
337+
{ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}.
325338

326339
test_supervisor_order() ->
327340
{ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -440,3 +453,71 @@ test_one_for_all() ->
440453
unlink(SupPid),
441454
exit(SupPid, shutdown),
442455
ok.
456+
457+
%% Exercise the supervisor intensity/period restart limits: two restarts
%% are allowed inside one period, stale restarts are purged once the
%% period elapses, and exceeding the budget shuts the supervisor down.
test_crash_limits() ->
    %% Trap exits so this test doesn't shutdown with the supervisor
    process_flag(trap_exit, true),
    Intensity = 2,
    Period = 5,
    {ok, SupPid} = supervisor:start_link(
        {local, test_crash_limits}, ?MODULE, {test_crash_limits, Intensity, Period, self()}
    ),
    %% Use up the full intensity budget (2 restarts) inside one period.
    Pid1 = get_ping_pong_pid(),
    gen_server:call(Pid1, {stop, test_crash1}),
    Pid2 = get_ping_pong_pid(),
    gen_server:cast(Pid2, {crash, test_crash2}),
    Pid3 = get_ping_pong_pid(),
    %% Wait until the period expires so we can test that stale restarts are
    %% purged from the supervisor's restart history.
    timer:sleep((Period + 1) * 1000),

    gen_server:call(Pid3, {stop, test_crash3}),
    Pid4 = get_ping_pong_pid(),
    gen_server:cast(Pid4, {crash, test_crash4}),
    Pid5 = get_ping_pong_pid(),

    %% The next crash will reach the restart threshold and shutdown the supervisor
    gen_server:call(Pid5, {stop, test_crash5}),

    %% Test supervisor has exited
    ok =
        receive
            {'EXIT', SupPid, shutdown} ->
                ok
        after 2000 ->
            error({supervisor_not_stopped, reached_max_restart_intensity})
        end,
    process_flag(trap_exit, false),

    %% Test child crashed and was not restarted.
    %% Note: error/1 takes a single reason term here; the original called
    %% error(not_stopped, Pid5), but error/2's second argument must be an
    %% argument list, so the failure path itself would raise badarg.
    ok =
        try gen_server:call(Pid5, ping) of
            pong -> error({not_stopped, Pid5})
        catch
            exit:{noproc, _MFA} -> ok
        end,
    ok =
        try get_ping_pong_pid() of
            Pid6 when is_pid(Pid6) ->
                error({child_restarted, reached_max_restart_intensity})
        catch
            throw:timeout ->
                ok
        end,

    %% Test the supervisor process is really gone: process_info/2 returns
    %% 'undefined' for a dead local pid; badarg is also tolerated.
    ok =
        try erlang:process_info(SupPid, links) of
            {links, Links} when is_list(Links) ->
                error({not_stopped, reached_max_restart_intensity});
            undefined ->
                ok
        catch
            error:badarg ->
                ok
        end,
    ok.
518+
519+
%% Await the {ping_pong_server_ready, Pid} notification sent by a freshly
%% (re)started child and return its pid; throws 'timeout' after 2s.
get_ping_pong_pid() ->
    receive
        {ping_pong_server_ready, ChildPid} ->
            ChildPid
    after 2000 ->
        throw(timeout)
    end.

0 commit comments

Comments
 (0)