Skip to content

Commit cd25de8

Browse files
committed
Add support for supervisor intensity and period
Adds support for honoring intensity and period supervisor options, allowing the prevention of endless crash and restarts by misbehaving children. Adds a test to ensure the settings work as expected on AtomVM and match OTP. Closes #1915 Signed-off-by: Winford <[email protected]>
1 parent a7bb66e commit cd25de8

File tree

3 files changed

+167
-7
lines changed

3 files changed

+167
-7
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ instead `badarg`.
9393
- Fix a memory leak in distribution when a BEAM node would monitor a process by name.
9494
- Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
9595
- Added missing support for supervisor `one_for_all` strategy.
96+
- Supervisor now honors period and intensity options.
9697

9798
## [0.6.7] - Unreleased
9899

libs/estdlib/src/supervisor.erl

Lines changed: 84 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,23 @@
9292
modules = [] :: [module()] | dynamic
9393
}).
9494
%% note: the list of children should always be kept in order, with last to start at the head.
95-
-record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}).
95+
-record(state, {
96+
restart_strategy = one_for_one :: strategy(),
97+
intensity = ?DEFAULT_INTENSITY :: non_neg_integer(),
98+
period = ?DEFAULT_PERIOD :: pos_integer(),
99+
restart_count = 0 :: non_neg_integer(),
100+
restarts = [] :: [integer()],
101+
children = [] :: [#child{}]
102+
}).
103+
104+
%% Used to trim stale restarts when the 'intensity' value is large.
105+
%% The number of restarts before triggering a purge of restarts older
106+
%% than 'period', so stale restarts do not continue to consume ram for
107+
%% the sake of MCUs with limited memory. In the future a function
108+
%% could be used to set a sane default for the platform (OTP uses 1000).
109+
-define(STALE_RESTART_LIMIT, 100).
110+
-define(DEFAULT_INTENSITY, 1).
111+
-define(DEFAULT_PERIOD, 5).
96112

97113
start_link(Module, Args) ->
98114
gen_server:start_link(?MODULE, {Module, Args}, []).
@@ -121,12 +137,23 @@ count_children(Supervisor) ->
121137
init({Mod, Args}) ->
122138
erlang:process_flag(trap_exit, true),
123139
case Mod:init(Args) of
124-
{ok, {{Strategy, _Intensity, _Period}, StartSpec}} ->
125-
State = init_state(StartSpec, #state{restart_strategy = Strategy}),
140+
{ok, {{Strategy, Intensity, Period}, StartSpec}} ->
141+
State = init_state(StartSpec, #state{
142+
restart_strategy = Strategy,
143+
intensity = Intensity,
144+
period = Period
145+
}),
126146
NewChildren = start_children(State#state.children, []),
127147
{ok, State#state{children = NewChildren}};
128-
{ok, {#{strategy := Strategy}, StartSpec}} ->
129-
State = init_state(StartSpec, #state{restart_strategy = Strategy}),
148+
{ok, {#{} = SupSpec, StartSpec}} ->
149+
Strategy = maps:get(strategy, SupSpec, one_for_one),
150+
Intensity = maps:get(intensity, SupSpec, ?DEFAULT_INTENSITY),
151+
Period = maps:get(period, SupSpec, ?DEFAULT_PERIOD),
152+
State = init_state(StartSpec, #state{
153+
restart_strategy = Strategy,
154+
intensity = Intensity,
155+
period = Period
156+
}),
130157
NewChildren = start_children(State#state.children, []),
131158
{ok, State#state{children = NewChildren}};
132159
Error ->
@@ -322,7 +349,15 @@ handle_child_exit(Pid, Reason, State) ->
322349
#child{} = Child ->
323350
case should_restart(Reason, Child#child.restart) of
324351
true ->
325-
handle_restart_strategy(Child, State);
352+
case add_restart(State) of
353+
{ok, State1} ->
354+
handle_restart_strategy(Child, State1);
355+
{shutdown, State1} ->
356+
RemainingChildren = lists:keydelete(
357+
Pid, #child.pid, State1#state.children
358+
),
359+
{shutdown, State1#state{children = RemainingChildren}}
360+
end;
326361
false ->
327362
Children = lists:keydelete(Pid, #child.pid, State#state.children),
328363
{noreply, State#state{children = Children}}
@@ -370,6 +405,8 @@ should_restart(Reason, transient) ->
370405

371406
loop_terminate([#child{pid = undefined} | Tail], AccRemaining) ->
372407
loop_terminate(Tail, AccRemaining);
408+
loop_terminate([#child{pid = {restarting, _}} | Tail], AccRemaining) ->
409+
loop_terminate(Tail, AccRemaining);
373410
loop_terminate([#child{pid = Pid} = Child | Tail], AccRemaining) when is_pid(Pid) ->
374411
do_terminate(Child),
375412
loop_terminate(Tail, [Pid | AccRemaining]);
@@ -454,6 +491,47 @@ do_terminate_one_for_all([Child | Children], StopPids) ->
454491
do_terminate_one_for_all(Children, StopPids)
455492
end.
456493

494+
add_restart(
495+
#state{
496+
intensity = Intensity, period = Period, restart_count = RestartCount, restarts = Restarts
497+
} = State
498+
) ->
499+
Now = erlang:monotonic_time(millisecond),
500+
Threshold = Now - Period * 1000,
501+
case can_restart(Intensity, Threshold, Restarts, RestartCount) of
502+
{true, RestartCount1, Restarts1} ->
503+
{ok, State#state{
504+
restarts = Restarts1 ++ [Now], restart_count = RestartCount1 + 1
505+
}};
506+
{false, _RestartCount1, _Restarts1} ->
507+
% TODO: log supervisor shutdown due to maximum intensity exceeded
508+
{shutdown, State}
509+
end.
510+
511+
can_restart(0, _, _, _) ->
512+
{false, 0, []};
513+
can_restart(_, _, _, 0) ->
514+
{true, 0, []};
515+
can_restart(Intensity, Threshold, Restarts, RestartCount) when
516+
RestartCount >= ?STALE_RESTART_LIMIT
517+
->
518+
{NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)),
519+
can_restart(Intensity, Threshold, Restarts1, NewCount);
520+
can_restart(Intensity, Threshold, [Restart | _] = Restarts, RestartCount) when
521+
RestartCount >= Intensity andalso Restart < Threshold
522+
->
523+
{NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)),
524+
can_restart(Intensity, Threshold, Restarts1, NewCount);
525+
can_restart(Intensity, _, Restarts, RestartCount) when RestartCount >= Intensity ->
526+
{false, RestartCount, Restarts};
527+
can_restart(Intensity, _, Restarts, RestartCount) when RestartCount < Intensity ->
528+
{true, RestartCount, Restarts}.
529+
530+
trim_expired_restarts(Threshold, [Restart | Restarts]) when Restart < Threshold ->
531+
trim_expired_restarts(Threshold, Restarts);
532+
trim_expired_restarts(_Threshold, Restarts) ->
533+
{length(Restarts), Restarts}.
534+
457535
child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
458536
Child =
459537
case Pid of

tests/libs/estdlib/test_supervisor.erl

Lines changed: 82 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ test() ->
3636
ok = test_which_children(),
3737
ok = test_count_children(),
3838
ok = test_one_for_all(),
39+
ok = test_crash_limits(),
3940
ok.
4041

4142
test_basic_supervisor() ->
@@ -321,7 +322,19 @@ init({test_one_for_all, Parent}) ->
321322
modules => [notify_init_server]
322323
}
323324
],
324-
{ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}.
325+
{ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}};
326+
init({test_crash_limits, Intensity, Period, Parent}) ->
327+
ChildSpec = [
328+
#{
329+
id => test_child,
330+
start => {ping_pong_server, start_link, [Parent]},
331+
restart => transient,
332+
shutdown => brutal_kill,
333+
type => worker,
334+
modules => [ping_pong_server]
335+
}
336+
],
337+
{ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}.
325338

326339
test_supervisor_order() ->
327340
{ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -440,3 +453,71 @@ test_one_for_all() ->
440453
unlink(SupPid),
441454
exit(SupPid, shutdown),
442455
ok.
456+
457+
test_crash_limits() ->
458+
%% Trap exits so this test doesn't shutdown with the supervisor
459+
process_flag(trap_exit, true),
460+
Intensity = 2,
461+
Period = 5,
462+
{ok, SupPid} = supervisor:start_link(
463+
{local, test_crash_limits}, ?MODULE, {test_crash_limits, Intensity, Period, self()}
464+
),
465+
Pid1 = get_ping_pong_pid(),
466+
gen_server:call(Pid1, {stop, test_crash1}),
467+
Pid2 = get_ping_pong_pid(),
468+
gen_server:cast(Pid2, {crash, test_crash2}),
469+
Pid3 = get_ping_pong_pid(),
470+
%% Wait until period expires so we can test that stale shutdowns are purged from the shutdown list
471+
timer:sleep(6000),
472+
473+
gen_server:call(Pid3, {stop, test_crash3}),
474+
Pid4 = get_ping_pong_pid(),
475+
gen_server:cast(Pid4, {crash, test_crash4}),
476+
Pid5 = get_ping_pong_pid(),
477+
478+
%% The next crash will reach the restart threshold and shutdown the supervisor
479+
gen_server:call(Pid5, {stop, test_crash5}),
480+
481+
%% Test supervisor has exited
482+
ok =
483+
receive
484+
{'EXIT', SupPid, shutdown} ->
485+
ok
486+
after 2000 ->
487+
error({supervisor_not_stopped, reached_max_restart_intensity})
488+
end,
489+
process_flag(trap_exit, false),
490+
491+
%% Test child crashed and was not restarted
492+
ok =
493+
try gen_server:call(Pid5, ping) of
494+
pong -> error(not_stopped, Pid5)
495+
catch
496+
exit:{noproc, _MFA} -> ok
497+
end,
498+
ok =
499+
try get_ping_pong_pid() of
500+
Pid6 when is_pid(Pid6) ->
501+
error({child_restarted, reached_max_restart_intensity})
502+
catch
503+
throw:timeout ->
504+
ok
505+
end,
506+
507+
ok =
508+
try erlang:process_info(SupPid, links) of
509+
{links, Links} when is_list(Links) ->
510+
error({not_stopped, reached_max_restart_intensity});
511+
undefined ->
512+
ok
513+
catch
514+
error:badarg ->
515+
ok
516+
end,
517+
ok.
518+
519+
get_ping_pong_pid() ->
520+
receive
521+
{ping_pong_server_ready, Pid} -> Pid
522+
after 2000 -> throw(timeout)
523+
end.

0 commit comments

Comments
 (0)