Commit ad05bf7

Add support for supervisor one_for_all strategy
Adds support for handling restarts with the `one_for_all` strategy, which was documented but lacked an implementation. Makes the necessary changes to ensure children are always restarted in the same order they were originally started, and shut down in reverse order with the last child first, conforming to OTP behavior.

Closes #1855

Signed-off-by: Winford <[email protected]>
1 parent 5e97803 commit ad05bf7
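
For context, a minimal `one_for_all` supervisor callback module is sketched below. This is an illustration only, not code from this commit: the module and child names (example_sup, worker_a, worker_b) are hypothetical, and the map-based supervisor flags and child specs mirror the format used in the new test. With this strategy, when any child terminates abnormally the supervisor shuts the remaining children down last-started first and then restarts all of them in their original start order.

-module(example_sup).
-behaviour(supervisor).
-export([start_link/0, init/1]).

start_link() ->
    supervisor:start_link({local, ?MODULE}, ?MODULE, []).

init([]) ->
    %% Hypothetical example: a crash in either worker takes the other down as
    %% well, and both are restarted in the order listed here.
    SupFlags = #{strategy => one_for_all, intensity => 5, period => 60},
    ChildSpecs = [
        #{
            id => worker_a,
            start => {worker_a, start_link, []},
            restart => permanent,
            shutdown => brutal_kill,
            type => worker,
            modules => [worker_a]
        },
        #{
            id => worker_b,
            start => {worker_b, start_link, []},
            restart => transient,
            shutdown => brutal_kill,
            type => worker,
            modules => [worker_b]
        }
    ],
    {ok, {SupFlags, ChildSpecs}}.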

File tree: 3 files changed, +197 −15 lines changed

  CHANGELOG.md
  libs/estdlib/src/supervisor.erl
  tests/libs/estdlib/test_supervisor.erl


CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ instead `badarg`.
 - Fixed a bug where empty atom could not be created on some platforms, thus breaking receiving a message for a registered process from an OTP node.
 - Fix a memory leak in distribution when a BEAM node would monitor a process by name.
 - Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
+- Added missing support for supervisor `one_for_all` strategy.

 ## [0.6.7] - Unreleased

libs/estdlib/src/supervisor.erl

Lines changed: 101 additions & 14 deletions
@@ -83,18 +83,20 @@
 }.

 -record(child, {
-    pid = undefined,
+    pid = undefined :: pid() | undefined | {restarting, pid()} | {restarting, undefined},
     id :: any(),
     start :: {module(), atom(), [any()] | undefined},
     restart :: restart(),
     shutdown :: shutdown(),
     type :: child_type(),
     modules = [] :: [module()] | dynamic
 }).
--record(state, {restart_strategy :: strategy(), children = [] :: [#child{}]}).
+%% note: the list of children should always be kept in order, with first to start at the head.
+-record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}).

 start_link(Module, Args) ->
     gen_server:start_link(?MODULE, {Module, Args}, []).
+
 start_link(SupName, Module, Args) ->
     gen_server:start_link(SupName, ?MODULE, {Module, Args}, []).

@@ -172,7 +174,7 @@ child_spec_to_record(#{id := ChildId, start := MFA} = ChildMap) ->
 init_state([ChildSpec | T], State) ->
     Child = child_spec_to_record(ChildSpec),
     NewChildren = [Child | State#state.children],
-    init_state(T, #state{children = NewChildren});
+    init_state(T, State#state{children = NewChildren});
 init_state([], State) ->
     State#state{children = lists:reverse(State#state.children)}.

@@ -182,7 +184,8 @@
             start_children(T, [Child#child{pid = Pid} | StartedC])
     end;
 start_children([], StartedC) ->
-    StartedC.
+    %% We should always keep the start list in order for later one_for_all restarts.
+    lists:reverse(StartedC).

 handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State) ->
     Child = child_spec_to_record(ChildSpec),
@@ -196,7 +199,9 @@ handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State
             case try_start(Child) of
                 {ok, Pid, Result} ->
                     UpdatedChild = Child#child{pid = Pid},
-                    {reply, Result, State#state{children = [UpdatedChild | Children]}};
+                    %% The last child to start should always be at the end of the child
+                    %% start list.
+                    {reply, Result, State#state{children = Children ++ [UpdatedChild]}};
                 {error, _Reason} = ErrorT ->
                     {reply, ErrorT, State}
             end
@@ -255,6 +260,9 @@ handle_cast(_Msg, State) ->

 handle_info({'EXIT', Pid, Reason}, State) ->
     case handle_child_exit(Pid, Reason, State) of
+        {ok, State1, restart_all_children} ->
+            Children = State1#state.children,
+            {noreply, State1, {timeout, 0, {restart_many_children, Children}}};
         {ok, State1} ->
             {noreply, State1};
         {shutdown, State1} ->
@@ -268,13 +276,28 @@ handle_info({ensure_killed, Pid}, State) ->
             exit(Pid, kill),
             {noreply, State}
     end;
+handle_info({restart_many_children, []}, State) ->
+    {noreply, State};
+handle_info({restart_many_children, [#child{id = Id} = Child | Children] = ChildSpecs}, State) ->
+    case try_start(Child) of
+        {ok, NewPid, _Result} ->
+            NewChild = Child#child{pid = NewPid},
+            NewChildren = lists:keyreplace(
+                Id, #child.id, State#state.children, NewChild
+            ),
+            {noreply, State#state{children = NewChildren},
+                {timeout, 0, {restart_many_children, Children}}};
+        {error, _Reason} ->
+            {noreply, State, {timeout, 0, {restart_many_children, ChildSpecs}}}
+    end;
 handle_info(_Msg, State) ->
     %TODO: log unexpected message
     {noreply, State}.

 %% @hidden
 terminate(_Reason, #state{children = Children} = State) ->
-    RemainingChildren = loop_terminate(Children, []),
+    %% Shutdown children last to first.
+    RemainingChildren = loop_terminate(lists:reverse(Children), []),
     loop_wait_termination(RemainingChildren),
     {ok, State}.

@@ -298,20 +321,33 @@ handle_child_exit(Pid, Reason, State) ->
         #child{} = Child ->
             case should_restart(Reason, Child#child.restart) of
                 true ->
-                    case try_start(Child) of
-                        {ok, NewPid, _Result} ->
-                            NewChild = Child#child{pid = NewPid},
-                            Children = lists:keyreplace(
-                                Pid, #child.pid, State#state.children, NewChild
-                            ),
-                            {ok, State#state{children = Children}}
-                    end;
+                    handle_restart_strategy(Child, State);
                 false ->
                     Children = lists:keydelete(Pid, #child.pid, State#state.children),
                     {ok, State#state{children = Children}}
             end
     end.

+handle_restart_strategy(
+    #child{id = Id} = Child, #state{restart_strategy = one_for_one} = State
+) ->
+    case try_start(Child) of
+        {ok, NewPid, _Result} ->
+            NewChild = Child#child{pid = NewPid},
+            Children = lists:keyreplace(
+                Id, #child.id, State#state.children, NewChild
+            ),
+            {ok, State#state{children = Children}}
+    end;
+handle_restart_strategy(
+    #child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State
+) ->
+    Children = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{
+        pid = {restarting, Pid}
+    }),
+    {ok, NewChildren} = terminate_one_for_all_restart(Children),
+    {ok, State#state{children = NewChildren}, restart_all_children}.
+
 should_restart(_Reason, permanent) ->
     true;
 should_restart(_Reason, temporary) ->
@@ -367,6 +403,57 @@ try_start(#child{start = {M, F, Args}} = Record) ->
             {error, {{'EXIT', Error}, Record}}
     end.

+terminate_one_for_all_restart(Children) ->
+    %% Always shut down last child first
+    do_terminate_one_for_all_restart(lists:reverse(Children), []).
+
+do_terminate_one_for_all_restart([], NewChildren) ->
+    {ok, lists:reverse(NewChildren)};
+do_terminate_one_for_all_restart([Child | Children], NewChildren) ->
+    case Child of
+        #child{restart = {terminating, temporary, From}} = Child when is_pid(From) ->
+            do_terminate(Child),
+            ok = wait_child_exit(Child),
+            gen_server:reply(From, ok),
+            do_terminate_one_for_all_restart(Children, NewChildren);
+        #child{restart = {terminating, _Restart, From}} = Child when is_pid(From) ->
+            do_terminate(Child),
+            ok = wait_child_exit(Child),
+            gen_server:reply(From, ok),
+            do_terminate_one_for_all_restart(Children, [Child#child{pid = undefined} | NewChildren]);
+        #child{pid = undefined, restart = temporary} = Child ->
+            do_terminate_one_for_all_restart(Children, NewChildren);
+        #child{pid = Pid, restart = temporary} = Child when is_pid(Pid) ->
+            do_terminate(Child),
+            ok = wait_child_exit(Child),
+            do_terminate_one_for_all_restart(Children, NewChildren);
+        #child{pid = undefined} = Child ->
+            do_terminate_one_for_all_restart(Children, [Child | NewChildren]);
+        #child{pid = {restarting, _Pid}} = Child ->
+            do_terminate_one_for_all_restart(Children, [Child | NewChildren]);
+        #child{pid = Pid} = Child when is_pid(Pid) ->
+            do_terminate(Child),
+            ok = wait_child_exit(Child),
+            do_terminate_one_for_all_restart(Children, [
+                Child#child{pid = {restarting, Pid}} | NewChildren
+            ])
+    end.
+
+wait_child_exit(#child{pid = Pid, shutdown = brutal_kill} = Child) ->
+    receive
+        {'EXIT', Pid, _Reason} ->
+            ok
+    after 500 ->
+        {timeout, {no_exit, Child}}
+    end;
+wait_child_exit(#child{pid = Pid, shutdown = Timeout} = Child) ->
+    receive
+        {'EXIT', Pid, _Reason} ->
+            ok
+    after Timeout + 100 ->
+        {timeout, {no_exit, Child}}
+    end.
+
 child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
     Child =
         case Pid of

tests/libs/estdlib/test_supervisor.erl

Lines changed: 95 additions & 1 deletion
@@ -35,6 +35,7 @@ test() ->
     ok = test_terminate_timeout(),
     ok = test_which_children(),
     ok = test_count_children(),
+    ok = test_one_for_all(),
     ok.

 test_basic_supervisor() ->
@@ -292,7 +293,35 @@ init({test_supervisor_order, Parent}) ->
     ],
     {ok, {{one_for_one, 10000, 3600}, ChildSpecs}};
 init({test_no_child, _Parent}) ->
-    {ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}}.
+    {ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}};
+init({test_one_for_all, Parent}) ->
+    ChildSpecs = [
+        #{
+            id => ping_pong_1,
+            start => {ping_pong_server, start_link, [Parent]},
+            restart => permanent,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [ping_pong_server]
+        },
+        #{
+            id => ping_pong_2,
+            start => {ping_pong_server, start_link, [Parent]},
+            restart => transient,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [ping_pong_server]
+        },
+        #{
+            id => ready_0,
+            start => {notify_init_server, start_link, [{Parent, ready_0}]},
+            restart => temporary,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [notify_init_server]
+        }
+    ],
+    {ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}.

 test_supervisor_order() ->
     {ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -346,3 +375,68 @@ test_which_children() ->
     unlink(SupPid),
     exit(SupPid, shutdown),
     ok.
+
+test_one_for_all() ->
+    {ok, SupPid} = supervisor:start_link({local, allforone}, ?MODULE, {test_one_for_all, self()}),
+    % Collect startup message from permanent ping_pong_server
+    Server_1 = get_and_test_server(),
+    % Collect startup message from transient ping_pong_server
+    Server_2 = get_and_test_server(),
+    % Collect startup message from temporary notify_init_server
+    ready_0 =
+        receive
+            Msg1 -> Msg1
+        after 1000 -> error({timeout, {start, ready_0}})
+        end,
+
+    [{specs, 3}, {active, 3}, {supervisors, 0}, {workers, 3}] = supervisor:count_children(SupPid),
+
+    %% Monitor transient Server_2 to make sure it is stopped, and restarted when
+    %% permanent Server_1 is shutdown.
+    MonRef = monitor(process, Server_2),
+    ok = gen_server:call(Server_1, {stop, test_crash}),
+    %% Server_2 should exit before the first child is restarted, but exit messages from
+    %% monitored processes may take some time to be received so we may get the message
+    %% from the first restarted child first.
+    First =
+        receive
+            {'DOWN', MonRef, process, Server_2, killed} ->
+                down;
+            {ping_pong_server_ready, Restart1} when is_pid(Restart1) ->
+                ready
+        after 1000 ->
+            error({timeout, restart_after_crash})
+        end,
+    ok =
+        case First of
+            down ->
+                receive
+                    {ping_pong_server_ready, Restart_1} when is_pid(Restart_1) -> ok
+                after 1000 ->
+                    error({timeout, restart_after_crash})
+                end;
+            ready ->
+                receive
+                    {'DOWN', MonRef, process, Server_2, killed} -> ok
+                after 1000 ->
+                    error({timeout, restart_after_crash})
+                end
+        end,
+
+    demonitor(MonRef),
+
+    % Collect startup message from restarted transient ping_pong_server child
+    _Restart_2 = get_and_test_server(),
+    % Make sure temporary notify_init_server is not restarted
+    no_start =
+        receive
+            ready_0 -> error({error, restarted_temporary})
+        after 1000 -> no_start
+        end,
+
+    % Ensure correct number of children
+    [{specs, 2}, {active, 2}, {supervisors, 0}, {workers, 2}] = supervisor:count_children(SupPid),
+
+    unlink(SupPid),
+    exit(SupPid, shutdown),
+    ok.
