Commit 24c0297

Add support for supervisor one_for_all strategy
Adds support for handling restarts with the `one_for_all` strategy, which was documented but lacked an implementation. Also makes the changes needed to ensure children are always restarted in the same order they were originally started, and shut down in reverse order (last child first), conforming to OTP behavior.

Closes #1855

Signed-off-by: Winford <[email protected]>
1 parent 5e97803 commit 24c0297
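For illustration, a minimal sketch of a supervisor callback module that relies on the strategy this commit implements. The worker modules `worker_a` and `worker_b` are hypothetical placeholders, not part of this commit; the map-based supervisor flags and child specs follow the same forms used in the new test further below.

    -module(my_one_for_all_sup).
    -behaviour(supervisor).
    -export([start_link/0, init/1]).

    start_link() ->
        supervisor:start_link({local, ?MODULE}, ?MODULE, []).

    init([]) ->
        SupFlags = #{strategy => one_for_all, intensity => 5, period => 10},
        ChildSpecs = [
            %% Started first, so restarted first after any sibling crashes.
            #{
                id => worker_a,
                start => {worker_a, start_link, []},
                restart => permanent,
                shutdown => brutal_kill,
                type => worker,
                modules => [worker_a]
            },
            %% Started second, so shut down first when the group is torn down.
            #{
                id => worker_b,
                start => {worker_b, start_link, []},
                restart => permanent,
                shutdown => brutal_kill,
                type => worker,
                modules => [worker_b]
            }
        ],
        {ok, {SupFlags, ChildSpecs}}.

If either worker exits abnormally, the supervisor terminates the surviving children in reverse start order and then restarts the whole group in the original start order.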

File tree (3 files changed: +228 / -15 lines):

  CHANGELOG.md
  libs/estdlib/src/supervisor.erl
  tests/libs/estdlib/test_supervisor.erl
CHANGELOG.md

Lines changed: 1 addition & 0 deletions
@@ -91,6 +91,7 @@ instead `badarg`.
 - Fixed a bug where empty atom could not be created on some platforms, thus breaking receiving a message for a registered process from an OTP node.
 - Fix a memory leak in distribution when a BEAM node would monitor a process by name.
 - Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
+- Added missing support for supervisor `one_for_all` strategy.
 
 ## [0.6.7] - Unreleased

libs/estdlib/src/supervisor.erl

Lines changed: 132 additions & 14 deletions
@@ -83,18 +83,20 @@
 }.
 
 -record(child, {
-    pid = undefined,
+    pid = undefined :: pid() | undefined | {restarting, pid()} | {restarting, undefined},
     id :: any(),
     start :: {module(), atom(), [any()] | undefined},
     restart :: restart(),
     shutdown :: shutdown(),
     type :: child_type(),
     modules = [] :: [module()] | dynamic
 }).
--record(state, {restart_strategy :: strategy(), children = [] :: [#child{}]}).
+%% note: the list of children should always be kept in order, with first to start at the head.
+-record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}).
 
 start_link(Module, Args) ->
     gen_server:start_link(?MODULE, {Module, Args}, []).
+
 start_link(SupName, Module, Args) ->
     gen_server:start_link(SupName, ?MODULE, {Module, Args}, []).
 
@@ -172,7 +174,7 @@ child_spec_to_record(#{id := ChildId, start := MFA} = ChildMap) ->
 init_state([ChildSpec | T], State) ->
     Child = child_spec_to_record(ChildSpec),
     NewChildren = [Child | State#state.children],
-    init_state(T, #state{children = NewChildren});
+    init_state(T, State#state{children = NewChildren});
 init_state([], State) ->
     State#state{children = lists:reverse(State#state.children)}.

@@ -182,21 +184,26 @@ start_children([Child | T], StartedC) ->
             start_children(T, [Child#child{pid = Pid} | StartedC])
     end;
 start_children([], StartedC) ->
-    StartedC.
+    %% We should always keep the start list in order for later one_for_all restarts.
+    lists:reverse(StartedC).
 
 handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State) ->
     Child = child_spec_to_record(ChildSpec),
     #child{id = ID} = Child,
     case lists:keyfind(ID, #child.id, State#state.children) of
         #child{pid = undefined} ->
             {reply, {error, already_present}, State};
+        #child{pid = {restarting, _Pid}} ->
+            {reply, {error, {already_started, restarting}}, State};
         #child{pid = Pid} ->
             {reply, {error, {already_started, Pid}}, State};
         false ->
             case try_start(Child) of
                 {ok, Pid, Result} ->
                     UpdatedChild = Child#child{pid = Pid},
-                    {reply, Result, State#state{children = [UpdatedChild | Children]}};
+                    %% The last child to start should always be at the end of the child
+                    %% start list.
+                    {reply, Result, State#state{children = Children ++ [UpdatedChild]}};
                 {error, _Reason} = ErrorT ->
                     {reply, ErrorT, State}
             end
 
@@ -257,6 +264,9 @@ handle_info({'EXIT', Pid, Reason}, State) ->
     case handle_child_exit(Pid, Reason, State) of
         {ok, State1} ->
             {noreply, State1};
+        {ok, State1, restart_all_children} ->
+            Children = State1#state.children,
+            {noreply, State1, {timeout, 0, {restart_many_children, Children}}};
         {shutdown, State1} ->
             {stop, shutdown, State1}
     end;
 
@@ -268,13 +278,28 @@ handle_info({ensure_killed, Pid}, State) ->
             exit(Pid, kill),
             {noreply, State}
     end;
+handle_info({restart_many_children, []}, State) ->
+    {noreply, State};
+handle_info({restart_many_children, [#child{id = Id} = Child | Children] = ChildSpecs}, State) ->
+    case try_start(Child) of
+        {ok, NewPid, _Result} ->
+            NewChild = Child#child{pid = NewPid},
+            NewChildren = lists:keyreplace(
+                Id, #child.id, State#state.children, NewChild
+            ),
+            {noreply, State#state{children = NewChildren},
+                {timeout, 0, {restart_many_children, Children}}};
+        {error, _Reason} ->
+            {noreply, State, {timeout, 0, {restart_many_children, ChildSpecs}}}
+    end;
 handle_info(_Msg, State) ->
     %TODO: log unexpected message
     {noreply, State}.
 
 %% @hidden
 terminate(_Reason, #state{children = Children} = State) ->
-    RemainingChildren = loop_terminate(Children, []),
+    %% Shutdown children last to first.
+    RemainingChildren = loop_terminate(lists:reverse(Children), []),
     loop_wait_termination(RemainingChildren),
     {ok, State}.

@@ -298,20 +323,38 @@ handle_child_exit(Pid, Reason, State) ->
         #child{} = Child ->
             case should_restart(Reason, Child#child.restart) of
                 true ->
-                    case try_start(Child) of
-                        {ok, NewPid, _Result} ->
-                            NewChild = Child#child{pid = NewPid},
-                            Children = lists:keyreplace(
-                                Pid, #child.pid, State#state.children, NewChild
-                            ),
-                            {ok, State#state{children = Children}}
-                    end;
+                    handle_restart_strategy(Child, State);
                 false ->
                     Children = lists:keydelete(Pid, #child.pid, State#state.children),
                     {ok, State#state{children = Children}}
             end
     end.
 
+handle_restart_strategy(
+    #child{id = Id} = Child, #state{restart_strategy = one_for_one} = State
+) ->
+    case try_start(Child) of
+        {ok, NewPid, _Result} ->
+            NewChild = Child#child{pid = NewPid},
+            Children = lists:keyreplace(
+                Id, #child.id, State#state.children, NewChild
+            ),
+            {ok, State#state{children = Children}}
+    end;
+handle_restart_strategy(
+    #child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State
+) ->
+    Children = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{
+        pid = {restarting, Pid}
+    }),
+    case terminate_one_for_all_restart(Children) of
+        {ok, NewChildren} ->
+            ok;
+        {ok, NewChildren, WaitExit} ->
+            ok = loop_wait_termination(WaitExit)
+    end,
+    {ok, State#state{children = NewChildren}, restart_all_children}.
+
 should_restart(_Reason, permanent) ->
     true;
 should_restart(_Reason, temporary) ->
 
@@ -367,6 +410,81 @@ try_start(#child{start = {M, F, Args}} = Record) ->
             {error, {{'EXIT', Error}, Record}}
     end.
 
+terminate_one_for_all_restart(Children) ->
+    %% Always shut down last child first
+    do_terminate_one_for_all_restart(lists:reverse(Children), [], []).
+
+do_terminate_one_for_all_restart([], NewChildren, []) ->
+    %% Do not reverse the list here, it was reversed before being accumulated
+    %% and is now in correct startup order.
+    {ok, NewChildren};
+do_terminate_one_for_all_restart([], NewChildren, WaitExit) ->
+    %% Do not reverse the list here, it was reversed before being accumulated
+    %% and is now in correct startup order.
+    {ok, NewChildren, WaitExit};
+do_terminate_one_for_all_restart([Child | Children], NewChildren, WaitExit) ->
+    case Child of
+        #child{restart = {terminating, temporary, From}} = Child when is_pid(From) ->
+            do_terminate(Child),
+            case verify_shutdown(Child) of
+                true ->
+                    do_terminate_one_for_all_restart(Children, NewChildren, WaitExit);
+                false ->
+                    do_terminate_one_for_all_restart(Children, NewChildren, [Child | WaitExit])
+            end;
+        #child{restart = {terminating, _Restart, From}} = Child when is_pid(From) ->
+            do_terminate(Child),
+            case verify_shutdown(Child) of
+                true ->
+                    do_terminate_one_for_all_restart(Children, [Child#child{pid = undefined} | NewChildren], WaitExit);
+                false ->
+                    do_terminate_one_for_all_restart(Children, [Child | NewChildren], [Child | WaitExit])
+            end;
+        #child{pid = undefined, restart = temporary} = Child ->
+            do_terminate_one_for_all_restart(Children, NewChildren, WaitExit);
+        #child{pid = undefined} = Child ->
+            do_terminate_one_for_all_restart(Children, [Child | NewChildren], WaitExit);
+        #child{pid = {restarting, _Pid}} = Child ->
+            do_terminate_one_for_all_restart(Children, [Child | NewChildren], WaitExit);
+        #child{pid = Pid, restart = temporary} = Child when is_pid(Pid) ->
+            do_terminate(Child),
+            case verify_shutdown(Child) of
+                true ->
+                    do_terminate_one_for_all_restart(Children, NewChildren, WaitExit);
+                false ->
+                    do_terminate_one_for_all_restart(Children, NewChildren, [Child | WaitExit])
+            end;
+        #child{pid = Pid} = Child when is_pid(Pid) ->
+            do_terminate(Child),
+            case verify_shutdown(Child) of
+                true ->
+                    do_terminate_one_for_all_restart(Children, [Child#child{pid = {restarting, Pid}} | NewChildren], WaitExit);
+                false ->
+                    do_terminate_one_for_all_restart(Children, [Child#child{pid = {restarting, Pid}} | NewChildren], [Child | WaitExit])
+            end
+    end.
+
+verify_shutdown(#child{pid = Pid, shutdown = brutal_kill} = _Child) ->
+    receive
+        {'EXIT', Pid, _Reason} ->
+            true
+    after 100 ->
+        false
+    end;
+verify_shutdown(#child{pid = Pid, shutdown = Timeout} = _Child) ->
+    receive
+        {'EXIT', Pid, _Reason} ->
+            true
+    after Timeout ->
+        exit(Pid, kill),
+        receive
+            {'EXIT', Pid, killed} ->
+                true
+        after 100 ->
+            false
+        end
+    end.
+
 child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
     Child =
         case Pid of
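The `{restarting, Pid}` placeholder introduced above also changes what `supervisor:start_child/2` reports while a `one_for_all` restart is in flight: the new clause replies `{error, {already_started, restarting}}`. A hedged sketch of how calling code might handle the possible replies; the function name `add_child/2` is illustrative only, and the success reply is assumed to be the usual `{ok, Pid}` shape:

    add_child(Sup, ChildSpec) ->
        case supervisor:start_child(Sup, ChildSpec) of
            {error, already_present} ->
                %% The spec is known but its child is not currently running.
                already_present;
            {error, {already_started, restarting}} ->
                %% The child is being brought back as part of a one_for_all restart sweep.
                restarting;
            {error, {already_started, Pid}} when is_pid(Pid) ->
                {already_started, Pid};
            {ok, Pid} when is_pid(Pid) ->
                {started, Pid};
            Other ->
                Other
        end.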

tests/libs/estdlib/test_supervisor.erl

Lines changed: 95 additions & 1 deletion
@@ -35,6 +35,7 @@ test() ->
     ok = test_terminate_timeout(),
     ok = test_which_children(),
     ok = test_count_children(),
+    ok = test_one_for_all(),
     ok.
 
 test_basic_supervisor() ->
 
@@ -292,7 +293,35 @@ init({test_supervisor_order, Parent}) ->
     ],
     {ok, {{one_for_one, 10000, 3600}, ChildSpecs}};
 init({test_no_child, _Parent}) ->
-    {ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}}.
+    {ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}};
+init({test_one_for_all, Parent}) ->
+    ChildSpecs = [
+        #{
+            id => ping_pong_1,
+            start => {ping_pong_server, start_link, [Parent]},
+            restart => permanent,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [ping_pong_server]
+        },
+        #{
+            id => ping_pong_2,
+            start => {ping_pong_server, start_link, [Parent]},
+            restart => transient,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [ping_pong_server]
+        },
+        #{
+            id => ready_0,
+            start => {notify_init_server, start_link, [{Parent, ready_0}]},
+            restart => temporary,
+            shutdown => brutal_kill,
+            type => worker,
+            modules => [notify_init_server]
+        }
+    ],
+    {ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}.
 
 test_supervisor_order() ->
     {ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
 
@@ -346,3 +375,68 @@ test_which_children() ->
     unlink(SupPid),
     exit(SupPid, shutdown),
     ok.
+
+test_one_for_all() ->
+    {ok, SupPid} = supervisor:start_link({local, allforone}, ?MODULE, {test_one_for_all, self()}),
+    % Collect startup message from permanent ping_pong_server
+    Server_1 = get_and_test_server(),
+    % Collect startup message from transient ping_pong_server
+    Server_2 = get_and_test_server(),
+    % Collect startup message from temporary notify_init_server
+    ready_0 =
+        receive
+            Msg1 -> Msg1
+        after 1000 -> error({timeout, {start, ready_0}})
+        end,
+
+    [{specs, 3}, {active, 3}, {supervisors, 0}, {workers, 3}] = supervisor:count_children(SupPid),
+
+    %% Monitor transient Server_2 to make sure it is stopped, and restarted when
+    %% permanent Server_1 is shutdown.
+    MonRef = monitor(process, Server_2),
+    ok = gen_server:call(Server_1, {stop, test_crash}),
+    %% Server_2 should exit before the first child is restarted, but exit messages from
+    %% monitored processes may take some time to be received so we may get the message
+    %% from the first restarted child first.
+    First =
+        receive
+            {'DOWN', MonRef, process, Server_2, killed} ->
+                down;
+            {ping_pong_server_ready, Restart1} when is_pid(Restart1) ->
+                ready
+        after 1000 ->
+            error({timeout, restart_after_crash})
+        end,
+    ok =
+        case First of
+            down ->
+                receive
+                    {ping_pong_server_ready, Restart_1} when is_pid(Restart_1) -> ok
+                after 1000 ->
+                    error({timeout, restart_after_crash})
+                end;
+            ready ->
+                receive
+                    {'DOWN', MonRef, process, Server_2, killed} -> ok
+                after 1000 ->
+                    error({timeout, restart_after_crash})
+                end
+        end,
+
+    demonitor(MonRef),
+
+    % Collect startup message from restarted transient ping_pong_server child
+    _Restart_2 = get_and_test_server(),
+    % Make sure temporary notify_init_server is not restarted
+    no_start =
+        receive
+            ready_0 -> error({error, restarted_temporary})
+        after 1000 -> no_start
+        end,
+
+    % Ensure correct number of children
+    [{specs, 2}, {active, 2}, {supervisors, 0}, {workers, 2}] = supervisor:count_children(SupPid),
+
+    unlink(SupPid),
+    exit(SupPid, shutdown),
+    ok.
