Skip to content

Commit a7bb66e

Browse files
committed
Add support for supervisor one_for_all strategy
Adds support for handling restarts for the `one_for_all` strategy, which was documented but lacked an implementation. Makes the necessary changes to ensure children are always restarted in the same order they were originally started, and shut down in reverse order (last-started child first), conforming to OTP behavior. Closes #1855. Signed-off-by: Winford <[email protected]>
1 parent 39b9dbc commit a7bb66e

File tree

3 files changed

+206
-24
lines changed

3 files changed

+206
-24
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line number · Diff line number · Diff line change
@@ -92,6 +92,7 @@ instead `badarg`.
9292
- Fixed a bug where empty atom could not be created on some platforms, thus breaking receiving a message for a registered process from an OTP node.
9393
- Fix a memory leak in distribution when a BEAM node would monitor a process by name.
9494
- Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
95+
- Added missing support for supervisor `one_for_all` strategy.
9596

9697
## [0.6.7] - Unreleased
9798

libs/estdlib/src/supervisor.erl

Lines changed: 110 additions & 23 deletions
Original file line number · Diff line number · Diff line change
@@ -83,18 +83,20 @@
8383
}.
8484

8585
-record(child, {
86-
pid = undefined,
86+
pid = undefined :: pid() | undefined | {restarting, pid()} | {restarting, undefined},
8787
id :: any(),
8888
start :: {module(), atom(), [any()] | undefined},
8989
restart :: restart(),
9090
shutdown :: shutdown(),
9191
type :: child_type(),
9292
modules = [] :: [module()] | dynamic
9393
}).
94-
-record(state, {restart_strategy :: strategy(), children = [] :: [#child{}]}).
94+
%% note: the list of children should always be kept in order, with last to start at the head.
95+
-record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}).
9596

9697
start_link(Module, Args) ->
9798
gen_server:start_link(?MODULE, {Module, Args}, []).
99+
98100
start_link(SupName, Module, Args) ->
99101
gen_server:start_link(SupName, ?MODULE, {Module, Args}, []).
100102

@@ -172,7 +174,7 @@ child_spec_to_record(#{id := ChildId, start := MFA} = ChildMap) ->
172174
init_state([ChildSpec | T], State) ->
173175
Child = child_spec_to_record(ChildSpec),
174176
NewChildren = [Child | State#state.children],
175-
init_state(T, #state{children = NewChildren});
177+
init_state(T, State#state{children = NewChildren});
176178
init_state([], State) ->
177179
State#state{children = lists:reverse(State#state.children)}.
178180

@@ -190,12 +192,16 @@ handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State
190192
case lists:keyfind(ID, #child.id, State#state.children) of
191193
#child{pid = undefined} ->
192194
{reply, {error, already_present}, State};
195+
#child{pid = {restarting, _Pid}} ->
196+
{reply, {error, restarting}, State};
193197
#child{pid = Pid} ->
194198
{reply, {error, {already_started, Pid}}, State};
195199
false ->
196200
case try_start(Child) of
197201
{ok, Pid, Result} ->
198202
UpdatedChild = Child#child{pid = Pid},
203+
%% The last child to start should always be at the head of the child
204+
%% start list.
199205
{reply, Result, State#state{children = [UpdatedChild | Children]}};
200206
{error, _Reason} = ErrorT ->
201207
{reply, ErrorT, State}
@@ -226,6 +232,8 @@ handle_call({restart_child, ID}, _From, #state{children = Children} = State) ->
226232
{error, _Reason} = ErrorT ->
227233
{reply, ErrorT, State}
228234
end;
235+
#child{pid = {restarting, _}} ->
236+
{reply, {error, restarting}, State};
229237
#child{} ->
230238
{reply, {error, running}, State};
231239
false ->
@@ -236,6 +244,8 @@ handle_call({delete_child, ID}, _From, #state{children = Children} = State) ->
236244
#child{pid = undefined} ->
237245
NewChildren = lists:keydelete(ID, #child.id, Children),
238246
{reply, ok, State#state{children = NewChildren}};
247+
#child{pid = {restarting, _}} ->
248+
{reply, {error, restarting}, State};
239249
#child{} ->
240250
{reply, {error, running}, State};
241251
false ->
@@ -254,12 +264,7 @@ handle_cast(_Msg, State) ->
254264
{noreply, State}.
255265

256266
handle_info({'EXIT', Pid, Reason}, State) ->
257-
case handle_child_exit(Pid, Reason, State) of
258-
{ok, State1} ->
259-
{noreply, State1};
260-
{shutdown, State1} ->
261-
{stop, shutdown, State1}
262-
end;
267+
handle_child_exit(Pid, Reason, State);
263268
handle_info({ensure_killed, Pid}, State) ->
264269
case lists:keyfind(Pid, #child.pid, State#state.children) of
265270
false ->
@@ -268,12 +273,31 @@ handle_info({ensure_killed, Pid}, State) ->
268273
exit(Pid, kill),
269274
{noreply, State}
270275
end;
276+
handle_info({restart_many_children, []}, State) ->
277+
{noreply, State};
278+
handle_info(
279+
{restart_many_children, [#child{pid = {restarting, _Pid0} = Pid} = Child | Children]}, State
280+
) ->
281+
case try_start(Child) of
282+
{ok, NewPid, _Result} ->
283+
NewChild = Child#child{pid = NewPid},
284+
NewChildren = lists:keyreplace(
285+
Pid, #child.pid, State#state.children, NewChild
286+
),
287+
{noreply, State#state{children = NewChildren},
288+
{timeout, 0, {restart_many_children, Children}}};
289+
{error, Reason} ->
290+
handle_child_exit(Pid, Reason, State)
291+
end;
292+
handle_info({restart_many_children, [#child{pid = undefined} = _Child | Children]}, State) ->
293+
{noreply, State, {timeout, 0, {restart_many_children, Children}}};
271294
handle_info(_Msg, State) ->
272295
%TODO: log unexpected message
273296
{noreply, State}.
274297

275298
%% @hidden
276299
terminate(_Reason, #state{children = Children} = State) ->
300+
%% Shutdown children last to first.
277301
RemainingChildren = loop_terminate(Children, []),
278302
loop_wait_termination(RemainingChildren),
279303
{ok, State}.
@@ -284,34 +308,56 @@ terminate(_Reason, #state{children = Children} = State) ->
284308
handle_child_exit(Pid, Reason, State) ->
285309
case lists:keyfind(Pid, #child.pid, State#state.children) of
286310
false ->
287-
{ok, State};
311+
{noreply, State};
288312
#child{restart = {terminating, temporary, From}} ->
289313
gen_server:reply(From, ok),
290314
NewChildren = lists:keydelete(Pid, #child.pid, State#state.children),
291-
{ok, State#state{children = NewChildren}};
315+
{noreply, State#state{children = NewChildren}};
292316
#child{restart = {terminating, Restart, From}} = Child ->
293317
gen_server:reply(From, ok),
294318
NewChildren = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{
295319
pid = undefined, restart = Restart
296320
}),
297-
{ok, State#state{children = NewChildren}};
321+
{noreply, State#state{children = NewChildren}};
298322
#child{} = Child ->
299323
case should_restart(Reason, Child#child.restart) of
300324
true ->
301-
case try_start(Child) of
302-
{ok, NewPid, _Result} ->
303-
NewChild = Child#child{pid = NewPid},
304-
Children = lists:keyreplace(
305-
Pid, #child.pid, State#state.children, NewChild
306-
),
307-
{ok, State#state{children = Children}}
308-
end;
325+
handle_restart_strategy(Child, State);
309326
false ->
310327
Children = lists:keydelete(Pid, #child.pid, State#state.children),
311-
{ok, State#state{children = Children}}
328+
{noreply, State#state{children = Children}}
312329
end
313330
end.
314331

332+
handle_restart_strategy(
333+
#child{id = Id} = Child, #state{restart_strategy = one_for_one} = State
334+
) ->
335+
case try_start(Child) of
336+
{ok, NewPid, _Result} ->
337+
NewChild = Child#child{pid = NewPid},
338+
Children = lists:keyreplace(
339+
Id, #child.id, State#state.children, NewChild
340+
),
341+
{noreply, State#state{children = Children}}
342+
end;
343+
handle_restart_strategy(
344+
#child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State
345+
) ->
346+
Children =
347+
case Pid of
348+
{restarting, _} ->
349+
State#state.children;
350+
Pid when is_pid(Pid) ->
351+
lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{
352+
pid = {restarting, Pid}
353+
})
354+
end,
355+
ok = terminate_one_for_all(Children),
356+
{ok, NewChildren} = get_restart_children(Children),
357+
%% NewChildren is startup order (first at head) and needs to be reversed to keep Children in correct order in #state{}
358+
{noreply, State#state{children = lists:reverse(NewChildren)},
359+
{timeout, 0, {restart_many_children, NewChildren}}}.
360+
315361
should_restart(_Reason, permanent) ->
316362
true;
317363
should_restart(_Reason, temporary) ->
@@ -341,8 +387,7 @@ loop_wait_termination(RemainingChildren0) ->
341387
case lists:member(Pid, RemainingChildren0) of
342388
true ->
343389
exit(Pid, kill),
344-
RemainingChildren1 = lists:delete(Pid, RemainingChildren0),
345-
loop_wait_termination(RemainingChildren1);
390+
loop_wait_termination(RemainingChildren0);
346391
false ->
347392
loop_wait_termination(RemainingChildren0)
348393
end
@@ -367,6 +412,48 @@ try_start(#child{start = {M, F, Args}} = Record) ->
367412
{error, {{'EXIT', Error}, Record}}
368413
end.
369414

415+
get_restart_children(Children) ->
416+
get_restart_children(Children, []).
417+
418+
get_restart_children([], NewChildren) ->
419+
{ok, NewChildren};
420+
get_restart_children([Child | Children], NewChildren) ->
421+
case Child of
422+
#child{restart = {terminating, temporary, _From}} ->
423+
get_restart_children(Children, NewChildren);
424+
#child{restart = {terminating, _Restart, _From}} ->
425+
get_restart_children(Children, [Child | NewChildren]);
426+
#child{pid = undefined, restart = temporary} ->
427+
get_restart_children(Children, NewChildren);
428+
#child{pid = undefined} = Child ->
429+
get_restart_children(Children, [Child | NewChildren]);
430+
#child{pid = {restarting, _Pid}} = Child ->
431+
get_restart_children(Children, [Child | NewChildren]);
432+
#child{pid = Pid, restart = temporary} = Child when is_pid(Pid) ->
433+
get_restart_children(Children, NewChildren);
434+
#child{pid = Pid} = Child when is_pid(Pid) ->
435+
get_restart_children(Children, [Child#child{pid = {restarting, Pid}} | NewChildren])
436+
end.
437+
438+
terminate_one_for_all(Children) ->
439+
%% Always shut down last child first
440+
do_terminate_one_for_all(Children, []).
441+
442+
do_terminate_one_for_all([], StopPids) ->
443+
ok = loop_wait_termination(StopPids),
444+
%% All running children have been terminated and their exits awaited.
445+
ok;
446+
do_terminate_one_for_all([Child | Children], StopPids) ->
447+
case Child of
448+
#child{pid = Pid} = Child when is_pid(Pid) ->
449+
do_terminate(Child),
450+
do_terminate_one_for_all(Children, [Pid | StopPids]);
451+
#child{pid = undefined} ->
452+
do_terminate_one_for_all(Children, StopPids);
453+
#child{pid = {restarting, _Pid}} = Child ->
454+
do_terminate_one_for_all(Children, StopPids)
455+
end.
456+
370457
child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
371458
Child =
372459
case Pid of

tests/libs/estdlib/test_supervisor.erl

Lines changed: 95 additions & 1 deletion
Original file line number · Diff line number · Diff line change
@@ -35,6 +35,7 @@ test() ->
3535
ok = test_terminate_timeout(),
3636
ok = test_which_children(),
3737
ok = test_count_children(),
38+
ok = test_one_for_all(),
3839
ok.
3940

4041
test_basic_supervisor() ->
@@ -292,7 +293,35 @@ init({test_supervisor_order, Parent}) ->
292293
],
293294
{ok, {{one_for_one, 10000, 3600}, ChildSpecs}};
294295
init({test_no_child, _Parent}) ->
295-
{ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}}.
296+
{ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}};
297+
init({test_one_for_all, Parent}) ->
298+
ChildSpecs = [
299+
#{
300+
id => ping_pong_1,
301+
start => {ping_pong_server, start_link, [Parent]},
302+
restart => permanent,
303+
shutdown => brutal_kill,
304+
type => worker,
305+
modules => [ping_pong_server]
306+
},
307+
#{
308+
id => ping_pong_2,
309+
start => {ping_pong_server, start_link, [Parent]},
310+
restart => transient,
311+
shutdown => brutal_kill,
312+
type => worker,
313+
modules => [ping_pong_server]
314+
},
315+
#{
316+
id => ready_0,
317+
start => {notify_init_server, start_link, [{Parent, ready_0}]},
318+
restart => temporary,
319+
shutdown => brutal_kill,
320+
type => worker,
321+
modules => [notify_init_server]
322+
}
323+
],
324+
{ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}.
296325

297326
test_supervisor_order() ->
298327
{ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}),
@@ -346,3 +375,68 @@ test_which_children() ->
346375
unlink(SupPid),
347376
exit(SupPid, shutdown),
348377
ok.
378+
379+
test_one_for_all() ->
380+
{ok, SupPid} = supervisor:start_link({local, allforone}, ?MODULE, {test_one_for_all, self()}),
381+
% Collect startup message from permanent ping_pong_server
382+
Server_1 = get_and_test_server(),
383+
% Collect startup message from transient ping_pong_server
384+
Server_2 = get_and_test_server(),
385+
% Collect startup message from temporary notify_init_server
386+
ready_0 =
387+
receive
388+
Msg1 -> Msg1
389+
after 1000 -> error({timeout, {start, ready_0}})
390+
end,
391+
392+
[{specs, 3}, {active, 3}, {supervisors, 0}, {workers, 3}] = supervisor:count_children(SupPid),
393+
394+
%% Monitor transient Server_2 to make sure it is stopped, and restarted when
395+
%% permanent Server_1 is shutdown.
396+
MonRef = monitor(process, Server_2),
397+
ok = gen_server:call(Server_1, {stop, test_crash}),
398+
%% Server_2 should exit before the first child is restarted, but exit messages from
399+
%% monitored processes may take some time to be received so we may get the message
400+
%% from the first restarted child first.
401+
First =
402+
receive
403+
{'DOWN', MonRef, process, Server_2, killed} ->
404+
down;
405+
{ping_pong_server_ready, Restart1} when is_pid(Restart1) ->
406+
ready
407+
after 1000 ->
408+
error({timeout, restart_after_crash})
409+
end,
410+
ok =
411+
case First of
412+
down ->
413+
receive
414+
{ping_pong_server_ready, Restart_1} when is_pid(Restart_1) -> ok
415+
after 1000 ->
416+
error({timeout, restart_after_crash})
417+
end;
418+
ready ->
419+
receive
420+
{'DOWN', MonRef, process, Server_2, killed} -> ok
421+
after 1000 ->
422+
error({timeout, restart_after_crash})
423+
end
424+
end,
425+
426+
demonitor(MonRef),
427+
428+
% Collect startup message from restarted transient ping_pong_server child
429+
_Restart_2 = get_and_test_server(),
430+
% Make sure temporary notify_init_server is not restarted
431+
no_start =
432+
receive
433+
ready_0 -> error({error, restarted_temporary})
434+
after 1000 -> no_start
435+
end,
436+
437+
% Ensure correct number of children
438+
[{specs, 2}, {active, 2}, {supervisors, 0}, {workers, 2}] = supervisor:count_children(SupPid),
439+
440+
unlink(SupPid),
441+
exit(SupPid, shutdown),
442+
ok.

0 commit comments

Comments
 (0)