Skip to content

Commit 0acfe6b

Browse files
committed
Add support for supervisor one_for_all strategy
Adds support for handling restarts for the `one_for_all` strategy that was documented, but lacked implementation. Makes necessary changes to ensure children are always restarted in the same order they were originally started, and shutdown in reverse order with last child first, conforming to OTP behavior. Closes #1855 Signed-off-by: Winford <[email protected]>
1 parent 39b9dbc commit 0acfe6b

File tree

3 files changed

+251
-25
lines changed

3 files changed

+251
-25
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ instead `badarg`.
9292
- Fixed a bug where empty atom could not be created on some platforms, thus breaking receiving a message for a registered process from an OTP node.
9393
- Fix a memory leak in distribution when a BEAM node would monitor a process by name.
9494
- Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX
95+
- Added missing support for supervisor `one_for_all` strategy.
9596

9697
## [0.6.7] - Unreleased
9798

libs/estdlib/src/supervisor.erl

Lines changed: 155 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -83,18 +83,20 @@
8383
}.
8484

8585
-record(child, {
86-
pid = undefined,
86+
pid = undefined :: pid() | undefined | {restarting, pid()} | {restarting, undefined},
8787
id :: any(),
8888
start :: {module(), atom(), [any()] | undefined},
8989
restart :: restart(),
9090
shutdown :: shutdown(),
9191
type :: child_type(),
9292
modules = [] :: [module()] | dynamic
9393
}).
94-
-record(state, {restart_strategy :: strategy(), children = [] :: [#child{}]}).
94+
%% note: the list of children should always be kept in order, with first to start at the head.
95+
-record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}).
9596

9697
start_link(Module, Args) ->
9798
gen_server:start_link(?MODULE, {Module, Args}, []).
99+
98100
start_link(SupName, Module, Args) ->
99101
gen_server:start_link(SupName, ?MODULE, {Module, Args}, []).
100102

@@ -172,7 +174,7 @@ child_spec_to_record(#{id := ChildId, start := MFA} = ChildMap) ->
172174
init_state([ChildSpec | T], State) ->
173175
Child = child_spec_to_record(ChildSpec),
174176
NewChildren = [Child | State#state.children],
175-
init_state(T, #state{children = NewChildren});
177+
init_state(T, State#state{children = NewChildren});
176178
init_state([], State) ->
177179
State#state{children = lists:reverse(State#state.children)}.
178180

@@ -182,21 +184,26 @@ start_children([Child | T], StartedC) ->
182184
start_children(T, [Child#child{pid = Pid} | StartedC])
183185
end;
184186
start_children([], StartedC) ->
185-
StartedC.
187+
%% We should always keep the start list in order for later one_for_all restarts.
188+
lists:reverse(StartedC).
186189

187190
handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State) ->
188191
Child = child_spec_to_record(ChildSpec),
189192
#child{id = ID} = Child,
190193
case lists:keyfind(ID, #child.id, State#state.children) of
191194
#child{pid = undefined} ->
192195
{reply, {error, already_present}, State};
196+
#child{pid = {restarting, _Pid}} ->
197+
{reply, {error, restarting}, State};
193198
#child{pid = Pid} ->
194199
{reply, {error, {already_started, Pid}}, State};
195200
false ->
196201
case try_start(Child) of
197202
{ok, Pid, Result} ->
198203
UpdatedChild = Child#child{pid = Pid},
199-
{reply, Result, State#state{children = [UpdatedChild | Children]}};
204+
%% The last child to start should always be at the end of the child
205+
%% start list.
206+
{reply, Result, State#state{children = Children ++ [UpdatedChild]}};
200207
{error, _Reason} = ErrorT ->
201208
{reply, ErrorT, State}
202209
end
@@ -226,6 +233,8 @@ handle_call({restart_child, ID}, _From, #state{children = Children} = State) ->
226233
{error, _Reason} = ErrorT ->
227234
{reply, ErrorT, State}
228235
end;
236+
#child{pid = {restarting, _}} = Child ->
237+
{reply, {error, restarting}, State};
229238
#child{} ->
230239
{reply, {error, running}, State};
231240
false ->
@@ -236,6 +245,8 @@ handle_call({delete_child, ID}, _From, #state{children = Children} = State) ->
236245
#child{pid = undefined} ->
237246
NewChildren = lists:keydelete(ID, #child.id, Children),
238247
{reply, ok, State#state{children = NewChildren}};
248+
#child{pid = {restarting, _}} = Child ->
249+
{reply, {error, restarting}, State};
239250
#child{} ->
240251
{reply, {error, running}, State};
241252
false ->
@@ -254,12 +265,7 @@ handle_cast(_Msg, State) ->
254265
{noreply, State}.
255266

256267
handle_info({'EXIT', Pid, Reason}, State) ->
257-
case handle_child_exit(Pid, Reason, State) of
258-
{ok, State1} ->
259-
{noreply, State1};
260-
{shutdown, State1} ->
261-
{stop, shutdown, State1}
262-
end;
268+
handle_child_exit(Pid, Reason, State);
263269
handle_info({ensure_killed, Pid}, State) ->
264270
case lists:keyfind(Pid, #child.pid, State#state.children) of
265271
false ->
@@ -268,13 +274,28 @@ handle_info({ensure_killed, Pid}, State) ->
268274
exit(Pid, kill),
269275
{noreply, State}
270276
end;
277+
handle_info({restart_many_children, []}, State) ->
278+
{noreply, State};
279+
handle_info({restart_many_children, [#child{pid = Pid} = Child | Children]}, State) ->
280+
case try_start(Child) of
281+
{ok, NewPid, _Result} ->
282+
NewChild = Child#child{pid = NewPid},
283+
NewChildren = lists:keyreplace(
284+
Pid, #child.pid, State#state.children, NewChild
285+
),
286+
{noreply, State#state{children = NewChildren},
287+
{timeout, 0, {restart_many_children, Children}}};
288+
{error, Reason} ->
289+
handle_child_exit(Pid, Reason, State)
290+
end;
271291
handle_info(_Msg, State) ->
272292
%TODO: log unexpected message
273293
{noreply, State}.
274294

275295
%% @hidden
276296
terminate(_Reason, #state{children = Children} = State) ->
277-
RemainingChildren = loop_terminate(Children, []),
297+
%% Shutdown children last to first.
298+
RemainingChildren = loop_terminate(lists:reverse(Children), []),
278299
loop_wait_termination(RemainingChildren),
279300
{ok, State}.
280301

@@ -284,34 +305,59 @@ terminate(_Reason, #state{children = Children} = State) ->
284305
handle_child_exit(Pid, Reason, State) ->
285306
case lists:keyfind(Pid, #child.pid, State#state.children) of
286307
false ->
287-
{ok, State};
308+
{noreply, State};
288309
#child{restart = {terminating, temporary, From}} ->
289310
gen_server:reply(From, ok),
290311
NewChildren = lists:keydelete(Pid, #child.pid, State#state.children),
291-
{ok, State#state{children = NewChildren}};
312+
{noreply, State#state{children = NewChildren}};
292313
#child{restart = {terminating, Restart, From}} = Child ->
293314
gen_server:reply(From, ok),
294315
NewChildren = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{
295316
pid = undefined, restart = Restart
296317
}),
297-
{ok, State#state{children = NewChildren}};
318+
{noreply, State#state{children = NewChildren}};
298319
#child{} = Child ->
299320
case should_restart(Reason, Child#child.restart) of
300321
true ->
301-
case try_start(Child) of
302-
{ok, NewPid, _Result} ->
303-
NewChild = Child#child{pid = NewPid},
304-
Children = lists:keyreplace(
305-
Pid, #child.pid, State#state.children, NewChild
306-
),
307-
{ok, State#state{children = Children}}
308-
end;
322+
handle_restart_strategy(Child, State);
309323
false ->
310324
Children = lists:keydelete(Pid, #child.pid, State#state.children),
311-
{ok, State#state{children = Children}}
325+
{noreply, State#state{children = Children}}
312326
end
313327
end.
314328

329+
handle_restart_strategy(
330+
#child{id = Id} = Child, #state{restart_strategy = one_for_one} = State
331+
) ->
332+
case try_start(Child) of
333+
{ok, NewPid, _Result} ->
334+
NewChild = Child#child{pid = NewPid},
335+
Children = lists:keyreplace(
336+
Id, #child.id, State#state.children, NewChild
337+
),
338+
{noreply, State#state{children = Children}}
339+
end;
340+
handle_restart_strategy(
341+
#child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State
342+
) ->
343+
Children =
344+
case Pid of
345+
{restarting, _} ->
346+
State#state.children;
347+
Pid when is_pid(Pid) ->
348+
lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{
349+
pid = {restarting, Pid}
350+
})
351+
end,
352+
case terminate_one_for_all_restart(Children) of
353+
{ok, NewChildren} ->
354+
ok;
355+
{ok, NewChildren, WaitExit} ->
356+
ok = loop_wait_termination(WaitExit)
357+
end,
358+
{noreply, State#state{children = NewChildren},
359+
{timeout, 0, {restart_many_children, NewChildren}}}.
360+
315361
should_restart(_Reason, permanent) ->
316362
true;
317363
should_restart(_Reason, temporary) ->
@@ -367,6 +413,91 @@ try_start(#child{start = {M, F, Args}} = Record) ->
367413
{error, {{'EXIT', Error}, Record}}
368414
end.
369415

416+
terminate_one_for_all_restart(Children) ->
417+
%% Always shut down last child first
418+
do_terminate_one_for_all_restart(lists:reverse(Children), [], []).
419+
420+
do_terminate_one_for_all_restart([], NewChildren, []) ->
421+
%% Do not reverse the list here, it was reversed before being accumulated
422+
%% and is now in correct startup order.
423+
{ok, NewChildren};
424+
do_terminate_one_for_all_restart([], NewChildren, WaitExit) ->
425+
%% Do not reverse the list here, it was reversed before being accumulated
426+
%% and is now in correct startup order.
427+
{ok, NewChildren, WaitExit};
428+
do_terminate_one_for_all_restart([Child | Children], NewChildren, WaitExit) ->
429+
case Child of
430+
#child{restart = {terminating, temporary, From}} = Child when is_pid(From) ->
431+
do_terminate(Child),
432+
case verify_shutdown(Child) of
433+
true ->
434+
do_terminate_one_for_all_restart(Children, NewChildren, WaitExit);
435+
false ->
436+
do_terminate_one_for_all_restart(Children, NewChildren, [Child | WaitExit])
437+
end;
438+
#child{restart = {terminating, _Restart, From}} = Child when is_pid(From) ->
439+
do_terminate(Child),
440+
case verify_shutdown(Child) of
441+
true ->
442+
do_terminate_one_for_all_restart(
443+
Children, [Child#child{pid = undefined} | NewChildren], WaitExit
444+
);
445+
false ->
446+
do_terminate_one_for_all_restart(Children, [Child | NewChildren], [
447+
Child | WaitExit
448+
])
449+
end;
450+
#child{pid = undefined, restart = temporary} = Child ->
451+
do_terminate_one_for_all_restart(Children, NewChildren, WaitExit);
452+
#child{pid = undefined} = Child ->
453+
do_terminate_one_for_all_restart(Children, [Child | NewChildren], WaitExit);
454+
#child{pid = {restarting, _Pid}} = Child ->
455+
do_terminate_one_for_all_restart(Children, [Child | NewChildren], WaitExit);
456+
#child{pid = Pid, restart = temporary} = Child when is_pid(Pid) ->
457+
do_terminate(Child),
458+
case verify_shutdown(Child) of
459+
true ->
460+
do_terminate_one_for_all_restart(Children, NewChildren, WaitExit);
461+
false ->
462+
do_terminate_one_for_all_restart(Children, NewChildren, [Child | WaitExit])
463+
end;
464+
#child{pid = Pid} = Child when is_pid(Pid) ->
465+
do_terminate(Child),
466+
case verify_shutdown(Child) of
467+
true ->
468+
do_terminate_one_for_all_restart(
469+
Children, [Child#child{pid = {restarting, Pid}} | NewChildren], WaitExit
470+
);
471+
false ->
472+
do_terminate_one_for_all_restart(
473+
Children, [Child#child{pid = {restarting, Pid}} | NewChildren], [
474+
Child | WaitExit
475+
]
476+
)
477+
end
478+
end.
479+
480+
verify_shutdown(#child{pid = Pid, shutdown = brutal_kill} = _Child) ->
481+
receive
482+
{'EXIT', Pid, _Reason} ->
483+
true
484+
after 100 ->
485+
false
486+
end;
487+
verify_shutdown(#child{pid = Pid, shutdown = Timeout} = _Child) ->
488+
receive
489+
{'EXIT', Pid, _Reason} ->
490+
true
491+
after Timeout ->
492+
exit(Pid, kill),
493+
receive
494+
{'EXIT', Pid, killed} ->
495+
true
496+
after 100 ->
497+
false
498+
end
499+
end.
500+
370501
child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) ->
371502
Child =
372503
case Pid of

0 commit comments

Comments
 (0)