@@ -235,7 +235,12 @@ restart_child(Pid, Reason, #state{restart_strategy = one_for_one} = State) ->
235235 Children = lists :keyreplace (
236236 Pid , # child .pid , State1 # state .children , NewChild
237237 ),
238- {ok , State1 # state {children = Children }}
238+ {ok , State1 # state {children = Children }};
239+ {error , _ } ->
240+ erlang :send_after (
241+ 50 , self (), {try_again_restart , Child # child .id }
242+ ),
243+ {ok , State1 }
239244 end ;
240245 {shutdown , State1 } ->
241246 RemainingChildren = lists :keydelete (
@@ -271,8 +276,9 @@ restart_child(Pid, Reason, #state{restart_strategy = one_for_all} = State) ->
271276 case restart_many_children (Child , Siblings ) of
272277 {ok , NewChildren } ->
273278 {ok , State1 # state {children = NewChildren }};
274- _Error ->
275- {shutdown , State1 }
279+ {ok , NewChildren , RetryID } ->
280+ erlang :send_after (50 , self (), {try_again_restart , RetryID }),
281+ {ok , State1 # state {children = NewChildren }}
276282 end ;
277283 {shutdown , State1 } ->
278284 RemainingChildren = lists :keydelete (
@@ -380,6 +386,26 @@ handle_info({ensure_killed, Pid}, State) ->
380386 exit (Pid , kill ),
381387 {noreply , State }
382388 end ;
%% Deferred restart attempt for the child identified by `Id', triggered by a
%% `{try_again_restart, Id}' message scheduled with `erlang:send_after/3'.
%%
%% * If no child with that id is present in the state any more, the message is
%%   ignored and no restart is recorded.
%% * Otherwise the attempt is registered through `add_restart/1'. When that
%%   returns `{shutdown, _}' the child is removed and the server stops with
%%   reason `shutdown'.
%% * On a successful start the child's pid is replaced in the state.
%% * On a failed start another attempt is scheduled 50 ms later.
handle_info({try_again_restart, Id}, State) ->
    case lists:keyfind(Id, #child.id, State#state.children) of
        false ->
            %% Nothing to restart: the child is not tracked by this state.
            {noreply, State};
        Child ->
            case add_restart(State) of
                {ok, State1} ->
                    case try_start(Child) of
                        {ok, NewPid, _Result} ->
                            %% The key position is the record field index `#child.id',
                            %% and the update is applied to State1 so the restart
                            %% recorded by add_restart/1 is kept.
                            UpdatedChildren = lists:keyreplace(
                                Id,
                                #child.id,
                                State1#state.children,
                                Child#child{pid = NewPid}
                            ),
                            {noreply, State1#state{children = UpdatedChildren}};
                        {error, _} ->
                            erlang:send_after(50, self(), {try_again_restart, Id}),
                            {noreply, State1}
                    end;
                {shutdown, State1} ->
                    RemainingChildren = lists:keydelete(
                        Id, #child.id, State1#state.children
                    ),
                    {stop, shutdown, State1#state{children = RemainingChildren}}
            end
    end;
383409handle_info (_Msg , State ) ->
384410 % TODO: log unexpected message
385411 {noreply , State }.
@@ -488,7 +514,7 @@ trim_expired_restarts(_Threshold, Restarts) ->
%% Restarts `Child' together with the other entries of `Children'.
%%
%% The failed child is taken out of the list (matched by pid), re-tagged with
%% `{restarting, Pid}' and used to seed the accumulator handed to
%% `terminate_many_children/2'. The resulting list is then passed to
%% `do_restart_children/3' with empty "started" and "retry" accumulators, and
%% that function's result is returned unchanged.
restart_many_children(#child{pid = FailedPid} = Failed, Children) ->
    Others = lists:keydelete(FailedPid, #child.pid, Children),
    Restarting = Failed#child{pid = {restarting, FailedPid}},
    {ok, Stopped} = terminate_many_children(Others, [Restarting]),
    do_restart_children(Stopped, [], []).
492518
493519terminate_many_children ([], NewChildren ) ->
494520 {ok , lists :reverse (NewChildren )};
@@ -510,26 +536,49 @@ terminate_many_children([Child | Children], NewChildren) ->
510536 ])
511537 end .
512538
%% Starts, in list order, every child whose pid is tagged `{restarting, _}'.
%% Children with any other pid value are carried over untouched.
%%
%% Arguments:
%%   Children      - child records still to be processed
%%   NewChildren   - accumulator of processed children (reverse order)
%%   RetryChildren - accumulator of children whose start failed (reverse order)
%%
%% Returns:
%%   {ok, Children}          - every start succeeded, or several failed and
%%                             their retries were already scheduled through
%%                             differed_try_again/1
%%   {ok, Children, RetryId} - exactly one child failed; the caller is expected
%%                             to schedule the `try_again_restart' for RetryId
%%
%% A child that fails to start is kept in the returned list (still tagged
%% `{restarting, _}') so that the later `try_again_restart' request can look
%% it up by id.
do_restart_children([], NewChildren, []) ->
    {ok, lists:reverse(NewChildren)};
do_restart_children([], NewChildren, [RetryChild]) ->
    {ok, lists:reverse(NewChildren), RetryChild#child.id};
do_restart_children([], NewChildren, RetryChildren) ->
    ok = differed_try_again(RetryChildren),
    {ok, lists:reverse(NewChildren)};
do_restart_children(
    [#child{pid = {restarting, _}} = Child | Children], NewChildren, RetryChildren
) ->
    case try_start(Child) of
        %% Also covers the `{ok, undefined, {ok, undefined}}' result.
        {ok, Pid1, {ok, Pid1}} ->
            do_restart_children(
                Children, [Child#child{pid = Pid1} | NewChildren], RetryChildren
            );
        {ok, Pid1, {ok, Pid1, _Result}} ->
            do_restart_children(
                Children, [Child#child{pid = Pid1} | NewChildren], RetryChildren
            );
        {error, _} ->
            do_restart_children(
                Children, [Child | NewChildren], [Child | RetryChildren]
            )
    end;
do_restart_children([#child{} = Child | Children], NewChildren, RetryChildren) ->
    %% retain previous ignore children without starting them
    do_restart_children(Children, [Child | NewChildren], RetryChildren).
532572
%% Schedules one `try_again_restart' message per child that failed to restart
%% on the first attempt. The list arrives accumulated, i.e. in reverse start
%% order: its head is scheduled after 50 ms times the list length and each
%% following entry 50 ms earlier, so children that should start last wait the
%% longest. Always returns `ok'.
differed_try_again(RetryChildren) ->
    lists:foldl(
        fun(#child{id = Id}, Remaining) ->
            erlang:send_after(50 * Remaining, self(), {try_again_restart, Id}),
            Remaining - 1
        end,
        length(RetryChildren),
        RetryChildren
    ),
    ok.
581+
533582child_to_info (# child {id = Id , pid = Pid , type = Type , modules = Modules }) ->
534583 Child =
535584 case Pid of
0 commit comments