@@ -432,8 +432,7 @@ start_cluster(System, ServerConfigs)
432432 {error , cluster_not_formed }.
433433start_cluster (System , [#{cluster_name := ClusterName } | _ ] = ServerConfigs ,
434434 Timeout ) when is_atom (System ) ->
435- {Started , NotStarted } =
436- ra_lib :partition_parallel (
435+ case ra_lib :partition_parallel (
437436 fun (C ) ->
438437 case start_server (System , C ) of
439438 ok -> true ;
@@ -442,39 +441,46 @@ start_cluster(System, [#{cluster_name := ClusterName} | _] = ServerConfigs,
442441 [C , Err ]),
443442 false
444443 end
445- end , ServerConfigs ),
446- case Started of
447- [] ->
448- ? ERR (" ra: failed to form a new cluster ~w . "
449- " No servers were successfully started." ,
450- [ClusterName ]),
451- {error , cluster_not_formed };
452- _ ->
444+ end , ServerConfigs ) of
445+ {ok , Started , NotStarted } ->
446+ case Started of
447+ [] ->
448+ ? ERR (" ra: failed to form a new cluster ~w . "
449+ " No servers were successfully started." ,
450+ [ClusterName ]),
451+ {error , cluster_not_formed };
452+ _ ->
453+ StartedIds = sort_by_local ([I || #{id := I } <- Started ], []),
454+ NotStartedIds = [I || #{id := I } <- NotStarted ],
455+ % % try triggering elections until one succeeds
456+ % % TODO: handle case where no election was successfully triggered
457+ {value , TriggeredId } = lists :search (fun (N ) ->
458+ ok == trigger_election (N )
459+ end , StartedIds ),
460+ % % the triggered id is likely to become the leader so try that first
461+ case members (TriggeredId ,
462+ length (ServerConfigs ) * Timeout ) of
463+ {ok , _ , Leader } ->
464+ ? INFO (" ra: started cluster ~ts with ~b servers. "
465+ " ~b servers failed to start: ~w . Leader: ~w " ,
466+ [ClusterName , length (ServerConfigs ),
467+ length (NotStarted ), NotStartedIds ,
468+ Leader ]),
469+ % we have a functioning cluster
470+ {ok , StartedIds , NotStartedIds };
471+ Err ->
472+ ? WARN (" ra: failed to form new cluster ~w . "
473+ " Error: ~w " , [ClusterName , Err ]),
474+ _ = [force_delete_server (System , N ) || N <- StartedIds ],
475+ % we do not have a functioning cluster
476+ {error , cluster_not_formed }
477+ end
478+ end ;
479+ {error , {partition_parallel_timeout , Started , _ }} ->
453480 StartedIds = sort_by_local ([I || #{id := I } <- Started ], []),
454- NotStartedIds = [I || #{id := I } <- NotStarted ],
455- % % try triggering elections until one succeeds
456- % % TODO: handle case where no election was successfully triggered
457- {value , TriggeredId } = lists :search (fun (N ) ->
458- ok == trigger_election (N )
459- end , StartedIds ),
460- % % the triggered id is likely to become the leader so try that first
461- case members (TriggeredId ,
462- length (ServerConfigs ) * Timeout ) of
463- {ok , _ , Leader } ->
464- ? INFO (" ra: started cluster ~ts with ~b servers. "
465- " ~b servers failed to start: ~w . Leader: ~w " ,
466- [ClusterName , length (ServerConfigs ),
467- length (NotStarted ), NotStartedIds ,
468- Leader ]),
469- % we have a functioning cluster
470- {ok , StartedIds , NotStartedIds };
471- Err ->
472- ? WARN (" ra: failed to form new cluster ~w . "
473- " Error: ~w " , [ClusterName , Err ]),
474- _ = [force_delete_server (System , N ) || N <- StartedIds ],
475- % we do not have a functioning cluster
476- {error , cluster_not_formed }
477- end
481+ ? WARN (" ra: a member of cluster ~w failed to start within the expected time interval (~w )" , [ClusterName , Timeout ]),
482+ _ = [force_delete_server (System , N ) || N <- StartedIds ],
483+ {error , cluster_not_formed }
478484 end .
479485
480486% % @doc Starts an individual ra server of a cluster.
0 commit comments