@@ -258,8 +258,12 @@ func (s *storagedFailover) checkPodsAfterRestart(nc *v1alpha1.NebulaCluster) ([]
258258
259259 // Get pod information
260260 pod , err := s .clientSet .Pod ().GetPod (nc .Namespace , podName )
261- if err != nil && ! apierrors .IsNotFound (err ) {
262- return nil , err
261+ if err != nil {
262+ if ! apierrors .IsNotFound (err ) {
263+ return nil , err
264+ } else {
265+ continue // Skip pod if already terminated but in the process of starting.
266+ }
263267 }
264268
265269 // Wait if the pod is terminating
@@ -456,7 +460,7 @@ func (s *storagedFailover) balanceStorageLeader(nc *v1alpha1.NebulaCluster) erro
456460 return nil
457461}
458462
459- // check if there are more than 2 failure hosts in the same part
463+ // check if more than half of the peers (len(peers)/2) of the same part are failure hosts
460464func (s * storagedFailover ) hasMultipleFailuresInSamePart (nc * v1alpha1.NebulaCluster , failureHosts []string ) (bool , error ) {
461465 options , err := nebula .ClientOptions (nc , nebula .SetIsMeta (true ))
462466 if err != nil {
@@ -511,7 +515,7 @@ func (s *storagedFailover) hasMultipleFailuresInSamePart(nc *v1alpha1.NebulaClus
511515 peerSet := sets .NewString (peers ... )
512516 interSection := peerSet .Intersection (failureHostsSet )
513517
514- if interSection .Len () >= 2 {
518+ if interSection .Len () > len ( peers ) / 2 {
515519 if atomic .CompareAndSwapInt32 (& foundResult , 0 , 1 ) {
516520 klog .Infof ("space %d part %d has more than 2 failure hosts: %v" ,
517521 spaceID , part .PartID , interSection .List ())
0 commit comments