4545import com .google .common .collect .ImmutableSet ;
4646import org .apache .helix .ConfigAccessor ;
4747import org .apache .helix .HelixException ;
48+ import org .apache .helix .PropertyKey ;
4849import org .apache .helix .manager .zk .ZKHelixDataAccessor ;
4950import org .apache .helix .model .CurrentState ;
5051import org .apache .helix .model .ExternalView ;
@@ -98,6 +99,8 @@ public class MaintenanceManagementService {
9899 // maintain the backward compatibility with users who don't use MaintenanceManagementServiceBuilder
99100 // to create the MaintenanceManagementService object.
100101 private List <HealthCheck > _skipStoppableHealthCheckList = Collections .emptyList ();
102+ // When true, custom checks are skipped for non-live instances; defaults to false for backward compatibility
103+ private boolean _skipCustomChecksIfNoLiveness = false ;
101104
102105 public MaintenanceManagementService (ZKHelixDataAccessor dataAccessor ,
103106 ConfigAccessor configAccessor , boolean skipZKRead , String namespace ) {
@@ -152,7 +155,7 @@ public MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
152155 private MaintenanceManagementService (ZKHelixDataAccessor dataAccessor ,
153156 ConfigAccessor configAccessor , CustomRestClient customRestClient , boolean skipZKRead ,
154157 Set <String > nonBlockingHealthChecks , Set <StoppableCheck .Category > skipHealthCheckCategories ,
155- List <HealthCheck > skipStoppableHealthCheckList , String namespace ) {
158+ List <HealthCheck > skipStoppableHealthCheckList , String namespace , boolean skipCustomChecksIfNoLiveness ) {
156159 _dataAccessor =
157160 new HelixDataAccessorWrapper (dataAccessor , customRestClient ,
158161 namespace );
@@ -166,6 +169,7 @@ private MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
166169 _skipStoppableHealthCheckList = skipStoppableHealthCheckList == null ? Collections .emptyList ()
167170 : skipStoppableHealthCheckList ;
168171 _namespace = namespace ;
172+ _skipCustomChecksIfNoLiveness = skipCustomChecksIfNoLiveness ;
169173 }
170174
171175 /**
@@ -502,15 +506,20 @@ private List<String> batchCustomInstanceStoppableCheck(String clusterId, List<St
502506 return instances ;
503507 }
504508
509+ // Skip performing a custom check on any dead instance if the user set _skipCustomChecksIfNoLiveness
510+ // to true.
511+ List <String > instanceIdsForCustomCheck = filterOutDeadInstancesIfNeeded (instances );
512+
505513 // If the config has exactUrl and the CLUSTER level customer check is not skipped, we will
506514 // perform the custom check at cluster level.
507515 if (restConfig .getCompleteConfiguredHealthUrl ().isPresent ()) {
508- if (_skipHealthCheckCategories .contains (StoppableCheck .Category .CUSTOM_AGGREGATED_CHECK )) {
516+ if (_skipHealthCheckCategories .contains (StoppableCheck .Category .CUSTOM_AGGREGATED_CHECK )
517+ || instanceIdsForCustomCheck .isEmpty ()) {
509518 return instances ;
510519 }
511520
512521 Map <String , StoppableCheck > clusterLevelCustomCheckResult =
513- performAggregatedCustomCheck (clusterId , instances ,
522+ performAggregatedCustomCheck (clusterId , instanceIdsForCustomCheck ,
514523 restConfig .getCompleteConfiguredHealthUrl ().get (), customPayLoads ,
515524 toBeStoppedInstances );
516525 List <String > instancesForNextCheck = new ArrayList <>();
@@ -526,7 +535,7 @@ private List<String> batchCustomInstanceStoppableCheck(String clusterId, List<St
526535
527536 // Reaching here means the rest config requires instances/partition level checks. We will
528537 // perform the custom check at instance/partition level if they are not skipped.
529- List <String > instancesForCustomPartitionLevelChecks = instances ;
538+ List <String > instancesForCustomPartitionLevelChecks = instanceIdsForCustomCheck ;
530539 if (!_skipHealthCheckCategories .contains (StoppableCheck .Category .CUSTOM_INSTANCE_CHECK )) {
531540 Map <String , Future <StoppableCheck >> customInstanceLevelChecks = instances .stream ().collect (
532541 Collectors .toMap (Function .identity (), instance -> POOL .submit (
@@ -560,6 +569,42 @@ private List<String> batchCustomInstanceStoppableCheck(String clusterId, List<St
560569 return instancesForCustomPartitionLevelChecks ;
561570 }
562571
572+ /**
573+ * Helper Methods
574+ * <p>
575+ * If users set skipCustomCheckIfInstanceNotAlive to true, filter out dead instances
576+ * to avoid running custom checks on them.
577+ *
578+ * @param instanceIds the list of instances
579+ * @return either the original list or a filtered list of only live instances
580+ */
581+ private List <String > filterOutDeadInstancesIfNeeded (List <String > instanceIds ) {
582+ if (!_skipCustomChecksIfNoLiveness ) {
583+ // We are not skipping the not-alive check, so just return all instances.
584+ return instanceIds ;
585+ }
586+
587+ // Retrieve the set of currently live instances
588+ PropertyKey .Builder keyBuilder = _dataAccessor .keyBuilder ();
589+ List <String > liveNodes = _dataAccessor .getChildNames (keyBuilder .liveInstances ());
590+
591+ // Filter out instances that are not in the live list
592+ List <String > filtered = new ArrayList <>();
593+ List <String > skipped = new ArrayList <>();
594+ for (String instanceId : instanceIds ) {
595+ if (liveNodes .contains (instanceId )) {
596+ filtered .add (instanceId );
597+ } else {
598+ skipped .add (instanceId );
599+ }
600+ }
601+
602+ if (!skipped .isEmpty ()) {
603+ LOG .info ("Skipping any custom checks for instances due to liveness: {}" , skipped );
604+ }
605+ return filtered ;
606+ }
607+
563608 private Map <String , MaintenanceManagementInstanceInfo > batchInstanceHealthCheck (String clusterId ,
564609 List <String > instances , List <String > healthChecks , Map <String , String > healthCheckConfig ) {
565610 List <String > instancesForNext = new ArrayList <>(instances );
@@ -890,6 +935,7 @@ private void addMinActiveReplicaChecks(String clusterId, Map<String, Future<Stop
890935 public static class MaintenanceManagementServiceBuilder {
891936 private ConfigAccessor _configAccessor ;
892937 private boolean _skipZKRead ;
938+ private boolean _skipCustomChecksIfNoLiveness = false ;
893939 private String _namespace ;
894940 private ZKHelixDataAccessor _dataAccessor ;
895941 private CustomRestClient _customRestClient ;
@@ -942,11 +988,17 @@ public MaintenanceManagementServiceBuilder setSkipStoppableHealthCheckList(
942988 return this ;
943989 }
944990
991+ public MaintenanceManagementServiceBuilder setSkipCustomChecksIfNoLiveness (
992+ boolean skipCustomChecksIfNoLiveness ) {
993+ _skipCustomChecksIfNoLiveness = skipCustomChecksIfNoLiveness ;
994+ return this ;
995+ }
996+
945997 public MaintenanceManagementService build () {
946998 validate ();
947999 return new MaintenanceManagementService (_dataAccessor , _configAccessor , _customRestClient ,
9481000 _skipZKRead , _nonBlockingHealthChecks , _skipHealthCheckCategories ,
949- _skipStoppableHealthCheckList , _namespace );
1001+ _skipStoppableHealthCheckList , _namespace , _skipCustomChecksIfNoLiveness );
9501002 }
9511003
9521004 private void validate () throws IllegalArgumentException {
0 commit comments