Skip to content

Commit e6adf5b

Browse files
authored
Exclude Custom Checks When Instance Is Not Alive (#2993)
Exclude Custom Checks When Instance Is Not Alive
1 parent 0f95ef3 commit e6adf5b

File tree

3 files changed

+112
-6
lines changed

3 files changed

+112
-6
lines changed

helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/MaintenanceManagementService.java

Lines changed: 57 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,7 @@
4545
import com.google.common.collect.ImmutableSet;
4646
import org.apache.helix.ConfigAccessor;
4747
import org.apache.helix.HelixException;
48+
import org.apache.helix.PropertyKey;
4849
import org.apache.helix.manager.zk.ZKHelixDataAccessor;
4950
import org.apache.helix.model.CurrentState;
5051
import org.apache.helix.model.ExternalView;
@@ -98,6 +99,8 @@ public class MaintenanceManagementService {
9899
// maintain the backward compatibility with users who don't use MaintenanceManagementServiceBuilder
99100
// to create the MaintenanceManagementService object.
100101
private List<HealthCheck> _skipStoppableHealthCheckList = Collections.emptyList();
102+
// default value false to maintain backward compatibility
103+
private boolean _skipCustomChecksIfNoLiveness = false;
101104

102105
public MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
103106
ConfigAccessor configAccessor, boolean skipZKRead, String namespace) {
@@ -152,7 +155,7 @@ public MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
152155
private MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
153156
ConfigAccessor configAccessor, CustomRestClient customRestClient, boolean skipZKRead,
154157
Set<String> nonBlockingHealthChecks, Set<StoppableCheck.Category> skipHealthCheckCategories,
155-
List<HealthCheck> skipStoppableHealthCheckList, String namespace) {
158+
List<HealthCheck> skipStoppableHealthCheckList, String namespace, boolean skipCustomChecksIfNoLiveness) {
156159
_dataAccessor =
157160
new HelixDataAccessorWrapper(dataAccessor, customRestClient,
158161
namespace);
@@ -166,6 +169,7 @@ private MaintenanceManagementService(ZKHelixDataAccessor dataAccessor,
166169
_skipStoppableHealthCheckList = skipStoppableHealthCheckList == null ? Collections.emptyList()
167170
: skipStoppableHealthCheckList;
168171
_namespace = namespace;
172+
_skipCustomChecksIfNoLiveness = skipCustomChecksIfNoLiveness;
169173
}
170174

171175
/**
@@ -502,15 +506,20 @@ private List<String> batchCustomInstanceStoppableCheck(String clusterId, List<St
502506
return instances;
503507
}
504508

509+
// Skip performing a custom check on any dead instance if the user set _skipCustomChecksIfNoLiveness
510+
// to true.
511+
List<String> instanceIdsForCustomCheck = filterOutDeadInstancesIfNeeded(instances);
512+
505513
// If the config has exactUrl and the CLUSTER level custom check is not skipped, we will
506514
// perform the custom check at cluster level.
507515
if (restConfig.getCompleteConfiguredHealthUrl().isPresent()) {
508-
if (_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_AGGREGATED_CHECK)) {
516+
if (_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_AGGREGATED_CHECK)
517+
|| instanceIdsForCustomCheck.isEmpty()) {
509518
return instances;
510519
}
511520

512521
Map<String, StoppableCheck> clusterLevelCustomCheckResult =
513-
performAggregatedCustomCheck(clusterId, instances,
522+
performAggregatedCustomCheck(clusterId, instanceIdsForCustomCheck,
514523
restConfig.getCompleteConfiguredHealthUrl().get(), customPayLoads,
515524
toBeStoppedInstances);
516525
List<String> instancesForNextCheck = new ArrayList<>();
@@ -526,7 +535,7 @@ private List<String> batchCustomInstanceStoppableCheck(String clusterId, List<St
526535

527536
// Reaching here means the rest config requires instances/partition level checks. We will
528537
// perform the custom check at instance/partition level if they are not skipped.
529-
List<String> instancesForCustomPartitionLevelChecks = instances;
538+
List<String> instancesForCustomPartitionLevelChecks = instanceIdsForCustomCheck;
530539
if (!_skipHealthCheckCategories.contains(StoppableCheck.Category.CUSTOM_INSTANCE_CHECK)) {
531540
Map<String, Future<StoppableCheck>> customInstanceLevelChecks = instances.stream().collect(
532541
Collectors.toMap(Function.identity(), instance -> POOL.submit(
@@ -560,6 +569,42 @@ private List<String> batchCustomInstanceStoppableCheck(String clusterId, List<St
560569
return instancesForCustomPartitionLevelChecks;
561570
}
562571

572+
/**
573+
* Helper Methods
574+
* <p>
575+
* If users set skipCustomCheckIfInstanceNotAlive to true, filter out dead instances
576+
* to avoid running custom checks on them.
577+
*
578+
* @param instanceIds the list of instances
579+
* @return either the original list or a filtered list of only live instances
580+
*/
581+
private List<String> filterOutDeadInstancesIfNeeded(List<String> instanceIds) {
582+
if (!_skipCustomChecksIfNoLiveness) {
583+
// We are not skipping the not-alive check, so just return all instances.
584+
return instanceIds;
585+
}
586+
587+
// Retrieve the set of currently live instances
588+
PropertyKey.Builder keyBuilder = _dataAccessor.keyBuilder();
589+
List<String> liveNodes = _dataAccessor.getChildNames(keyBuilder.liveInstances());
590+
591+
// Filter out instances that are not in the live list
592+
List<String> filtered = new ArrayList<>();
593+
List<String> skipped = new ArrayList<>();
594+
for (String instanceId : instanceIds) {
595+
if (liveNodes.contains(instanceId)) {
596+
filtered.add(instanceId);
597+
} else {
598+
skipped.add(instanceId);
599+
}
600+
}
601+
602+
if (!skipped.isEmpty()) {
603+
LOG.info("Skipping any custom checks for instances due to liveness: {}", skipped);
604+
}
605+
return filtered;
606+
}
607+
563608
private Map<String, MaintenanceManagementInstanceInfo> batchInstanceHealthCheck(String clusterId,
564609
List<String> instances, List<String> healthChecks, Map<String, String> healthCheckConfig) {
565610
List<String> instancesForNext = new ArrayList<>(instances);
@@ -890,6 +935,7 @@ private void addMinActiveReplicaChecks(String clusterId, Map<String, Future<Stop
890935
public static class MaintenanceManagementServiceBuilder {
891936
private ConfigAccessor _configAccessor;
892937
private boolean _skipZKRead;
938+
private boolean _skipCustomChecksIfNoLiveness = false;
893939
private String _namespace;
894940
private ZKHelixDataAccessor _dataAccessor;
895941
private CustomRestClient _customRestClient;
@@ -942,11 +988,17 @@ public MaintenanceManagementServiceBuilder setSkipStoppableHealthCheckList(
942988
return this;
943989
}
944990

991+
public MaintenanceManagementServiceBuilder setSkipCustomChecksIfNoLiveness(
992+
boolean skipCustomChecksIfNoLiveness) {
993+
_skipCustomChecksIfNoLiveness = skipCustomChecksIfNoLiveness;
994+
return this;
995+
}
996+
945997
public MaintenanceManagementService build() {
946998
validate();
947999
return new MaintenanceManagementService(_dataAccessor, _configAccessor, _customRestClient,
9481000
_skipZKRead, _nonBlockingHealthChecks, _skipHealthCheckCategories,
949-
_skipStoppableHealthCheckList, _namespace);
1001+
_skipStoppableHealthCheckList, _namespace, _skipCustomChecksIfNoLiveness);
9501002
}
9511003

9521004
private void validate() throws IllegalArgumentException {

helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,7 @@ public enum InstancesProperties {
7373
online,
7474
disabled,
7575
selection_base,
76+
skip_custom_check_if_instance_not_alive,
7677
zone_order,
7778
to_be_stopped_instances,
7879
skip_stoppable_check_list,
@@ -296,6 +297,13 @@ private Response batchGetStoppableInstances(String clusterId, JsonNode node, boo
296297
}
297298
}
298299

300+
boolean skipCustomChecksIfNoLiveness = false;
301+
if (node.get(InstancesProperties.skip_custom_check_if_instance_not_alive.name()) != null) {
302+
skipCustomChecksIfNoLiveness = node.get(
303+
InstancesAccessor.InstancesProperties.skip_custom_check_if_instance_not_alive.name())
304+
.asBoolean();
305+
}
306+
299307
ClusterTopology clusterTopology = clusterService.getClusterTopology(clusterId);
300308
if (selectionBase != InstanceHealthSelectionBase.non_zone_based) {
301309
if (!clusterService.isClusterTopologyAware(clusterId)) {
@@ -335,6 +343,7 @@ private Response batchGetStoppableInstances(String clusterId, JsonNode node, boo
335343
.setSkipHealthCheckCategories(skipHealthCheckCategories)
336344
.setNamespace(namespace)
337345
.setSkipStoppableHealthCheckList(skipStoppableCheckList)
346+
.setSkipCustomChecksIfNoLiveness(skipCustomChecksIfNoLiveness)
338347
.build();
339348

340349
StoppableInstancesSelector stoppableInstancesSelector =

helix-rest/src/test/java/org/apache/helix/rest/server/TestInstancesAccessor.java

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@
3434
import com.fasterxml.jackson.databind.JsonNode;
3535
import com.google.common.collect.ImmutableMap;
3636
import com.google.common.collect.ImmutableSet;
37+
import org.apache.helix.ConfigAccessor;
3738
import org.apache.helix.TestHelper;
3839
import org.apache.helix.constants.InstanceConstants;
3940
import org.apache.helix.model.ClusterConfig;
@@ -248,7 +249,51 @@ public void testInstanceStoppableCrossZoneBasedWithSelectedCheckList() throws IO
248249
System.out.println("End test :" + TestHelper.getTestMethodName());
249250
}
250251

251-
@Test(dependsOnMethods = "testInstanceStoppableCrossZoneBasedWithSelectedCheckList")
252+
@Test(dependsOnMethods = "testCrossZoneStoppableWithoutZoneOrder")
253+
public void testSkipCustomChecksIfInstanceNotAlive() throws JsonProcessingException {
254+
System.out.println("Start test :" + TestHelper.getTestMethodName());
255+
256+
// Instance 4 and 5 in stoppable cluster 1 are not alive
257+
String content = String.format(
258+
"{\"%s\":\"%s\",\"%s\":[\"%s\",\"%s\", \"%s\"], \"%s\":[\"%s\", \"%s\", \"%s\"], \"%s"
259+
+ "\": \"%b\"}",
260+
InstancesAccessor.InstancesProperties.selection_base.name(),
261+
InstancesAccessor.InstanceHealthSelectionBase.cross_zone_based.name(),
262+
InstancesAccessor.InstancesProperties.instances.name(), "instance4", "instance5", "invalidInstance",
263+
InstancesAccessor.InstancesProperties.skip_stoppable_check_list.name(), "INSTANCE_NOT_ALIVE", "EMPTY_RESOURCE_ASSIGNMENT", "INSTANCE_NOT_STABLE",
264+
InstancesAccessor.InstancesProperties.skip_custom_check_if_instance_not_alive.name(), true);
265+
266+
// Set the dummy custom checks for the cluster. The custom checks should be skipped.
267+
ConfigAccessor configAccessor = new ConfigAccessor(ZK_ADDR);
268+
Assert.assertNull(configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
269+
RESTConfig restConfig = new RESTConfig(STOPPABLE_CLUSTER);
270+
restConfig.set(RESTConfig.SimpleFields.CUSTOMIZED_HEALTH_URL, "TEST_URL");
271+
configAccessor.setRESTConfig(STOPPABLE_CLUSTER, restConfig);
272+
Assert.assertEquals(restConfig, configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
273+
274+
// Even if we don't skip custom stoppable checks, the instance is not alive so it should be stoppable
275+
Response response = new JerseyUriRequestBuilder(
276+
"clusters/{}/instances?command=stoppable").format(
277+
STOPPABLE_CLUSTER).post(this, Entity.entity(content, MediaType.APPLICATION_JSON_TYPE));
278+
JsonNode jsonNode = OBJECT_MAPPER.readTree(response.readEntity(String.class));
279+
Set<String> stoppableSet = getStringSet(jsonNode,
280+
InstancesAccessor.InstancesProperties.instance_stoppable_parallel.name());
281+
Assert.assertTrue(stoppableSet.contains("instance4"));
282+
Assert.assertTrue(stoppableSet.contains("instance5"));
283+
JsonNode nonStoppableInstances = jsonNode.get(
284+
InstancesAccessor.InstancesProperties.instance_not_stoppable_with_reasons.name());
285+
286+
Assert.assertEquals(getStringSet(nonStoppableInstances, "invalidInstance"),
287+
ImmutableSet.of("HELIX:INSTANCE_NOT_EXIST"));
288+
289+
// After the test finishes, remove the dummy custom checks REST config
290+
configAccessor.deleteRESTConfig(STOPPABLE_CLUSTER);
291+
Assert.assertNull(configAccessor.getRESTConfig(STOPPABLE_CLUSTER));
292+
293+
System.out.println("End test :" + TestHelper.getTestMethodName());
294+
}
295+
296+
@Test(dependsOnMethods = "testSkipCustomChecksIfInstanceNotAlive")
252297
public void testInstanceStoppableCrossZoneBasedWithEvacuatingInstances() throws IOException {
253298
System.out.println("Start test :" + TestHelper.getTestMethodName());
254299
String content = String.format(

0 commit comments

Comments
 (0)