diff --git a/docs/changelog/136141.yaml b/docs/changelog/136141.yaml new file mode 100644 index 0000000000000..d568e85afe5b2 --- /dev/null +++ b/docs/changelog/136141.yaml @@ -0,0 +1,6 @@ +pr: 136141 +summary: Add settings for health indicator `shards_capacity` thresholds +area: Health +type: enhancement +issues: + - 116697 diff --git a/docs/reference/elasticsearch/configuration-reference/health-diagnostic-settings.md b/docs/reference/elasticsearch/configuration-reference/health-diagnostic-settings.md index eebec8399af52..16d146f537d32 100644 --- a/docs/reference/elasticsearch/configuration-reference/health-diagnostic-settings.md +++ b/docs/reference/elasticsearch/configuration-reference/health-diagnostic-settings.md @@ -47,4 +47,8 @@ The following are the *expert-level* settings available for configuring an inter `health.periodic_logger.poll_interval` : ([Dynamic](docs-content://deploy-manage/stack-settings.md#dynamic-cluster-setting), [time unit value](/reference/elasticsearch/rest-apis/api-conventions.md#time-units)) How often {{es}} logs the health status of the cluster and of each health indicator as observed by the Health API. Defaults to `60s` (60 seconds). +`health.shard_capacity.unhealthy_threshold.yellow` {applies_to}`stack: ga 9.3` +: ([Dynamic](docs-content://deploy-manage/stack-settings.md#dynamic-cluster-setting)) The minimum number of additional shards the cluster must still be able to allocate (on data or frozen nodes) for shard capacity health to remain `GREEN`. If fewer are available, health becomes `YELLOW`. Must be greater than `health.shard_capacity.unhealthy_threshold.red`. Defaults to `10`. +`health.shard_capacity.unhealthy_threshold.red` {applies_to}`stack: ga 9.3` +: ([Dynamic](docs-content://deploy-manage/stack-settings.md#dynamic-cluster-setting)) The minimum number of additional shards the cluster must still be able to allocate (on data or frozen nodes) for shard capacity health to stay out of `RED`. If fewer are available, health becomes `RED`.
Must be less than `health.shard_capacity.unhealthy_threshold.yellow`. Defaults to `5`. diff --git a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/health/30_feature.yml b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/health/30_feature.yml index 335d02421b0a1..b9a081e40187e 100644 --- a/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/health/30_feature.yml +++ b/rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/health/30_feature.yml @@ -23,3 +23,59 @@ - match: { indicators.master_is_stable.status: "green" } - match: { indicators.master_is_stable.symptom: "The cluster has a stable master node" } - is_false: indicators.master_is_stable.details +--- +"cluster health test for shard capacity settings": + - requires: + cluster_features: [ "health.shard_capacity.unhealthy_threshold_settings" ] + reason: "these relevant settings are added in 9.3" + - do: + health_report: + feature: shards_capacity + - is_true: cluster_name + - match: { indicators.shards_capacity.status: "green" } + + # set very large threshold to force the indicator to go yellow + - do: + cluster.put_settings: + body: + persistent: + health.shard_capacity.unhealthy_threshold.yellow: 100000000 + flat_settings: true + - match: { persistent: { health.shard_capacity.unhealthy_threshold.yellow: "100000000" } } + + - do: + health_report: + feature: shards_capacity + - is_true: cluster_name + - match: { indicators.shards_capacity.status: "yellow" } + + # set very large threshold to force the indicator to go red + - do: + cluster.put_settings: + body: + persistent: + health.shard_capacity.unhealthy_threshold.red: 90000000 + flat_settings: true + - match: { persistent: { health.shard_capacity.unhealthy_threshold.red: "90000000" } } + + - do: + health_report: + feature: shards_capacity + - is_true: cluster_name + - match: { indicators.shards_capacity.status: "red" } + + # set back to default + - do: + cluster.put_settings: + body: + persistent: + 
health.shard_capacity.unhealthy_threshold.yellow: 10 + health.shard_capacity.unhealthy_threshold.red: 5 + flat_settings: true + - match: { acknowledged: true } + + - do: + health_report: + feature: shards_capacity + - is_true: cluster_name + - match: { indicators.shards_capacity.status: "green" } diff --git a/server/src/main/java/module-info.java b/server/src/main/java/module-info.java index 2987b3849e663..b7696c0c8dce3 100644 --- a/server/src/main/java/module-info.java +++ b/server/src/main/java/module-info.java @@ -436,7 +436,8 @@ org.elasticsearch.search.retriever.RetrieversFeatures, org.elasticsearch.action.admin.cluster.stats.ClusterStatsFeatures, org.elasticsearch.ingest.IngestFeatures, - org.elasticsearch.action.admin.indices.resolve.ResolveIndexFeatures; + org.elasticsearch.action.admin.indices.resolve.ResolveIndexFeatures, + org.elasticsearch.health.HealthFeatures; uses org.elasticsearch.plugins.internal.SettingsExtension; uses RestExtension; diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index 7006b5adbe886..b2703fcb493f0 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -82,6 +82,7 @@ import org.elasticsearch.gateway.PersistedClusterStateService; import org.elasticsearch.health.HealthPeriodicLogger; import org.elasticsearch.health.node.LocalHealthMonitor; +import org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService; import org.elasticsearch.health.node.action.TransportHealthNodeAction; import org.elasticsearch.health.node.selection.HealthNodeTaskExecutor; import org.elasticsearch.http.HttpTransportSettings; @@ -650,6 +651,8 @@ public void apply(Settings value, Settings current, Settings previous) { WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_HIGH_UTILIZATION_THRESHOLD_SETTING, 
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_HIGH_UTILIZATION_DURATION_SETTING, WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_QUEUE_LATENCY_THRESHOLD_SETTING, - WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_REROUTE_INTERVAL_SETTING + WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_REROUTE_INTERVAL_SETTING, + ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW, + ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED ); } diff --git a/server/src/main/java/org/elasticsearch/health/HealthFeatures.java b/server/src/main/java/org/elasticsearch/health/HealthFeatures.java new file mode 100644 index 0000000000000..7af361bed6ea2 --- /dev/null +++ b/server/src/main/java/org/elasticsearch/health/HealthFeatures.java @@ -0,0 +1,25 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the "Elastic License + * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side + * Public License v 1"; you may not use this file except in compliance with, at + * your election, the "Elastic License 2.0", the "GNU Affero General Public + * License v3.0 only", or the "Server Side Public License, v 1". 
+ */ + +package org.elasticsearch.health; + +import org.elasticsearch.features.FeatureSpecification; +import org.elasticsearch.features.NodeFeature; + +import java.util.Set; + +import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_SETTINGS; + +public class HealthFeatures implements FeatureSpecification { + + @Override + public Set getTestFeatures() { + return Set.of(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_SETTINGS); + } +} diff --git a/server/src/main/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorService.java b/server/src/main/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorService.java index f9c0c2bf2a536..618b88de9817f 100644 --- a/server/src/main/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorService.java +++ b/server/src/main/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorService.java @@ -16,6 +16,8 @@ import org.elasticsearch.common.ReferenceDocs; import org.elasticsearch.common.TriFunction; import org.elasticsearch.common.settings.Setting; +import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.features.NodeFeature; import org.elasticsearch.health.Diagnosis; import org.elasticsearch.health.HealthIndicatorDetails; import org.elasticsearch.health.HealthIndicatorImpact; @@ -27,18 +29,24 @@ import org.elasticsearch.indices.ShardLimitValidator; import java.util.ArrayList; +import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; +import java.util.Locale; +import java.util.Map; /** * This indicator reports health data about the shard capacity across the cluster. * - *

+ * The indicator will report: - * * RED when there's room for less than 5 shards (either data or frozen nodes) - * * YELLOW when there's room for less than 10 shards (either data or frozen nodes) - * * GREEN otherwise - *

+ * * * Although the `max_shard_per_node(.frozen)?` information is scoped by Node, we use the information from master because there is where * the available room for new shards is checked before creating new indices. @@ -89,12 +97,96 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ "frozen" ); + public static final NodeFeature SHARD_CAPACITY_UNHEALTHY_THRESHOLD_SETTINGS = new NodeFeature( + "health.shard_capacity.unhealthy_threshold_settings" + ); + + public static final Setting SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW = Setting.intSetting( + "health.shard_capacity.unhealthy_threshold.yellow", + 10, + 1, + new Setting.Validator<>() { + @Override + public void validate(Integer value) {} + + @Override + public void validate(Integer value, Map, Object> settings) { + Integer redThreshold = (Integer) settings.get(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED); + if (value <= redThreshold) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Setting [%s] (%d) must be greater than [%s] (%d)", + SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), + value, + SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), + redThreshold + ) + ); + } + } + + @Override + public Iterator> settings() { + final List> settings = List.of(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED); + return settings.iterator(); + } + }, + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + + public static final Setting SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED = Setting.intSetting( + "health.shard_capacity.unhealthy_threshold.red", + 5, + 1, + new Setting.Validator<>() { + @Override + public void validate(Integer value) {} + + @Override + public void validate(Integer value, Map, Object> settings) { + Integer yellowThreshold = (Integer) settings.get(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW); + if (value >= yellowThreshold) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Setting [%s] (%d) must be less than [%s] (%d)", + 
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), + value, + SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), + yellowThreshold + ) + ); + } + } + + @Override + public Iterator> settings() { + final List> settings = List.of(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW); + return settings.iterator(); + } + }, + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + private final ClusterService clusterService; private final List shardLimitGroups; - public ShardsCapacityHealthIndicatorService(ClusterService clusterService) { + private volatile int unhealthyThresholdYellow; /* updated by the settings-update consumer registered in the constructor; volatile so calculate() observes the new value */ + private volatile int unhealthyThresholdRed; /* updated by the settings-update consumer registered in the constructor; volatile so calculate() observes the new value */ + + public ShardsCapacityHealthIndicatorService(ClusterService clusterService, Settings settings) { this.clusterService = clusterService; + this.unhealthyThresholdYellow = SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings); + this.unhealthyThresholdRed = SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings); this.shardLimitGroups = ShardLimitValidator.applicableLimitGroups(DiscoveryNode.isStateless(clusterService.getSettings())); + + clusterService.getClusterSettings() + .addSettingsUpdateConsumer(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW, this::setUnhealthyThresholdYellow); + clusterService.getClusterSettings() + .addSettingsUpdateConsumer(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED, this::setUnhealthyThresholdRed); } @Override @@ -117,7 +209,9 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources ShardLimitValidator.getShardLimitPerNode(limitGroup, shardLimitsMetadata), state.nodes(), state.metadata(), - limitGroup::checkShardLimit + limitGroup::checkShardLimit, + unhealthyThresholdYellow, + unhealthyThresholdRed ) ) .toList(); @@ -166,7 +260,9 @@ private HealthIndicatorResult mergeIndicators(boolean verbose, List results) { + static HealthIndicatorDetails buildDetails( + List results, + int unhealthyThresholdYellow, + int unhealthyThresholdRed + ) { return (builder, params) -> { builder.startObject(); for (var result : results) { @@
-202,6 +304,12 @@ static HealthIndicatorDetails buildDetails(List resu } builder.endObject(); } + { + builder.startObject("settings"); + builder.field(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), unhealthyThresholdYellow); + builder.field(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), unhealthyThresholdRed); + builder.endObject(); + } builder.endObject(); return builder; }; @@ -217,6 +325,14 @@ private HealthIndicatorResult unknownIndicator() { ); } + private void setUnhealthyThresholdYellow(int value) { + this.unhealthyThresholdYellow = value; + } + + private void setUnhealthyThresholdRed(int value) { + this.unhealthyThresholdRed = value; + } + private static String nodeTypeFroLimitGroup(ShardLimitValidator.LimitGroup limitGroup) { return switch (limitGroup) { case NORMAL -> "data"; diff --git a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java index 6c4331e97f609..fc40ed87da2bd 100644 --- a/server/src/main/java/org/elasticsearch/node/NodeConstruction.java +++ b/server/src/main/java/org/elasticsearch/node/NodeConstruction.java @@ -1473,7 +1473,7 @@ private Module loadDiagnosticServices( new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService), new RepositoryIntegrityHealthIndicatorService(clusterService), new DiskHealthIndicatorService(clusterService), - new ShardsCapacityHealthIndicatorService(clusterService), + new ShardsCapacityHealthIndicatorService(clusterService, settings), new FileSettingsHealthIndicatorService() ); var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class) diff --git a/server/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification b/server/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification index 42bf3c942daaf..24202416b22f6 100644 --- a/server/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification +++ 
b/server/src/main/resources/META-INF/services/org.elasticsearch.features.FeatureSpecification @@ -21,3 +21,4 @@ org.elasticsearch.cluster.routing.RoutingFeatures org.elasticsearch.action.admin.cluster.stats.ClusterStatsFeatures org.elasticsearch.ingest.IngestFeatures org.elasticsearch.action.admin.indices.resolve.ResolveIndexFeatures +org.elasticsearch.health.HealthFeatures diff --git a/server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceStatelessTests.java b/server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceStatelessTests.java index bc757b71b9b9a..de055a2fc451f 100644 --- a/server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceStatelessTests.java +++ b/server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceStatelessTests.java @@ -43,6 +43,7 @@ import java.util.Set; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_CREATION_DATE; +import static org.elasticsearch.common.settings.Settings.EMPTY; import static org.elasticsearch.health.HealthStatus.RED; import static org.elasticsearch.health.HealthStatus.YELLOW; import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE; @@ -81,7 +82,7 @@ public static void tearDownThreadPool() { public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException { int maxShardsPerNode = randomValidMaxShards(); var clusterService = createClusterService(maxShardsPerNode, 1, 1, createIndex(maxShardsPerNode / 4)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); assertEquals(HealthStatus.GREEN, indicatorResult.status()); assertTrue(indicatorResult.impacts().isEmpty()); @@ -94,7 +95,9 @@ public void 
testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException { "index", Map.of("max_shards_in_cluster", maxShardsPerNode), "search", - Map.of("max_shards_in_cluster", maxShardsPerNode) + Map.of("max_shards_in_cluster", maxShardsPerNode), + "settings", + Map.of("health.shard_capacity.unhealthy_threshold.yellow", 10, "health.shard_capacity.unhealthy_threshold.red", 5) ) ) ); @@ -112,7 +115,10 @@ public void doTestIndicatorYieldsYellowInCaseThereIsNotEnoughRoom(HealthStatus s { // Only index does not have enough space var clusterService = createClusterService(maxShardsPerNode, 1, 2, createIndex(indexNumShards)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), status); assertEquals( @@ -123,7 +129,10 @@ public void doTestIndicatorYieldsYellowInCaseThereIsNotEnoughRoom(HealthStatus s { // Only search does not have enough space var clusterService = createClusterService(maxShardsPerNode, 2, 1, createIndex(indexNumShards)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), status); assertEquals( @@ -134,7 +143,10 @@ public void doTestIndicatorYieldsYellowInCaseThereIsNotEnoughRoom(HealthStatus s { // Both data and frozen nodes does not have enough space var clusterService = createClusterService(maxShardsPerNode, 1, 1, createIndex(indexNumShards)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, 
EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), status); assertEquals( diff --git a/server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceTests.java b/server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceTests.java index ed46f6a50cafa..9d11bc9b6d46d 100644 --- a/server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceTests.java +++ b/server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceTests.java @@ -48,12 +48,15 @@ import java.util.stream.Stream; import static org.elasticsearch.cluster.metadata.IndexMetadata.SETTING_CREATION_DATE; +import static org.elasticsearch.common.settings.Settings.EMPTY; import static org.elasticsearch.health.HealthStatus.GREEN; import static org.elasticsearch.health.HealthStatus.RED; import static org.elasticsearch.health.HealthStatus.YELLOW; import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.RED_INDICATOR_IMPACTS; import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SHARDS_MAX_CAPACITY_REACHED_DATA_NODES; import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES; +import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED; +import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW; import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.YELLOW_INDICATOR_IMPACTS; import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.calculateFrom; import static org.elasticsearch.indices.ShardLimitValidator.FROZEN_GROUP; @@ -119,7 +122,7 @@ public void testNoShardsCapacityMetadata() throws IOException { createIndexInDataNode(100) ) ); - var target = new 
ShardsCapacityHealthIndicatorService(clusterService); + var target = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY); var indicatorResult = target.calculate(true, HealthInfo.EMPTY_HEALTH_INFO); assertEquals(indicatorResult.status(), HealthStatus.UNKNOWN); @@ -133,7 +136,7 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException { int maxShardsPerNode = randomValidMaxShards(); int maxShardsPerNodeFrozen = randomValidMaxShards(); var clusterService = createClusterService(maxShardsPerNode, maxShardsPerNodeFrozen, createIndexInDataNode(maxShardsPerNode / 4)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); assertEquals(indicatorResult.status(), HealthStatus.GREEN); assertTrue(indicatorResult.impacts().isEmpty()); @@ -146,7 +149,9 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException { "data", Map.of("max_shards_in_cluster", maxShardsPerNode), "frozen", - Map.of("max_shards_in_cluster", maxShardsPerNodeFrozen) + Map.of("max_shards_in_cluster", maxShardsPerNodeFrozen), + "settings", + Map.of("health.shard_capacity.unhealthy_threshold.yellow", 10, "health.shard_capacity.unhealthy_threshold.red", 5) ) ) ); @@ -181,7 +186,10 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep // Only data_nodes does not have enough space int maxShardsPerNodeFrozen = randomValidMaxShards(); var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(4)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), 
YELLOW); assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes."); @@ -195,7 +203,9 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep "data", Map.of("max_shards_in_cluster", 25, "current_used_shards", 8), "frozen", - Map.of("max_shards_in_cluster", maxShardsPerNodeFrozen) + Map.of("max_shards_in_cluster", maxShardsPerNodeFrozen), + "settings", + Map.of("health.shard_capacity.unhealthy_threshold.yellow", 10, "health.shard_capacity.unhealthy_threshold.red", 5) ) ) ); @@ -204,7 +214,10 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep // Only frozen_nodes does not have enough space int maxShardsPerNode = randomValidMaxShards(); var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(4)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), YELLOW); assertEquals( @@ -221,7 +234,9 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep "data", Map.of("max_shards_in_cluster", maxShardsPerNode), "frozen", - Map.of("max_shards_in_cluster", 25, "current_used_shards", 8) + Map.of("max_shards_in_cluster", 25, "current_used_shards", 8), + "settings", + Map.of("health.shard_capacity.unhealthy_threshold.yellow", 10, "health.shard_capacity.unhealthy_threshold.red", 5) ) ) ); @@ -229,7 +244,10 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep { // Both data and frozen nodes does not have enough space var clusterService = createClusterService(25, 25, createIndexInDataNode(4), createIndexInFrozenNode(4)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, 
HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), YELLOW); assertEquals( @@ -248,7 +266,9 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep "data", Map.of("max_shards_in_cluster", 25, "current_used_shards", 8), "frozen", - Map.of("max_shards_in_cluster", 25, "current_used_shards", 8) + Map.of("max_shards_in_cluster", 25, "current_used_shards", 8), + "settings", + Map.of("health.shard_capacity.unhealthy_threshold.yellow", 10, "health.shard_capacity.unhealthy_threshold.red", 5) ) ) ); @@ -260,7 +280,10 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio // Only data_nodes does not have enough space int maxShardsPerNodeFrozen = randomValidMaxShards(); var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), RED); assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes."); @@ -274,7 +297,9 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio "data", Map.of("max_shards_in_cluster", 25, "current_used_shards", 22), "frozen", - Map.of("max_shards_in_cluster", maxShardsPerNodeFrozen) + Map.of("max_shards_in_cluster", maxShardsPerNodeFrozen), + "settings", + Map.of("health.shard_capacity.unhealthy_threshold.yellow", 10, "health.shard_capacity.unhealthy_threshold.red", 5) ) ) ); @@ -283,7 +308,10 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio // Only frozen_nodes does not 
have enough space int maxShardsPerNode = randomValidMaxShards(); var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(11)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), RED); assertEquals( @@ -300,7 +328,9 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio "data", Map.of("max_shards_in_cluster", maxShardsPerNode), "frozen", - Map.of("max_shards_in_cluster", 25, "current_used_shards", 22) + Map.of("max_shards_in_cluster", 25, "current_used_shards", 22), + "settings", + Map.of("health.shard_capacity.unhealthy_threshold.yellow", 10, "health.shard_capacity.unhealthy_threshold.red", 5) ) ) ); @@ -308,7 +338,10 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio { // Both data and frozen nodes does not have enough space var clusterService = createClusterService(25, 25, createIndexInDataNode(11), createIndexInFrozenNode(11)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + true, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), RED); assertEquals( @@ -327,13 +360,84 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio "data", Map.of("max_shards_in_cluster", 25, "current_used_shards", 22), "frozen", - Map.of("max_shards_in_cluster", 25, "current_used_shards", 22) + Map.of("max_shards_in_cluster", 25, "current_used_shards", 22), + "settings", + Map.of("health.shard_capacity.unhealthy_threshold.yellow", 10, "health.shard_capacity.unhealthy_threshold.red", 5) ) ) ); } } + public void 
testUnhealthyThresholdSettings() { + { + // default values + Settings settings = Settings.builder().build(); + assertEquals(10, SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings).intValue()); + assertEquals(5, SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings).intValue()); + } + { + Integer randomYellowThreshold = randomIntBetween(2, Integer.MAX_VALUE); + Integer randomRedThreshold = randomYellowThreshold - 1; + Settings settings = Settings.builder() + .put(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), randomRedThreshold) + .put(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), randomYellowThreshold) + .build(); + assertEquals(randomYellowThreshold, SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings)); + assertEquals(randomRedThreshold, SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings)); + } + { + // invalid - same values + int threshold = randomIntBetween(1, Integer.MAX_VALUE); + Settings settings = Settings.builder() + .put(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), threshold) + .put(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), threshold) + .build(); + expectThrows( + IllegalArgumentException.class, + () -> ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings) + ); + expectThrows( + IllegalArgumentException.class, + () -> ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings) + ); + } + { + // invalid - yellow threshold is lower than red threshold + int randomYellowThreshold = randomIntBetween(1, Integer.MAX_VALUE - 1); + int randomRedThreshold = randomYellowThreshold + 1; + Settings settings = Settings.builder() + .put(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), randomRedThreshold) + .put(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), randomYellowThreshold) + .build(); + expectThrows( + IllegalArgumentException.class, + () -> ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings) + ); + expectThrows( + 
IllegalArgumentException.class, + () -> ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings) + ); + } + { + // invalid - non-positive values + int randomYellowThreshold = randomIntBetween(Integer.MIN_VALUE + 1, 0); + int randomRedThreshold = randomYellowThreshold - 1; + Settings settings = Settings.builder() + .put(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), randomRedThreshold) + .put(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), randomYellowThreshold) + .build(); + expectThrows( + IllegalArgumentException.class, + () -> ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings) + ); + expectThrows( + IllegalArgumentException.class, + () -> ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings) + ); + } + } + public void testCalculateMethods() { var mockedState = ClusterState.EMPTY_STATE; var randomMaxShardsPerNodeSetting = randomInt(); @@ -354,22 +458,49 @@ public void testCalculateMethods() { randomFrom(ShardLimitValidator.LimitGroup.values()) ); }; - + var randomYellowThreshold = randomIntBetween(3, Integer.MAX_VALUE - 1); /* upper bound keeps randomYellowThreshold + 1 from overflowing in the GREEN cases below */ + var randomRedThreshold = randomIntBetween(1, randomYellowThreshold - 2); /* strictly below yellow with at least one value in between, so the (red, yellow) GREEN range below is never empty */ assertEquals( - calculateFrom(randomMaxShardsPerNodeSetting, mockedState.nodes(), mockedState.metadata(), checkerWrapper.apply(5)).status(), - RED + RED, + calculateFrom( + randomMaxShardsPerNodeSetting, + mockedState.nodes(), + mockedState.metadata(), + checkerWrapper.apply(randomRedThreshold), + randomYellowThreshold, + randomRedThreshold + ).status() ); assertEquals( - calculateFrom(randomMaxShardsPerNodeSetting, mockedState.nodes(), mockedState.metadata(), checkerWrapper.apply(10)).status(), - YELLOW + YELLOW, + calculateFrom( + randomMaxShardsPerNodeSetting, + mockedState.nodes(), + mockedState.metadata(), + checkerWrapper.apply(randomYellowThreshold), + randomYellowThreshold, + randomRedThreshold + ).status() ); - // Let's cover the holes :) -
Stream.of(randomIntBetween(1, 4), randomIntBetween(6, 9), randomIntBetween(11, Integer.MAX_VALUE)) + Stream.of( + randomIntBetween(0, randomRedThreshold - 1), + randomIntBetween(randomRedThreshold + 1, randomYellowThreshold - 1), + randomIntBetween(randomYellowThreshold + 1, Integer.MAX_VALUE) + ) .map(checkerWrapper) - .map(checker -> calculateFrom(randomMaxShardsPerNodeSetting, mockedState.nodes(), mockedState.metadata(), checker)) + .map( + checker -> calculateFrom( + randomMaxShardsPerNodeSetting, + mockedState.nodes(), + mockedState.metadata(), + checker, + randomYellowThreshold, + randomRedThreshold + ) + ) .map(ShardsCapacityHealthIndicatorService.StatusResult::status) - .forEach(status -> assertEquals(status, GREEN)); + .forEach(status -> assertEquals(GREEN, status)); } // We expose the indicator name and the diagnoses in the x-pack usage API. In order to index them properly in a telemetry index @@ -389,7 +520,10 @@ public void testMappedFieldsForTelemetry() { public void testSkippingFieldsWhenVerboseIsFalse() { int maxShardsPerNodeFrozen = randomValidMaxShards(); var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11)); - var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(false, HealthInfo.EMPTY_HEALTH_INFO); + var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, EMPTY).calculate( + false, + HealthInfo.EMPTY_HEALTH_INFO + ); assertEquals(indicatorResult.status(), RED); assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");