Skip to content

Commit b3ebfd1

Browse files
authored
Add settings for shard_capacity health indicator thresholds (#136141)
1 parent 6489bd3 commit b3ebfd1

File tree

14 files changed

+403
-49
lines changed

14 files changed

+403
-49
lines changed

docs/changelog/136141.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
pr: 136141
2+
summary: Add settings for health indicator `shard_capacity` thresholds
3+
area: Health
4+
type: enhancement
5+
issues:
6+
- 116697

docs/reference/elasticsearch/configuration-reference/health-diagnostic-settings.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,4 +47,8 @@ The following are the *expert-level* settings available for configuring an inter
4747
`health.periodic_logger.poll_interval`
4848
: ([Dynamic](docs-content://deploy-manage/stack-settings.md#dynamic-cluster-setting), [time unit value](/reference/elasticsearch/rest-apis/api-conventions.md#time-units)) How often {{es}} logs the health status of the cluster and of each health indicator as observed by the Health API. Defaults to `60s` (60 seconds).
4949

50+
`health.shard_capacity.unhealthy_threshold.yellow` {applies_to}`stack: ga 9.3`
51+
: ([Dynamic](docs-content://deploy-manage/stack-settings.md#dynamic-cluster-setting)) The minimum number of additional shards the cluster must still be able to allocate (on data or frozen nodes) for shard capacity health to remain `GREEN`. If fewer are available, health becomes `YELLOW`. Must be greater than `health.shard_capacity.unhealthy_threshold.red`. Defaults to `10`.
5052

53+
`health.shard_capacity.unhealthy_threshold.red` {applies_to}`stack: ga 9.3`
54+
: ([Dynamic](docs-content://deploy-manage/stack-settings.md#dynamic-cluster-setting)) The minimum number of additional shards the cluster must still be able to allocate (on data or frozen nodes) for shard capacity health to stay out of `RED`. If fewer are available, health becomes `RED`. Must be less than `health.shard_capacity.unhealthy_threshold.yellow`. Defaults to `5`.

server/src/internalClusterTest/java/org/elasticsearch/health/HealthMetadataServiceIT.java

Lines changed: 25 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@
3030
import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING;
3131
import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING;
3232
import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING;
33+
import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED;
34+
import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW;
3335
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
3436
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN;
3537
import static org.elasticsearch.test.NodeRoles.onlyRoles;
@@ -55,7 +57,12 @@ public void testEachMasterPublishesTheirThresholds() throws Exception {
5557
ByteSizeValue randomBytes = ByteSizeValue.ofBytes(randomLongBetween(6, 19));
5658
String customWatermark = percentageMode ? randomIntBetween(86, 94) + "%" : randomBytes.toString();
5759
ByteSizeValue customMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
58-
var customShardLimits = new HealthMetadata.ShardLimits(randomIntBetween(1, 1000), randomIntBetween(1001, 2000));
60+
var customShardLimits = new HealthMetadata.ShardLimits(
61+
randomIntBetween(1, 1000),
62+
randomIntBetween(1001, 2000),
63+
randomIntBetween(101, 200),
64+
randomIntBetween(1, 100)
65+
);
5966
String nodeName = startNode(internalCluster, customWatermark, customMaxHeadroom.toString(), customShardLimits);
6067
watermarkByNode.put(nodeName, customWatermark);
6168
maxHeadroomByNode.put(nodeName, customMaxHeadroom);
@@ -111,7 +118,9 @@ public void testWatermarkSettingUpdate() throws Exception {
111118
ByteSizeValue initialMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
112119
HealthMetadata.ShardLimits initialShardLimits = new HealthMetadata.ShardLimits(
113120
randomIntBetween(1, 1000),
114-
randomIntBetween(1001, 2000)
121+
randomIntBetween(1001, 2000),
122+
randomIntBetween(101, 200),
123+
randomIntBetween(1, 100)
115124
);
116125
for (int i = 0; i < numberOfNodes; i++) {
117126
startNode(internalCluster, initialWatermark, initialMaxHeadroom.toString(), initialShardLimits);
@@ -128,7 +137,9 @@ public void testWatermarkSettingUpdate() throws Exception {
128137
ByteSizeValue updatedFloodStageMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
129138
HealthMetadata.ShardLimits updatedShardLimits = new HealthMetadata.ShardLimits(
130139
randomIntBetween(3000, 4000),
131-
randomIntBetween(4001, 5000)
140+
randomIntBetween(4001, 5000),
141+
randomIntBetween(101, 200),
142+
randomIntBetween(1, 100)
132143
);
133144

134145
ensureStableCluster(numberOfNodes);
@@ -146,7 +157,9 @@ public void testWatermarkSettingUpdate() throws Exception {
146157
.put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), updatedHighWatermark)
147158
.put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), updatedFloodStageWatermark)
148159
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), updatedShardLimits.maxShardsPerNode())
149-
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), updatedShardLimits.maxShardsPerNodeFrozen());
160+
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), updatedShardLimits.maxShardsPerNodeFrozen())
161+
.put(SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), updatedShardLimits.shardCapacityUnhealthyThresholdYellow())
162+
.put(SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), updatedShardLimits.shardCapacityUnhealthyThresholdRed());
150163

151164
if (percentageMode) {
152165
settingsBuilder.put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), updatedLowMaxHeadroom)
@@ -214,7 +227,12 @@ public void testHealthNodeToggleEnabled() throws Exception {
214227
ByteSizeValue randomBytes = ByteSizeValue.ofBytes(randomLongBetween(6, 19));
215228
String customWatermark = percentageMode ? randomIntBetween(86, 94) + "%" : randomBytes.toString();
216229
ByteSizeValue customMaxHeadroom = percentageMode ? randomBytes : ByteSizeValue.MINUS_ONE;
217-
var customShardLimits = new HealthMetadata.ShardLimits(randomIntBetween(1, 1000), randomIntBetween(1001, 2000));
230+
var customShardLimits = new HealthMetadata.ShardLimits(
231+
randomIntBetween(1, 1000),
232+
randomIntBetween(1001, 2000),
233+
randomIntBetween(101, 200),
234+
randomIntBetween(1, 100)
235+
);
218236
String nodeName = startNode(internalCluster, customWatermark, customMaxHeadroom.toString(), customShardLimits);
219237
watermarkByNode.put(nodeName, customWatermark);
220238
maxHeadroomByNode.put(nodeName, customMaxHeadroom);
@@ -270,6 +288,8 @@ private String startNode(
270288
.put(createWatermarkSettings(customWatermark, customMaxHeadroom))
271289
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), customShardLimits.maxShardsPerNode())
272290
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), customShardLimits.maxShardsPerNodeFrozen())
291+
.put(SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), customShardLimits.shardCapacityUnhealthyThresholdYellow())
292+
.put(SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), customShardLimits.shardCapacityUnhealthyThresholdRed())
273293
.build()
274294
);
275295
}

server/src/internalClusterTest/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceIT.java

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
import java.util.List;
2323
import java.util.concurrent.atomic.AtomicReference;
2424

25+
import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED;
26+
import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW;
2527
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
2628
import static org.hamcrest.Matchers.empty;
2729
import static org.hamcrest.Matchers.hasSize;
@@ -79,6 +81,37 @@ public void testRed() throws Exception {
7981
assertThat(result.impacts(), hasSize(2));
8082
}
8183

84+
public void testUnhealthyThresholds() throws Exception {
85+
// baseline
86+
assertEquals(HealthStatus.GREEN, fetchShardsCapacityIndicatorResult().status());
87+
88+
// set very large threshold to force the indicator to go yellow
89+
updateClusterSettings(Settings.builder().put(SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), 100000000));
90+
assertBusy(() -> {
91+
// waits for settings to propagate to health metadata
92+
assertEquals(HealthStatus.YELLOW, fetchShardsCapacityIndicatorResult().status());
93+
});
94+
95+
// set very large threshold to force the indicator to go red
96+
updateClusterSettings(Settings.builder().put(SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), 90000000));
97+
assertBusy(() -> {
98+
// waits for settings to propagate to health metadata
99+
assertEquals(HealthStatus.RED, fetchShardsCapacityIndicatorResult().status());
100+
});
101+
102+
// reset thresholds to defaults
103+
updateClusterSettings(
104+
Settings.builder()
105+
.putNull(SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey())
106+
.putNull(SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey())
107+
);
108+
assertBusy(() -> {
109+
// waits for settings to propagate to health metadata
110+
assertEquals(HealthStatus.GREEN, fetchShardsCapacityIndicatorResult().status());
111+
});
112+
113+
}
114+
82115
private void createIndex(int shards, int replicas) {
83116
createIndex(INDEX_NAME, indexSettings(shards, replicas).build());
84117
}
@@ -129,6 +162,14 @@ private void waitForShardLimitsMetadata(String node) throws Exception {
129162
"max_shards_per_node.frozen setting must be greater than 0",
130163
healthMetadata.getShardLimitsMetadata().maxShardsPerNodeFrozen() > 0
131164
);
165+
assertTrue(
166+
"health.shard_capacity.unhealthy_threshold.yellow setting must be greater than 0",
167+
healthMetadata.getShardLimitsMetadata().shardCapacityUnhealthyThresholdYellow() > 0
168+
);
169+
assertTrue(
170+
"health.shard_capacity.unhealthy_threshold.red setting must be greater than 0",
171+
healthMetadata.getShardLimitsMetadata().shardCapacityUnhealthyThresholdRed() > 0
172+
);
132173
});
133174
}
134175
}

server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
import org.elasticsearch.gateway.PersistedClusterStateService;
8383
import org.elasticsearch.health.HealthPeriodicLogger;
8484
import org.elasticsearch.health.node.LocalHealthMonitor;
85+
import org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService;
8586
import org.elasticsearch.health.node.action.TransportHealthNodeAction;
8687
import org.elasticsearch.health.node.selection.HealthNodeTaskExecutor;
8788
import org.elasticsearch.http.HttpTransportSettings;
@@ -656,6 +657,8 @@ public void apply(Settings value, Settings current, Settings previous) {
656657
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_REROUTE_INTERVAL_SETTING,
657658
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_MINIMUM_LOGGING_INTERVAL,
658659
SamplingService.TTL_POLL_INTERVAL_SETTING,
659-
BlobStoreRepository.MAX_HEAP_SIZE_FOR_SNAPSHOT_DELETION_SETTING
660+
BlobStoreRepository.MAX_HEAP_SIZE_FOR_SNAPSHOT_DELETION_SETTING,
661+
ShardsCapacityHealthIndicatorService.SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW,
662+
ShardsCapacityHealthIndicatorService.SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED
660663
);
661664
}

server/src/main/java/org/elasticsearch/health/metadata/HealthMetadata.java

Lines changed: 45 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -156,28 +156,51 @@ public HealthMetadata build() {
156156
* Contains the thresholds needed to determine the health of a cluster when it comes to the amount of room available to create new
157157
* shards. These values are determined by the elected master.
158158
*/
159-
public record ShardLimits(int maxShardsPerNode, int maxShardsPerNodeFrozen) implements ToXContentFragment, Writeable {
159+
public record ShardLimits(
160+
int maxShardsPerNode,
161+
int maxShardsPerNodeFrozen,
162+
int shardCapacityUnhealthyThresholdYellow,
163+
int shardCapacityUnhealthyThresholdRed
164+
) implements ToXContentFragment, Writeable {
160165

161166
private static final String TYPE = "shard_limits";
162167
private static final ParseField MAX_SHARDS_PER_NODE = new ParseField("max_shards_per_node");
163168
private static final ParseField MAX_SHARDS_PER_NODE_FROZEN = new ParseField("max_shards_per_node_frozen");
169+
private static final ParseField SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW_FIELD = new ParseField(
170+
"shard_capacity_unhealthy_threshold_yellow"
171+
);
172+
private static final ParseField SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED_FIELD = new ParseField(
173+
"shard_capacity_unhealthy_threshold_red"
174+
);
164175
static final TransportVersion VERSION_SUPPORTING_SHARD_LIMIT_FIELDS = TransportVersions.V_8_8_0;
176+
static final TransportVersion VERSION_SHARD_CAPACITY_UNHEALTH_THRESHOLDS = TransportVersion.fromName(
177+
"shard_capacity_unhealthy_thresholds"
178+
);
165179

166180
static ShardLimits readFrom(StreamInput in) throws IOException {
167-
return new ShardLimits(in.readInt(), in.readInt());
181+
return in.getTransportVersion().supports(VERSION_SHARD_CAPACITY_UNHEALTH_THRESHOLDS)
182+
? new ShardLimits(in.readInt(), in.readInt(), in.readInt(), in.readInt())
183+
// defaults from older versions
184+
: new ShardLimits(in.readInt(), in.readInt(), 10, 5);
168185
}
169186

170187
@Override
171188
public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException {
172189
builder.field(MAX_SHARDS_PER_NODE.getPreferredName(), maxShardsPerNode);
173190
builder.field(MAX_SHARDS_PER_NODE_FROZEN.getPreferredName(), maxShardsPerNodeFrozen);
191+
builder.field(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW_FIELD.getPreferredName(), shardCapacityUnhealthyThresholdYellow);
192+
builder.field(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED_FIELD.getPreferredName(), shardCapacityUnhealthyThresholdRed);
174193
return builder;
175194
}
176195

177196
@Override
178197
public void writeTo(StreamOutput out) throws IOException {
179198
out.writeInt(maxShardsPerNode);
180199
out.writeInt(maxShardsPerNodeFrozen);
200+
if (out.getTransportVersion().supports(VERSION_SHARD_CAPACITY_UNHEALTH_THRESHOLDS)) {
201+
out.writeInt(shardCapacityUnhealthyThresholdYellow);
202+
out.writeInt(shardCapacityUnhealthyThresholdRed);
203+
}
181204
}
182205

183206
public static Builder newBuilder() {
@@ -192,12 +215,16 @@ public static class Builder {
192215

193216
private int maxShardsPerNode;
194217
private int maxShardsPerNodeFrozen;
218+
private int shardCapacityUnhealthyThresholdYellow;
219+
private int shardCapacityUnhealthyThresholdRed;
195220

196221
private Builder() {}
197222

198223
private Builder(ShardLimits shardLimits) {
199224
this.maxShardsPerNode = shardLimits.maxShardsPerNode;
200225
this.maxShardsPerNodeFrozen = shardLimits.maxShardsPerNodeFrozen;
226+
this.shardCapacityUnhealthyThresholdYellow = shardLimits.shardCapacityUnhealthyThresholdYellow;
227+
this.shardCapacityUnhealthyThresholdRed = shardLimits.shardCapacityUnhealthyThresholdRed;
201228
}
202229

203230
public Builder maxShardsPerNode(int maxShardsPerNode) {
@@ -210,8 +237,23 @@ public Builder maxShardsPerNodeFrozen(int maxShardsPerNodeFrozen) {
210237
return this;
211238
}
212239

240+
public Builder shardCapacityUnhealthyThresholdYellow(int shardCapacityUnhealthyThresholdYellow) {
241+
this.shardCapacityUnhealthyThresholdYellow = shardCapacityUnhealthyThresholdYellow;
242+
return this;
243+
}
244+
245+
public Builder shardCapacityUnhealthyThresholdRed(int shardCapacityUnhealthyThresholdRed) {
246+
this.shardCapacityUnhealthyThresholdRed = shardCapacityUnhealthyThresholdRed;
247+
return this;
248+
}
249+
213250
public ShardLimits build() {
214-
return new ShardLimits(maxShardsPerNode, maxShardsPerNodeFrozen);
251+
return new ShardLimits(
252+
maxShardsPerNode,
253+
maxShardsPerNodeFrozen,
254+
shardCapacityUnhealthyThresholdYellow,
255+
shardCapacityUnhealthyThresholdRed
256+
);
215257
}
216258
}
217259
}

server/src/main/java/org/elasticsearch/health/metadata/HealthMetadataService.java

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,8 @@
3737
import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING;
3838
import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING;
3939
import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING;
40+
import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED;
41+
import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW;
4042
import static org.elasticsearch.health.node.selection.HealthNodeTaskExecutor.ENABLED_SETTING;
4143
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
4244
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN;
@@ -109,7 +111,12 @@ private void registerListeners() {
109111
)
110112
);
111113

112-
Stream.of(SETTING_CLUSTER_MAX_SHARDS_PER_NODE, SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN)
114+
Stream.of(
115+
SETTING_CLUSTER_MAX_SHARDS_PER_NODE,
116+
SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN,
117+
SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW,
118+
SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED
119+
)
113120
.forEach(
114121
setting -> clusterSettings.addSettingsUpdateConsumer(
115122
setting,
@@ -225,6 +232,10 @@ private void updateOnShardLimitsSettingsUpdated(String settingName, Integer valu
225232
shardLimitsBuilder.maxShardsPerNode(value);
226233
} else if (SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey().equals(settingName)) {
227234
shardLimitsBuilder.maxShardsPerNodeFrozen(value);
235+
} else if (SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey().equals(settingName)) {
236+
shardLimitsBuilder.shardCapacityUnhealthyThresholdYellow(value);
237+
} else if (SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey().equals(settingName)) {
238+
shardLimitsBuilder.shardCapacityUnhealthyThresholdRed(value);
228239
}
229240

230241
this.localHealthMetadata = healthMetadataBuilder.shardLimits(shardLimitsBuilder.build()).build();
@@ -242,7 +253,9 @@ private static HealthMetadata initialHealthMetadata(Settings settings) {
242253
),
243254
new HealthMetadata.ShardLimits(
244255
SETTING_CLUSTER_MAX_SHARDS_PER_NODE.get(settings),
245-
SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.get(settings)
256+
SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.get(settings),
257+
SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings),
258+
SETTING_SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings)
246259
)
247260
);
248261
}

0 commit comments

Comments
 (0)