-
Notifications
You must be signed in to change notification settings - Fork 25.6k
Add settings for health indicator shard_capacity thresholds #136141
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 15 commits
8f929a8
8b2c232
f1cfb53
a5f0f3b
8a19577
3413b8d
f36113d
e5c7579
a9ba533
3f9e01c
1a585f2
23dee97
22c4ad7
975d6f4
c0d243c
0239fc1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
pr: 136141 | ||
summary: Add settings for health indicator `shard_capacity` thresholds | ||
area: Health | ||
type: enhancement | ||
issues: | ||
- 116697 |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
/* | ||
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
* or more contributor license agreements. Licensed under the "Elastic License | ||
* 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
* Public License v 1"; you may not use this file except in compliance with, at | ||
* your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
* License v3.0 only", or the "Server Side Public License, v 1". | ||
*/ | ||
|
||
package org.elasticsearch.health; | ||
|
||
import org.elasticsearch.features.FeatureSpecification; | ||
import org.elasticsearch.features.NodeFeature; | ||
|
||
import java.util.Set; | ||
|
||
import static org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_SETTINGS; | ||
|
||
public class HealthFeatures implements FeatureSpecification { | ||
|
||
@Override | ||
public Set<NodeFeature> getTestFeatures() { | ||
return Set.of(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_SETTINGS); | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -16,6 +16,8 @@ | |
import org.elasticsearch.common.ReferenceDocs; | ||
import org.elasticsearch.common.TriFunction; | ||
import org.elasticsearch.common.settings.Setting; | ||
import org.elasticsearch.common.settings.Settings; | ||
import org.elasticsearch.features.NodeFeature; | ||
import org.elasticsearch.health.Diagnosis; | ||
import org.elasticsearch.health.HealthIndicatorDetails; | ||
import org.elasticsearch.health.HealthIndicatorImpact; | ||
|
@@ -27,18 +29,24 @@ | |
import org.elasticsearch.indices.ShardLimitValidator; | ||
|
||
import java.util.ArrayList; | ||
import java.util.Iterator; | ||
import java.util.LinkedHashSet; | ||
import java.util.List; | ||
import java.util.Locale; | ||
import java.util.Map; | ||
|
||
/** | ||
* This indicator reports health data about the shard capacity across the cluster. | ||
* | ||
* <p> | ||
|
||
Comment on lines
-36
to
+41
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This formatting looks a bit off with an empty line in the middle of a JavaDoc. Could you revert the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does the |
||
* The indicator will report: | ||
* * RED when there's room for less than 5 shards (either data or frozen nodes) | ||
* * YELLOW when there's room for less than 10 shards (either data or frozen nodes) | ||
* * GREEN otherwise | ||
* </p> | ||
* <ul> | ||
* <li> {@code RED} when there's room for less than the configured {@code health.shard_capacity.unhealthy_threshold.red} (default 5) shards | ||
* (either data or frozen nodes)</li> | ||
* <li> {@code YELLOW} when there's room for less than the configured {@code health.shard_capacity.unhealthy_threshold.yellow} (default 10) | ||
* shards (either data or frozen nodes)</li> | ||
* <li> {@code GREEN} otherwise</li> | ||
* </ul> | ||
* | ||
* Although the `max_shard_per_node(.frozen)?` information is scoped by Node, we use the information from master because there is where | ||
* the available room for new shards is checked before creating new indices. | ||
|
@@ -89,12 +97,96 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ | |
"frozen" | ||
); | ||
|
||
public static final NodeFeature SHARD_CAPACITY_UNHEALTHY_THRESHOLD_SETTINGS = new NodeFeature( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nit: can we include |
||
"health.shard_capacity.unhealthy_threshold_settings" | ||
); | ||
|
||
public static final Setting<Integer> SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW = Setting.intSetting( | ||
"health.shard_capacity.unhealthy_threshold.yellow", | ||
10, | ||
1, | ||
new Setting.Validator<>() { | ||
@Override | ||
public void validate(Integer value) {} | ||
|
||
@Override | ||
public void validate(Integer value, Map<Setting<?>, Object> settings) { | ||
Integer redThreshold = (Integer) settings.get(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED); | ||
if (value <= redThreshold) { | ||
throw new IllegalArgumentException( | ||
String.format( | ||
Locale.ROOT, | ||
"Setting [%s] (%d) must be greater than [%s] (%d)", | ||
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), | ||
value, | ||
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), | ||
redThreshold | ||
) | ||
); | ||
} | ||
} | ||
|
||
@Override | ||
public Iterator<Setting<?>> settings() { | ||
final List<Setting<?>> settings = List.of(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED); | ||
return settings.iterator(); | ||
} | ||
}, | ||
Setting.Property.Dynamic, | ||
Setting.Property.NodeScope | ||
); | ||
|
||
public static final Setting<Integer> SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED = Setting.intSetting( | ||
"health.shard_capacity.unhealthy_threshold.red", | ||
5, | ||
1, | ||
new Setting.Validator<>() { | ||
@Override | ||
public void validate(Integer value) {} | ||
|
||
@Override | ||
public void validate(Integer value, Map<Setting<?>, Object> settings) { | ||
Integer yellowThreshold = (Integer) settings.get(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW); | ||
if (value >= yellowThreshold) { | ||
throw new IllegalArgumentException( | ||
String.format( | ||
Locale.ROOT, | ||
"Setting [%s] (%d) must be less than [%s] (%d)", | ||
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), | ||
value, | ||
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), | ||
yellowThreshold | ||
) | ||
); | ||
} | ||
} | ||
|
||
@Override | ||
public Iterator<Setting<?>> settings() { | ||
final List<Setting<?>> settings = List.of(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW); | ||
return settings.iterator(); | ||
} | ||
}, | ||
Setting.Property.Dynamic, | ||
Setting.Property.NodeScope | ||
); | ||
|
||
private final ClusterService clusterService; | ||
private final List<ShardLimitValidator.LimitGroup> shardLimitGroups; | ||
|
||
public ShardsCapacityHealthIndicatorService(ClusterService clusterService) { | ||
private int unhealthyThresholdYellow; | ||
private int unhealthyThresholdRed; | ||
|
||
public ShardsCapacityHealthIndicatorService(ClusterService clusterService, Settings settings) { | ||
this.clusterService = clusterService; | ||
this.unhealthyThresholdYellow = SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings); | ||
this.unhealthyThresholdRed = SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings); | ||
this.shardLimitGroups = ShardLimitValidator.applicableLimitGroups(DiscoveryNode.isStateless(clusterService.getSettings())); | ||
|
||
clusterService.getClusterSettings() | ||
.addSettingsUpdateConsumer(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW, this::setUnhealthyThresholdYellow); | ||
clusterService.getClusterSettings() | ||
.addSettingsUpdateConsumer(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED, this::setUnhealthyThresholdRed); | ||
} | ||
|
||
@Override | ||
|
@@ -117,7 +209,9 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources | |
ShardLimitValidator.getShardLimitPerNode(limitGroup, shardLimitsMetadata), | ||
state.nodes(), | ||
state.metadata(), | ||
limitGroup::checkShardLimit | ||
limitGroup::checkShardLimit, | ||
unhealthyThresholdYellow, | ||
unhealthyThresholdRed | ||
) | ||
) | ||
.toList(); | ||
|
@@ -166,7 +260,9 @@ private HealthIndicatorResult mergeIndicators(boolean verbose, List<StatusResult | |
return createIndicator( | ||
finalStatus, | ||
symptomBuilder.toString(), | ||
verbose ? buildDetails(statusResults.stream().map(StatusResult::result).toList()) : HealthIndicatorDetails.EMPTY, | ||
verbose | ||
? buildDetails(statusResults.stream().map(StatusResult::result).toList(), unhealthyThresholdYellow, unhealthyThresholdRed) | ||
: HealthIndicatorDetails.EMPTY, | ||
indicatorImpacts, | ||
verbose ? List.copyOf(diagnoses) : List.of() | ||
); | ||
|
@@ -176,22 +272,28 @@ static StatusResult calculateFrom( | |
int maxShardsPerNodeSetting, | ||
DiscoveryNodes discoveryNodes, | ||
Metadata metadata, | ||
ShardsCapacityChecker checker | ||
ShardsCapacityChecker checker, | ||
int shardThresholdYellow, | ||
int shardThresholdRed | ||
) { | ||
var result = checker.check(maxShardsPerNodeSetting, 5, 1, discoveryNodes, metadata); | ||
var result = checker.check(maxShardsPerNodeSetting, shardThresholdRed, 1, discoveryNodes, metadata); | ||
if (result.canAddShards() == false) { | ||
return new StatusResult(HealthStatus.RED, result); | ||
} | ||
|
||
result = checker.check(maxShardsPerNodeSetting, 10, 1, discoveryNodes, metadata); | ||
result = checker.check(maxShardsPerNodeSetting, shardThresholdYellow, 1, discoveryNodes, metadata); | ||
if (result.canAddShards() == false) { | ||
return new StatusResult(HealthStatus.YELLOW, result); | ||
} | ||
|
||
return new StatusResult(HealthStatus.GREEN, result); | ||
} | ||
|
||
static HealthIndicatorDetails buildDetails(List<ShardLimitValidator.Result> results) { | ||
static HealthIndicatorDetails buildDetails( | ||
List<ShardLimitValidator.Result> results, | ||
int unhealthyThresholdYellow, | ||
int unhealthyThresholdRed | ||
) { | ||
return (builder, params) -> { | ||
builder.startObject(); | ||
for (var result : results) { | ||
|
@@ -202,6 +304,12 @@ static HealthIndicatorDetails buildDetails(List<ShardLimitValidator.Result> resu | |
} | ||
builder.endObject(); | ||
} | ||
{ | ||
builder.startObject("settings"); | ||
builder.field(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), unhealthyThresholdYellow); | ||
builder.field(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), unhealthyThresholdRed); | ||
builder.endObject(); | ||
} | ||
Comment on lines
+307
to
+312
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a specific reason you're including the settings in the indicator output? I don't immediately see the value tbh. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Oh it's more for convenience of the health report. Since these thresholds are now dynamic settings, I thought it'd be good to report them as part of the health report. If someone's looking at previous health report logs they know what the thresholds were and don't need to guess if these settings have been changed. |
||
builder.endObject(); | ||
return builder; | ||
}; | ||
|
@@ -217,6 +325,14 @@ private HealthIndicatorResult unknownIndicator() { | |
); | ||
} | ||
|
||
private void setUnhealthyThresholdYellow(int value) { | ||
this.unhealthyThresholdYellow = value; | ||
} | ||
|
||
private void setUnhealthyThresholdRed(int value) { | ||
this.unhealthyThresholdRed = value; | ||
} | ||
|
||
private static String nodeTypeFroLimitGroup(ShardLimitValidator.LimitGroup limitGroup) { | ||
return switch (limitGroup) { | ||
case NORMAL -> "data"; | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I don't think we can do
- match: { indicators.shards_capacity.status: "green" }
here. Check out my PR and the related Serverless test failure issue: #109808.So, I think we need to convert this to a Java REST test, which allows us to wait for the
green
indicator health. Maybe we could get away with changing this line frommatch
toexist
(like I did in that PR), but I'm worried thematch: yellow
below might still be flaky, and it'll reduce the value of the test slightly as we don't have a guaranteed starting point. What do you think?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is a good catch! I wonder why HealthMetadata needs to be cluster state metadata, they seem to just be a copy of these cluster settings. If we read the settings directly from cluster state, then it should eliminate the flakiness, but there are probably reason I don't understand yet. I posted in the team thread to discuss about this.