Skip to content

Commit 8f929a8

Browse files
committed
Add settings for health indicator shard_capacity thresholds
1 parent 51599b4 commit 8f929a8

File tree

5 files changed

+338
-45
lines changed

5 files changed

+338
-45
lines changed

rest-api-spec/src/yamlRestTest/resources/rest-api-spec/test/health/30_feature.yml

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,3 +23,56 @@
2323
- match: { indicators.master_is_stable.status: "green" }
2424
- match: { indicators.master_is_stable.symptom: "The cluster has a stable master node" }
2525
- is_false: indicators.master_is_stable.details
26+
---
27+
"cluster health test for shard capacity settings":
28+
- do:
29+
health_report:
30+
feature: shards_capacity
31+
- is_true: cluster_name
32+
- match: { indicators.shards_capacity.status: "green" }
33+
34+
# set a large yellow threshold to force the indicator to go yellow
35+
- do:
36+
cluster.put_settings:
37+
body:
38+
persistent:
39+
health.shard_capacity.unhealthy_threshold.yellow: 10000
40+
flat_settings: true
41+
- match: { persistent: { health.shard_capacity.unhealthy_threshold.yellow: "10000" } }
42+
43+
- do:
44+
health_report:
45+
feature: shards_capacity
46+
- is_true: cluster_name
47+
- match: { indicators.shards_capacity.status: "yellow" }
48+
49+
# set a large red threshold to force the indicator to go red
50+
- do:
51+
cluster.put_settings:
52+
body:
53+
persistent:
54+
health.shard_capacity.unhealthy_threshold.red: 9000
55+
flat_settings: true
56+
- match: { persistent: { health.shard_capacity.unhealthy_threshold.red: "9000" } }
57+
58+
- do:
59+
health_report:
60+
feature: shards_capacity
61+
- is_true: cluster_name
62+
- match: { indicators.shards_capacity.status: "red" }
63+
64+
# set back to default
65+
- do:
66+
cluster.put_settings:
67+
body:
68+
persistent:
69+
health.shard_capacity.unhealthy_threshold.yellow: 10
70+
health.shard_capacity.unhealthy_threshold.red: 5
71+
flat_settings: true
72+
- match: { acknowledged: true }
73+
74+
- do:
75+
health_report:
76+
feature: shards_capacity
77+
- is_true: cluster_name
78+
- match: { indicators.shards_capacity.status: "green" }

server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@
8282
import org.elasticsearch.gateway.PersistedClusterStateService;
8383
import org.elasticsearch.health.HealthPeriodicLogger;
8484
import org.elasticsearch.health.node.LocalHealthMonitor;
85+
import org.elasticsearch.health.node.ShardsCapacityHealthIndicatorService;
8586
import org.elasticsearch.health.node.action.TransportHealthNodeAction;
8687
import org.elasticsearch.health.node.selection.HealthNodeTaskExecutor;
8788
import org.elasticsearch.http.HttpTransportSettings;
@@ -650,6 +651,8 @@ public void apply(Settings value, Settings current, Settings previous) {
650651
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_HIGH_UTILIZATION_THRESHOLD_SETTING,
651652
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_HIGH_UTILIZATION_DURATION_SETTING,
652653
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_QUEUE_LATENCY_THRESHOLD_SETTING,
653-
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_REROUTE_INTERVAL_SETTING
654+
WriteLoadConstraintSettings.WRITE_LOAD_DECIDER_REROUTE_INTERVAL_SETTING,
655+
ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW,
656+
ShardsCapacityHealthIndicatorService.SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED
654657
);
655658
}

server/src/main/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorService.java

Lines changed: 127 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
import org.elasticsearch.common.ReferenceDocs;
1616
import org.elasticsearch.common.TriFunction;
1717
import org.elasticsearch.common.settings.Setting;
18+
import org.elasticsearch.common.settings.Settings;
1819
import org.elasticsearch.health.Diagnosis;
1920
import org.elasticsearch.health.HealthIndicatorDetails;
2021
import org.elasticsearch.health.HealthIndicatorImpact;
@@ -25,18 +26,24 @@
2526
import org.elasticsearch.health.metadata.HealthMetadata;
2627
import org.elasticsearch.indices.ShardLimitValidator;
2728

29+
import java.util.Iterator;
2830
import java.util.List;
31+
import java.util.Locale;
32+
import java.util.Map;
2933
import java.util.stream.Stream;
3034

3135
/**
3236
* This indicator reports health data about the shard capacity across the cluster.
3337
*
34-
* <p>
38+
3539
* The indicator will report:
36-
* * RED when there's room for less than 5 shards (either data or frozen nodes)
37-
* * YELLOW when there's room for less than 10 shards (either data or frozen nodes)
38-
* * GREEN otherwise
39-
* </p>
40+
* <ul>
41+
* <li> {@code RED} when there's room for fewer than the configured {@code health.shard_capacity.unhealthy_threshold.red} (default 5) shards
42+
* (either data or frozen nodes)
43+
* <li> {@code YELLOW} when there's room for fewer than the configured {@code health.shard_capacity.unhealthy_threshold.yellow} (default 10)
44+
* shards (either data or frozen nodes)
45+
* <li> {@code GREEN} otherwise
46+
* </ul>
4047
*
4148
* Although the `max_shard_per_node(.frozen)?` information is scoped by Node, we use the information from master because that is where
4249
* the available room for new shards is checked before creating new indices.
@@ -89,10 +96,90 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
8996
"frozen"
9097
);
9198

99+
/**
 * Dynamic, node-scoped setting {@code health.shard_capacity.unhealthy_threshold.yellow} (default 10, lower bound 0).
 * Its validator rejects any value lower than {@link #SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED}, so the yellow threshold
 * is always greater than or equal to the red one.
 */
public static final Setting<Integer> SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW = Setting.intSetting(
100+
"health.shard_capacity.unhealthy_threshold.yellow",
101+
10,
102+
0,
103+
new Setting.Validator<>() {
104+
// Intentionally empty: the only constraint is relative to the red threshold and is checked in the overload below.
@Override
105+
public void validate(Integer value) {}
106+
107+
// Cross-setting check: yellow must not be lower than the current red threshold.
@Override
108+
public void validate(Integer value, Map<Setting<?>, Object> settings) {
109+
Integer redThreshold = (Integer) settings.get(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED);
110+
if (value < redThreshold) {
111+
throw new IllegalArgumentException(
112+
String.format(
113+
Locale.ROOT,
114+
"Setting [%s] (%d) must be greater than or equal to [%s] (%d)",
115+
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(),
116+
value,
117+
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(),
118+
redThreshold
119+
)
120+
);
121+
}
122+
}
123+
124+
// Declares the red threshold as the setting this validator depends on; its value is looked up in validate above.
@Override
125+
public Iterator<Setting<?>> settings() {
126+
final List<Setting<?>> settings = List.of(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED);
127+
return settings.iterator();
128+
}
129+
},
130+
Setting.Property.Dynamic,
131+
Setting.Property.NodeScope
132+
);
133+
134+
/**
 * Dynamic, node-scoped setting {@code health.shard_capacity.unhealthy_threshold.red} (default 5, lower bound 0).
 * Its validator rejects any value greater than {@link #SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW}, so the red threshold
 * is always less than or equal to the yellow one.
 */
public static final Setting<Integer> SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED = Setting.intSetting(
135+
"health.shard_capacity.unhealthy_threshold.red",
136+
5,
137+
0,
138+
new Setting.Validator<>() {
139+
// Intentionally empty: the only constraint is relative to the yellow threshold and is checked in the overload below.
@Override
140+
public void validate(Integer value) {}
141+
142+
// Cross-setting check: red must not exceed the current yellow threshold.
@Override
143+
public void validate(Integer value, Map<Setting<?>, Object> settings) {
144+
Integer yellowThreshold = (Integer) settings.get(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW);
145+
if (value > yellowThreshold) {
146+
throw new IllegalArgumentException(
147+
String.format(
148+
Locale.ROOT,
149+
"Setting [%s] (%d) must be less than or equal to [%s] (%d)",
150+
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(),
151+
value,
152+
SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(),
153+
yellowThreshold
154+
)
155+
);
156+
}
157+
}
158+
159+
// Declares the yellow threshold as the setting this validator depends on; its value is looked up in validate above.
@Override
160+
public Iterator<Setting<?>> settings() {
161+
final List<Setting<?>> settings = List.of(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW);
162+
return settings.iterator();
163+
}
164+
},
165+
Setting.Property.Dynamic,
166+
Setting.Property.NodeScope
167+
);
168+
92169
private final ClusterService clusterService;
93170

94-
public ShardsCapacityHealthIndicatorService(ClusterService clusterService) {
171+
private volatile int unhealthyThresholdYellow; // volatile: written by the settings update consumer, read when computing the indicator
172+
private volatile int unhealthyThresholdRed; // volatile: written by the settings update consumer, read when computing the indicator
173+
174+
public ShardsCapacityHealthIndicatorService(ClusterService clusterService, Settings settings) {
95175
this.clusterService = clusterService;
176+
this.unhealthyThresholdYellow = SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.get(settings);
177+
this.unhealthyThresholdRed = SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.get(settings);
178+
179+
clusterService.getClusterSettings()
180+
.addSettingsUpdateConsumer(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW, this::setUnhealthyThresholdYellow);
181+
clusterService.getClusterSettings()
182+
.addSettingsUpdateConsumer(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED, this::setUnhealthyThresholdRed);
96183
}
97184

98185
@Override
@@ -115,13 +202,17 @@ public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResources
115202
shardLimitsMetadata.maxShardsPerNode(),
116203
state.nodes(),
117204
state.metadata(),
118-
ShardLimitValidator::checkShardLimitForNormalNodes
205+
ShardLimitValidator::checkShardLimitForNormalNodes,
206+
unhealthyThresholdYellow,
207+
unhealthyThresholdRed
119208
),
120209
calculateFrom(
121210
shardLimitsMetadata.maxShardsPerNodeFrozen(),
122211
state.nodes(),
123212
state.metadata(),
124-
ShardLimitValidator::checkShardLimitForFrozenNodes
213+
ShardLimitValidator::checkShardLimitForFrozenNodes,
214+
unhealthyThresholdYellow,
215+
unhealthyThresholdRed
125216
)
126217
);
127218
}
@@ -164,7 +255,9 @@ private HealthIndicatorResult mergeIndicators(boolean verbose, StatusResult data
164255
return createIndicator(
165256
finalStatus,
166257
symptomBuilder.toString(),
167-
verbose ? buildDetails(dataNodes.result, frozenNodes.result) : HealthIndicatorDetails.EMPTY,
258+
verbose
259+
? buildDetails(dataNodes.result, frozenNodes.result, unhealthyThresholdYellow, unhealthyThresholdRed)
260+
: HealthIndicatorDetails.EMPTY,
168261
indicatorImpacts,
169262
verbose ? diagnoses : List.of()
170263
);
@@ -174,22 +267,29 @@ static StatusResult calculateFrom(
174267
int maxShardsPerNodeSetting,
175268
DiscoveryNodes discoveryNodes,
176269
Metadata metadata,
177-
ShardsCapacityChecker checker
270+
ShardsCapacityChecker checker,
271+
int shardThresholdYellow,
272+
int shardThresholdRed
178273
) {
179-
var result = checker.check(maxShardsPerNodeSetting, 5, 1, discoveryNodes, metadata);
274+
var result = checker.check(maxShardsPerNodeSetting, shardThresholdRed, 1, discoveryNodes, metadata);
180275
if (result.canAddShards() == false) {
181276
return new StatusResult(HealthStatus.RED, result);
182277
}
183278

184-
result = checker.check(maxShardsPerNodeSetting, 10, 1, discoveryNodes, metadata);
279+
result = checker.check(maxShardsPerNodeSetting, shardThresholdYellow, 1, discoveryNodes, metadata);
185280
if (result.canAddShards() == false) {
186281
return new StatusResult(HealthStatus.YELLOW, result);
187282
}
188283

189284
return new StatusResult(HealthStatus.GREEN, result);
190285
}
191286

192-
static HealthIndicatorDetails buildDetails(ShardLimitValidator.Result dataNodes, ShardLimitValidator.Result frozenNodes) {
287+
static HealthIndicatorDetails buildDetails(
288+
ShardLimitValidator.Result dataNodes,
289+
ShardLimitValidator.Result frozenNodes,
290+
int unhealthyThresholdYellow,
291+
int unhealthyThresholdRed
292+
) {
193293
return (builder, params) -> {
194294
builder.startObject();
195295
{
@@ -208,6 +308,12 @@ static HealthIndicatorDetails buildDetails(ShardLimitValidator.Result dataNodes,
208308
}
209309
builder.endObject();
210310
}
311+
{
312+
builder.startObject("settings");
313+
builder.field(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_YELLOW.getKey(), unhealthyThresholdYellow);
314+
builder.field(SHARD_CAPACITY_UNHEALTHY_THRESHOLD_RED.getKey(), unhealthyThresholdRed);
315+
builder.endObject();
316+
}
211317
builder.endObject();
212318
return builder;
213319
};
@@ -223,6 +329,14 @@ private HealthIndicatorResult unknownIndicator() {
223329
);
224330
}
225331

332+
private void setUnhealthyThresholdYellow(int value) {
333+
this.unhealthyThresholdYellow = value;
334+
}
335+
336+
private void setUnhealthyThresholdRed(int value) {
337+
this.unhealthyThresholdRed = value;
338+
}
339+
226340
record StatusResult(HealthStatus status, ShardLimitValidator.Result result) {}
227341

228342
@FunctionalInterface

server/src/main/java/org/elasticsearch/node/NodeConstruction.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1469,7 +1469,7 @@ private Module loadDiagnosticServices(
14691469
new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService),
14701470
new RepositoryIntegrityHealthIndicatorService(clusterService),
14711471
new DiskHealthIndicatorService(clusterService),
1472-
new ShardsCapacityHealthIndicatorService(clusterService),
1472+
new ShardsCapacityHealthIndicatorService(clusterService, settings),
14731473
new FileSettingsHealthIndicatorService()
14741474
);
14751475
var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)

0 commit comments

Comments
 (0)