Skip to content

Commit 9c0709f

Browse files
authored
Do not recommend increasing max_shards_per_node (#120458)
Today if the `shards_capacity` health indicator detects a problem then it recommends increasing the limit, which goes against the advice in the manual about not increasing these limits and also makes it rather pointless having a limit in the first place. This commit improves the recommendation to suggest either adding nodes or else reducing the shard count.
1 parent d1a2e0d commit 9c0709f

File tree

6 files changed

+57
-58
lines changed

6 files changed

+57
-58
lines changed

docs/changelog/120458.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 120458
2+
summary: Do not recommend increasing `max_shards_per_node`
3+
area: Health
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/common/ReferenceDocs.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,7 @@ public enum ReferenceDocs {
8383
ALLOCATION_EXPLAIN_NO_COPIES,
8484
ALLOCATION_EXPLAIN_MAX_RETRY,
8585
SECURE_SETTINGS,
86+
CLUSTER_SHARD_LIMIT,
8687
// this comment keeps the ';' on the next line so every entry above has a trailing ',' which makes the diff for adding new links cleaner
8788
;
8889

server/src/main/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorService.java

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,9 @@
1212
import org.elasticsearch.cluster.metadata.Metadata;
1313
import org.elasticsearch.cluster.node.DiscoveryNodes;
1414
import org.elasticsearch.cluster.service.ClusterService;
15+
import org.elasticsearch.common.ReferenceDocs;
1516
import org.elasticsearch.common.TriFunction;
1617
import org.elasticsearch.common.settings.Setting;
17-
import org.elasticsearch.features.FeatureService;
1818
import org.elasticsearch.health.Diagnosis;
1919
import org.elasticsearch.health.HealthIndicatorDetails;
2020
import org.elasticsearch.health.HealthIndicatorImpact;
@@ -54,21 +54,18 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
5454
"The cluster is running low on room to add new shards. Adding data to new indices is at risk";
5555
private static final String INDEX_CREATION_RISK =
5656
"The cluster is running low on room to add new shards. Adding data to new indices might soon fail.";
57-
private static final String HELP_GUIDE = "https://ela.st/fix-shards-capacity";
5857
private static final TriFunction<String, Setting<?>, String, Diagnosis> SHARD_MAX_CAPACITY_REACHED_FN = (
5958
id,
6059
setting,
6160
indexType) -> new Diagnosis(
6261
new Diagnosis.Definition(
6362
NAME,
6463
id,
65-
"Elasticsearch is about to reach the maximum number of shards it can host, based on your current settings.",
66-
"Increase the value of ["
67-
+ setting.getKey()
68-
+ "] cluster setting or remove "
64+
"Elasticsearch is about to reach the maximum number of shards it can host as set by [" + setting.getKey() + "].",
65+
"Increase the number of nodes in your cluster or remove some "
6966
+ indexType
70-
+ " indices to clear up resources.",
71-
HELP_GUIDE
67+
+ " indices to reduce the number of shards in the cluster.",
68+
ReferenceDocs.CLUSTER_SHARD_LIMIT.toString()
7269
),
7370
null
7471
);
@@ -82,22 +79,20 @@ public class ShardsCapacityHealthIndicatorService implements HealthIndicatorServ
8279
new HealthIndicatorImpact(NAME, "creation_of_new_indices_at_risk", 2, INDEX_CREATION_RISK, List.of(ImpactArea.INGEST))
8380
);
8481
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_DATA_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
85-
"increase_max_shards_per_node",
82+
"decrease_shards_per_non_frozen_node",
8683
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE,
87-
"data"
84+
"non-frozen"
8885
);
8986
static final Diagnosis SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES = SHARD_MAX_CAPACITY_REACHED_FN.apply(
90-
"increase_max_shards_per_node_frozen",
87+
"decrease_shards_per_frozen_node",
9188
ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN,
9289
"frozen"
9390
);
9491

9592
private final ClusterService clusterService;
96-
private final FeatureService featureService;
9793

98-
public ShardsCapacityHealthIndicatorService(ClusterService clusterService, FeatureService featureService) {
94+
public ShardsCapacityHealthIndicatorService(ClusterService clusterService) {
9995
this.clusterService = clusterService;
100-
this.featureService = featureService;
10196
}
10297

10398
@Override

server/src/main/java/org/elasticsearch/node/NodeConstruction.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1352,7 +1352,7 @@ private Module loadDiagnosticServices(
13521352
new StableMasterHealthIndicatorService(coordinationDiagnosticsService, clusterService),
13531353
new RepositoryIntegrityHealthIndicatorService(clusterService),
13541354
new DiskHealthIndicatorService(clusterService, featureService),
1355-
new ShardsCapacityHealthIndicatorService(clusterService, featureService),
1355+
new ShardsCapacityHealthIndicatorService(clusterService),
13561356
fileSettingsHealthIndicatorService
13571357
);
13581358
var pluginHealthIndicatorServices = pluginsService.filterPlugins(HealthPlugin.class)

server/src/main/resources/org/elasticsearch/common/reference-docs-links.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -45,3 +45,4 @@ CIRCUIT_BREAKER_ERRORS circuit-breaker-
4545
ALLOCATION_EXPLAIN_NO_COPIES cluster-allocation-explain.html#no-valid-shard-copy
4646
ALLOCATION_EXPLAIN_MAX_RETRY cluster-allocation-explain.html#maximum-number-of-retries-exceeded
4747
SECURE_SETTINGS secure-settings.html
48+
CLUSTER_SHARD_LIMIT misc-cluster-settings.html#cluster-shard-limit

server/src/test/java/org/elasticsearch/health/node/ShardsCapacityHealthIndicatorServiceTests.java

Lines changed: 40 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
import org.elasticsearch.common.bytes.BytesReference;
2121
import org.elasticsearch.common.settings.Settings;
2222
import org.elasticsearch.common.xcontent.LoggingDeprecationHandler;
23-
import org.elasticsearch.features.FeatureService;
2423
import org.elasticsearch.health.HealthIndicatorDetails;
2524
import org.elasticsearch.health.HealthStatus;
2625
import org.elasticsearch.health.metadata.HealthMetadata;
@@ -39,7 +38,6 @@
3938
import org.junit.AfterClass;
4039
import org.junit.Before;
4140
import org.junit.BeforeClass;
42-
import org.mockito.Mockito;
4341

4442
import java.io.IOException;
4543
import java.util.List;
@@ -61,10 +59,13 @@
6159
import static org.elasticsearch.indices.ShardLimitValidator.FROZEN_GROUP;
6260
import static org.elasticsearch.indices.ShardLimitValidator.INDEX_SETTING_SHARD_LIMIT_GROUP;
6361
import static org.elasticsearch.indices.ShardLimitValidator.NORMAL_GROUP;
62+
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE;
63+
import static org.elasticsearch.indices.ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN;
64+
import static org.hamcrest.Matchers.allOf;
65+
import static org.hamcrest.Matchers.containsString;
6466
import static org.hamcrest.Matchers.equalTo;
6567
import static org.hamcrest.Matchers.hasSize;
6668
import static org.hamcrest.Matchers.is;
67-
import static org.mockito.ArgumentMatchers.any;
6869

6970
public class ShardsCapacityHealthIndicatorServiceTests extends ESTestCase {
7071

@@ -73,7 +74,6 @@ public class ShardsCapacityHealthIndicatorServiceTests extends ESTestCase {
7374
private static ThreadPool threadPool;
7475

7576
private ClusterService clusterService;
76-
private FeatureService featureService;
7777
private DiscoveryNode dataNode;
7878
private DiscoveryNode frozenNode;
7979

@@ -92,9 +92,6 @@ public void setUp() throws Exception {
9292
.build();
9393

9494
clusterService = ClusterServiceUtils.createClusterService(threadPool);
95-
96-
featureService = Mockito.mock(FeatureService.class);
97-
Mockito.when(featureService.clusterHasFeature(any(), any())).thenReturn(true);
9895
}
9996

10097
@After
@@ -122,7 +119,7 @@ public void testNoShardsCapacityMetadata() throws IOException {
122119
createIndexInDataNode(100)
123120
)
124121
);
125-
var target = new ShardsCapacityHealthIndicatorService(clusterService, featureService);
122+
var target = new ShardsCapacityHealthIndicatorService(clusterService);
126123
var indicatorResult = target.calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
127124

128125
assertEquals(indicatorResult.status(), HealthStatus.UNKNOWN);
@@ -136,10 +133,7 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
136133
int maxShardsPerNode = randomValidMaxShards();
137134
int maxShardsPerNodeFrozen = randomValidMaxShards();
138135
var clusterService = createClusterService(maxShardsPerNode, maxShardsPerNodeFrozen, createIndexInDataNode(maxShardsPerNode / 4));
139-
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
140-
true,
141-
HealthInfo.EMPTY_HEALTH_INFO
142-
);
136+
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
143137

144138
assertEquals(indicatorResult.status(), HealthStatus.GREEN);
145139
assertTrue(indicatorResult.impacts().isEmpty());
@@ -158,15 +152,36 @@ public void testIndicatorYieldsGreenInCaseThereIsRoom() throws IOException {
158152
);
159153
}
160154

155+
public void testDiagnoses() {
156+
assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().indicatorName());
157+
assertEquals("decrease_shards_per_non_frozen_node", SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().id());
158+
assertThat(
159+
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().cause(),
160+
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey()))
161+
);
162+
assertThat(
163+
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().action(),
164+
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some non-frozen indices"))
165+
);
166+
167+
assertEquals("shards_capacity", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().indicatorName());
168+
assertEquals("decrease_shards_per_frozen_node", SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().id());
169+
assertThat(
170+
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().cause(),
171+
allOf(containsString("maximum number of shards"), containsString(SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey()))
172+
);
173+
assertThat(
174+
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().action(),
175+
allOf(containsString("Increase the number of nodes in your cluster"), containsString("remove some frozen indices"))
176+
);
177+
}
178+
161179
public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOException {
162180
{
163181
// Only data_nodes does not have enough space
164182
int maxShardsPerNodeFrozen = randomValidMaxShards();
165183
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(4));
166-
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
167-
true,
168-
HealthInfo.EMPTY_HEALTH_INFO
169-
);
184+
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
170185

171186
assertEquals(indicatorResult.status(), YELLOW);
172187
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
@@ -189,10 +204,7 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep
189204
// Only frozen_nodes does not have enough space
190205
int maxShardsPerNode = randomValidMaxShards();
191206
var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(4));
192-
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
193-
true,
194-
HealthInfo.EMPTY_HEALTH_INFO
195-
);
207+
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
196208

197209
assertEquals(indicatorResult.status(), YELLOW);
198210
assertEquals(
@@ -217,10 +229,7 @@ public void testIndicatorYieldsYellowInCaseThereIsNotEnoughRoom() throws IOExcep
217229
{
218230
// Both data and frozen nodes does not have enough space
219231
var clusterService = createClusterService(25, 25, createIndexInDataNode(4), createIndexInFrozenNode(4));
220-
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
221-
true,
222-
HealthInfo.EMPTY_HEALTH_INFO
223-
);
232+
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
224233

225234
assertEquals(indicatorResult.status(), YELLOW);
226235
assertEquals(
@@ -251,10 +260,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
251260
// Only data_nodes does not have enough space
252261
int maxShardsPerNodeFrozen = randomValidMaxShards();
253262
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11));
254-
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
255-
true,
256-
HealthInfo.EMPTY_HEALTH_INFO
257-
);
263+
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
258264

259265
assertEquals(indicatorResult.status(), RED);
260266
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
@@ -277,10 +283,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
277283
// Only frozen_nodes does not have enough space
278284
int maxShardsPerNode = randomValidMaxShards();
279285
var clusterService = createClusterService(maxShardsPerNode, 25, createIndexInFrozenNode(11));
280-
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
281-
true,
282-
HealthInfo.EMPTY_HEALTH_INFO
283-
);
286+
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
284287

285288
assertEquals(indicatorResult.status(), RED);
286289
assertEquals(
@@ -305,10 +308,7 @@ public void testIndicatorYieldsRedInCaseThereIsNotEnoughRoom() throws IOExceptio
305308
{
306309
// Both data and frozen nodes does not have enough space
307310
var clusterService = createClusterService(25, 25, createIndexInDataNode(11), createIndexInFrozenNode(11));
308-
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
309-
true,
310-
HealthInfo.EMPTY_HEALTH_INFO
311-
);
311+
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
312312

313313
assertEquals(indicatorResult.status(), RED);
314314
assertEquals(
@@ -377,22 +377,19 @@ public void testCalculateMethods() {
377377
public void testMappedFieldsForTelemetry() {
378378
assertEquals(ShardsCapacityHealthIndicatorService.NAME, "shards_capacity");
379379
assertEquals(
380-
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node",
380+
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_non_frozen_node",
381381
SHARDS_MAX_CAPACITY_REACHED_DATA_NODES.definition().getUniqueId()
382382
);
383383
assertEquals(
384-
"elasticsearch:health:shards_capacity:diagnosis:increase_max_shards_per_node_frozen",
384+
"elasticsearch:health:shards_capacity:diagnosis:decrease_shards_per_frozen_node",
385385
SHARDS_MAX_CAPACITY_REACHED_FROZEN_NODES.definition().getUniqueId()
386386
);
387387
}
388388

389389
public void testSkippingFieldsWhenVerboseIsFalse() {
390390
int maxShardsPerNodeFrozen = randomValidMaxShards();
391391
var clusterService = createClusterService(25, maxShardsPerNodeFrozen, createIndexInDataNode(11));
392-
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService, featureService).calculate(
393-
false,
394-
HealthInfo.EMPTY_HEALTH_INFO
395-
);
392+
var indicatorResult = new ShardsCapacityHealthIndicatorService(clusterService).calculate(false, HealthInfo.EMPTY_HEALTH_INFO);
396393

397394
assertEquals(indicatorResult.status(), RED);
398395
assertEquals(indicatorResult.symptom(), "Cluster is close to reaching the configured maximum number of shards for data nodes.");
@@ -441,7 +438,7 @@ private ClusterState createClusterState(
441438
var metadata = Metadata.builder()
442439
.persistentSettings(
443440
Settings.builder()
444-
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
441+
.put(SETTING_CLUSTER_MAX_SHARDS_PER_NODE.getKey(), maxShardsPerNode)
445442
.put(ShardLimitValidator.SETTING_CLUSTER_MAX_SHARDS_PER_NODE_FROZEN.getKey(), maxShardsPerNodeFrozen)
446443
.build()
447444
);

0 commit comments

Comments
 (0)