Skip to content

Commit 6f30995

Browse files
authored
Limit number of allocation explanations in shards_availability health indicator (#136060) (#136471)
We currently compute the shard allocation explanation for every unassigned shard (primaries and replicas) in the health report API when `verbose` is `true`, which includes the periodic health logs. Computing the shard allocation explanation of a shard is quite expensive in large clusters. Therefore, when there are lots of unassigned shards, `ShardsAvailabilityHealthIndicatorService` can take a long time to complete - we've seen cases of 2 minutes with 40k unassigned shards. To avoid the runtime of `ShardsAvailabilityHealthIndicatorService` scaling linearly with the number of unassigned shards (times the size of the cluster), we limit the number of allocation explanations we compute to `maxAffectedResourcesCount`, which comes from the `size` parameter of the `_health_report` API and currently defaults to `1000` - a follow-up PR will address the high default size. This significantly reduces the runtime of this health indicator and avoids the periodic health logs from overlapping. A downside of this change is that the returned list of diagnoses may be incomplete. For example, if the `size` parameter is set to `10`, and the first 10 shards are unassigned due to reason `X` and the remaining unassigned shards due to reason `Y`, only reason `X` will be returned in the health API. We accept this downside as we expect that there are generally not many different diagnoses relevant - if more than `size` shards are unassigned, they're likely all unassigned due to the same reason. Users can always increase `size` and/or manually call the allocation explain API to get more detailed information. (cherry picked from commit ede1d06) # Conflicts: # server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java # server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java
1 parent 3aed800 commit 6f30995

File tree

3 files changed

+82
-53
lines changed

3 files changed

+82
-53
lines changed

docs/changelog/136060.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 136060
2+
summary: Limit number of allocation explanations in `shards_availability` health indicator
3+
area: Health
4+
type: bug
5+
issues: []

server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java

Lines changed: 45 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -161,17 +161,18 @@ public String name() {
161161
* primary and replica availability, providing the color, diagnosis, and
162162
* messages about the available or unavailable shards in the cluster.
163163
* @param metadata Metadata for the cluster
164+
* @param maxAffectedResourcesCount Max number of affect resources to return
164165
* @return A new ShardAllocationStatus that has not yet been filled.
165166
*/
166-
public ShardAllocationStatus createNewStatus(Metadata metadata) {
167-
return new ShardAllocationStatus(metadata);
167+
public ShardAllocationStatus createNewStatus(Metadata metadata, int maxAffectedResourcesCount) {
168+
return new ShardAllocationStatus(metadata, maxAffectedResourcesCount);
168169
}
169170

170171
@Override
171172
public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
172173
var state = clusterService.state();
173174
var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY);
174-
var status = createNewStatus(state.getMetadata());
175+
var status = createNewStatus(state.getMetadata(), maxAffectedResourcesCount);
175176
updateShardAllocationStatus(status, state, shutdown, verbose, replicaUnassignedBufferTime);
176177
return createIndicator(
177178
status.getStatus(),
@@ -454,18 +455,33 @@ static void updateShardAllocationStatus(
454455
);
455456

456457
public class ShardAllocationCounts {
457-
int unassigned = 0;
458-
int unassigned_new = 0;
459-
int unassigned_restarting = 0;
460-
int initializing = 0;
461-
int started = 0;
462-
int relocating = 0;
463-
public final Set<String> indicesWithUnavailableShards = new HashSet<>();
464-
public final Set<String> indicesWithAllShardsUnavailable = new HashSet<>();
458+
final int maxAffectedResourcesCount;
459+
int unassigned;
460+
int unassigned_new;
461+
int unassigned_restarting;
462+
int initializing;
463+
int started;
464+
int relocating;
465+
public final Set<String> indicesWithUnavailableShards;
466+
public final Set<String> indicesWithAllShardsUnavailable;
465467
// We keep the searchable snapshots separately as long as the original index is still available
466468
// This is checked during the post-processing
467-
public SearchableSnapshotsState searchableSnapshotsState = new SearchableSnapshotsState();
468-
final Map<Diagnosis.Definition, Set<String>> diagnosisDefinitions = new HashMap<>();
469+
public SearchableSnapshotsState searchableSnapshotsState;
470+
final Map<Diagnosis.Definition, Set<String>> diagnosisDefinitions;
471+
472+
public ShardAllocationCounts(int maxAffectedResourcesCount) {
473+
this.maxAffectedResourcesCount = maxAffectedResourcesCount;
474+
unassigned = 0;
475+
unassigned_new = 0;
476+
unassigned_restarting = 0;
477+
initializing = 0;
478+
started = 0;
479+
relocating = 0;
480+
indicesWithUnavailableShards = new HashSet<>();
481+
indicesWithAllShardsUnavailable = new HashSet<>();
482+
searchableSnapshotsState = new SearchableSnapshotsState();
483+
diagnosisDefinitions = new HashMap<>();
484+
}
469485

470486
public void increment(
471487
ShardRouting routing,
@@ -500,7 +516,15 @@ public void increment(
500516
unassigned_restarting++;
501517
} else {
502518
unassigned++;
503-
if (verbose) {
519+
// Computing the diagnosis can be very expensive in large clusters, so we limit the number of
520+
// computations to the maxAffectedResourcesCount. The main negative side effect of this is that
521+
// we might miss some diagnoses. We are willing to take this risk, and users can always
522+
// use the allocation explain API for more details or increase the maxAffectedResourcesCount.
523+
// Since we have two ShardAllocationCounts instances (primaries and replicas), we technically
524+
// do 2 * maxAffectedResourcesCount computations, but the added complexity of accurately
525+
// limiting the number of calls doesn't outweigh the benefits, as the main goal is to limit
526+
// the number of computations to a constant rather than a number that grows with the cluster size.
527+
if (verbose && unassigned <= maxAffectedResourcesCount) {
504528
diagnoseUnassignedShardRouting(routing, state).forEach(
505529
definition -> addDefinition(definition, routing.getIndexName())
506530
);
@@ -942,12 +966,16 @@ public Diagnosis.Definition getIncreaseNodeWithRoleCapacityAction(String role) {
942966
}
943967

944968
public class ShardAllocationStatus {
945-
protected final ShardAllocationCounts primaries = new ShardAllocationCounts();
946-
protected final ShardAllocationCounts replicas = new ShardAllocationCounts();
969+
protected final ShardAllocationCounts primaries;
970+
protected final ShardAllocationCounts replicas;
947971
protected final Metadata clusterMetadata;
972+
protected final int maxAffectedResourcesCount;
948973

949-
public ShardAllocationStatus(Metadata clusterMetadata) {
974+
public ShardAllocationStatus(Metadata clusterMetadata, int maxAffectedResourcesCount) {
950975
this.clusterMetadata = clusterMetadata;
976+
this.maxAffectedResourcesCount = maxAffectedResourcesCount;
977+
primaries = new ShardAllocationCounts(maxAffectedResourcesCount);
978+
replicas = new ShardAllocationCounts(maxAffectedResourcesCount);
951979
}
952980

953981
void addPrimary(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) {

server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java

Lines changed: 32 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -336,7 +336,7 @@ public void testAllReplicasUnassigned() {
336336
List.of()
337337
);
338338
var service = createShardsAvailabilityIndicatorService(clusterState);
339-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
339+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
340340
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
341341
status,
342342
clusterState,
@@ -359,7 +359,7 @@ public void testAllReplicasUnassigned() {
359359
List.of()
360360
);
361361
var service = createShardsAvailabilityIndicatorService(clusterState);
362-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
362+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
363363
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
364364
status,
365365
clusterState,
@@ -382,7 +382,7 @@ public void testAllReplicasUnassigned() {
382382
List.of()
383383
);
384384
var service = createShardsAvailabilityIndicatorService(clusterState);
385-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
385+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
386386
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
387387
status,
388388
clusterState,
@@ -407,7 +407,7 @@ public void testAllReplicasUnassigned() {
407407
);
408408

409409
var service = createShardsAvailabilityIndicatorService(clusterState);
410-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
410+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
411411
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
412412
status,
413413
clusterState,
@@ -444,7 +444,7 @@ public void testAllReplicasUnassigned() {
444444
List.of()
445445
);
446446
var service = createShardsAvailabilityIndicatorService(clusterState);
447-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
447+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
448448
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
449449
status,
450450
clusterState,
@@ -474,7 +474,7 @@ public void testAllReplicasUnassigned() {
474474
);
475475
ClusterState clusterState = createClusterStateWith(List.of(routingTable), List.of());
476476
var service = createShardsAvailabilityIndicatorService(clusterState);
477-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
477+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
478478
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
479479
status,
480480
clusterState,
@@ -498,7 +498,7 @@ public void testAllReplicasUnassigned() {
498498
List.of()
499499
);
500500
var service = createShardsAvailabilityIndicatorService(clusterState);
501-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
501+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
502502
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
503503
status,
504504
clusterState,
@@ -1686,32 +1686,31 @@ public void testLimitNumberOfAffectedResources() {
16861686

16871687
{
16881688
// assert the full result to check that details, impacts, and symptoms use the correct count of affected indices (5)
1689-
assertThat(
1690-
service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO),
1691-
equalTo(
1692-
createExpectedResult(
1693-
RED,
1694-
"This cluster has 5 unavailable primary shards.",
1695-
Map.of("unassigned_primaries", 5),
1696-
List.of(
1697-
new HealthIndicatorImpact(
1698-
NAME,
1699-
ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
1700-
1,
1701-
"Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
1702-
+ "return incomplete results.",
1703-
List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
1704-
)
1705-
),
1706-
List.of(
1707-
new Diagnosis(
1708-
ACTION_CHECK_ALLOCATION_EXPLAIN_API,
1709-
List.of(new Diagnosis.Resource(INDEX, List.of("red-index1", "red-index2")))
1710-
)
1711-
)
1689+
// since we limit the number of allocation explanations while looping over the shards, we can't guarantee
1690+
// which indices end up in the affected resources list, but we can at least check that the size is correct
1691+
var calculatedResult = service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO);
1692+
assertEquals(RED, calculatedResult.status());
1693+
assertEquals("This cluster has 5 unavailable primary shards.", calculatedResult.symptom());
1694+
assertEquals(new SimpleHealthIndicatorDetails(addDefaults(Map.of("unassigned_primaries", 5))), calculatedResult.details());
1695+
assertEquals(
1696+
List.of(
1697+
new HealthIndicatorImpact(
1698+
NAME,
1699+
ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
1700+
1,
1701+
"Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
1702+
+ "return incomplete results.",
1703+
List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
17121704
)
1713-
)
1705+
),
1706+
calculatedResult.impacts()
17141707
);
1708+
assertEquals("Expected 1 diagnosis but got " + calculatedResult.diagnosisList(), 1, calculatedResult.diagnosisList().size());
1709+
var diagnosis = calculatedResult.diagnosisList().get(0);
1710+
assertEquals(ACTION_CHECK_ALLOCATION_EXPLAIN_API, diagnosis.definition());
1711+
assertEquals("Expected 1 affected resource but got " + diagnosis.affectedResources(), 1, diagnosis.affectedResources().size());
1712+
var affectedResource = diagnosis.affectedResources().get(0);
1713+
assertEquals("Expected 2 indices but got " + affectedResource.getValues(), 2, affectedResource.getValues().size());
17151714
}
17161715

17171716
{
@@ -1733,11 +1732,8 @@ public void testLimitNumberOfAffectedResources() {
17331732
}
17341733

17351734
{
1736-
// 0 affected resources
1737-
assertThat(
1738-
service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(),
1739-
equalTo(List.of(new Diagnosis(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of(new Diagnosis.Resource(INDEX, List.of())))))
1740-
);
1735+
// 0 affected resources means we don't do any shard allocation explanation and thus do not report any diagnosis
1736+
assertThat(service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(), equalTo(List.of()));
17411737
}
17421738
}
17431739

0 commit comments

Comments
 (0)