From 4b433f49002520f3fe9d845737541acac15fc726 Mon Sep 17 00:00:00 2001 From: Niels Bauman Date: Mon, 6 Oct 2025 15:36:37 -0300 Subject: [PATCH 1/2] Limit number of allocation explanations in `shards_availability` health indicator We currently compute the shard allocation explanation for every unassigned shard (primaries and replicas) in the health report API when `verbose` is `true`, which includes the periodic health logs. Computing the shard allocation explanation of a shard is quite expensive in large clusters. Therefore, when there are lots of unassigned shards, `ShardsAvailabilityHealthIndicatorService` can take a long time to complete - we've seen cases of 2 minutes with 40k unassigned shards. To avoid the runtime of `ShardsAvailabilityHealthIndicatorService` scaling linearly with the number of unassigned shards (times the size of the cluster), we limit the number of allocation explanations we compute to `maxAffectedResourcesCount`, which comes from the `size` parameter of the `_health_report` API and currently defaults to `1000` - a follow-up PR will address the high default size. This significantly reduces the runtime of this health indicator and avoids the periodic health logs from overlapping. A downside of this change is that the returned list of diagnoses may be incomplete. For example, if the `size` parameter is set to `10`, and the first 10 shards are unassigned due to reason `X` and the remaining unassigned shards due to reason `Y`, only reason `X` will be returned in the health API. We accept this downside as we expect that there are generally not many different diagnoses relevant - if more than `size` shards are unassigned, they're likely all unassigned due to the same reason. Users can always increase `size` and/or manually call the allocation explain API to get more detailed information. --- ...rdsAvailabilityHealthIndicatorService.java | 64 ++++++++++++----- ...ailabilityHealthIndicatorServiceTests.java | 68 +++++++++---------- 2 files changed, 78 insertions(+), 54 deletions(-) diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java index e642ab3bd18d0..55a57d32dfa84 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java @@ -166,17 +166,18 @@ public String name() { * primary and replica availability, providing the color, diagnosis, and * messages about the available or unavailable shards in the cluster. * @param metadata Metadata for the cluster + * @param maxAffectedResourcesCount Max number of affect resources to return * @return A new ShardAllocationStatus that has not yet been filled. */ - public ShardAllocationStatus createNewStatus(Metadata metadata) { - return new ShardAllocationStatus(metadata); + public ShardAllocationStatus createNewStatus(Metadata metadata, int maxAffectedResourcesCount) { + return new ShardAllocationStatus(metadata, maxAffectedResourcesCount); } @Override public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) { var state = clusterService.state(); var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY); - var status = createNewStatus(state.getMetadata()); + var status = createNewStatus(state.getMetadata(), maxAffectedResourcesCount); updateShardAllocationStatus(status, state, shutdown, verbose, replicaUnassignedBufferTime); return createIndicator( status.getStatus(), @@ -464,18 +465,33 @@ static void updateShardAllocationStatus( ); public class ShardAllocationCounts { - int unassigned = 0; - int unassigned_new = 0; - int unassigned_restarting = 0; - int initializing = 0; - int started = 0; - int relocating = 0; - public final Set indicesWithUnavailableShards = new HashSet<>(); - public final Set indicesWithAllShardsUnavailable = new HashSet<>(); + final int maxAffectedResourcesCount; + int unassigned; + int unassigned_new; + int unassigned_restarting; + int initializing; + int started; + int relocating; + public final Set indicesWithUnavailableShards; + public final Set indicesWithAllShardsUnavailable; // We keep the searchable snapshots separately as long as the original index is still available // This is checked during the post-processing - public SearchableSnapshotsState searchableSnapshotsState = new SearchableSnapshotsState(); - final Map> diagnosisDefinitions = new HashMap<>(); + public SearchableSnapshotsState searchableSnapshotsState; + final Map> diagnosisDefinitions; + + public ShardAllocationCounts(int maxAffectedResourcesCount) { + this.maxAffectedResourcesCount = maxAffectedResourcesCount; + unassigned = 0; + unassigned_new = 0; + unassigned_restarting = 0; + initializing = 0; + started = 0; + relocating = 0; + indicesWithUnavailableShards = new HashSet<>(); + indicesWithAllShardsUnavailable = new HashSet<>(); + searchableSnapshotsState = new SearchableSnapshotsState(); + diagnosisDefinitions = new HashMap<>(); + } public void increment( ProjectId projectId, @@ -512,14 +528,22 @@ public void increment( unassigned_restarting++; } else { unassigned++; - if (verbose) { + // Computing the diagnosis can be very expensive in large clusters, so we limit the number of + // computations to the maxAffectedResourcesCount. The main negative side effect of this is that + // we might miss some diagnoses. We are willing to take this risk, and users can always + // use the allocation explain API for more details or increase the maxAffectedResourcesCount. + // Since we have two `SharAllocationCounts` instances (primaries and replicas), we technically + // do 2 * maxAffectedResourcesCount computations, but the added complexity of accurately + // limiting the number of calls doesn't outweigh the benefits, as the main goal is to limit + // the number of computations to a constant rather than a number that grows with the cluster size. + if (verbose && unassigned <= maxAffectedResourcesCount) { diagnoseUnassignedShardRouting(routing, state).forEach(definition -> addDefinition(definition, projectIndex)); } } } case INITIALIZING -> { initializing++; - if (verbose) { + if (verbose && initializing <= maxAffectedResourcesCount) { addDefinition(DIAGNOSIS_WAIT_FOR_INITIALIZATION, projectIndex); } } @@ -957,12 +981,16 @@ public Diagnosis.Definition getIncreaseNodeWithRoleCapacityAction(String role) { } public class ShardAllocationStatus { - protected final ShardAllocationCounts primaries = new ShardAllocationCounts(); - protected final ShardAllocationCounts replicas = new ShardAllocationCounts(); + protected final ShardAllocationCounts primaries; + protected final ShardAllocationCounts replicas; protected final Metadata clusterMetadata; + protected final int maxAffectedResourcesCount; - public ShardAllocationStatus(Metadata clusterMetadata) { + public ShardAllocationStatus(Metadata clusterMetadata, int maxAffectedResourcesCount) { this.clusterMetadata = clusterMetadata; + this.maxAffectedResourcesCount = maxAffectedResourcesCount; + primaries = new ShardAllocationCounts(maxAffectedResourcesCount); + replicas = new ShardAllocationCounts(maxAffectedResourcesCount); } void addPrimary(ProjectId projectId, ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) { diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java index c3de0c954c49a..72fdc3e71fcb7 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java @@ -361,7 +361,7 @@ public void testAllReplicasUnassigned() { List.of() ); var service = createShardsAvailabilityIndicatorService(projectId, clusterState); - ShardAllocationStatus status = service.createNewStatus(clusterState.metadata()); + ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt()); ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus( status, clusterState, @@ -386,7 +386,7 @@ public void testAllReplicasUnassigned() { List.of() ); var service = createShardsAvailabilityIndicatorService(projectId, clusterState); - ShardAllocationStatus status = service.createNewStatus(clusterState.metadata()); + ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt()); ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus( status, clusterState, @@ -411,7 +411,7 @@ public void testAllReplicasUnassigned() { List.of() ); var service = createShardsAvailabilityIndicatorService(projectId, clusterState); - ShardAllocationStatus status = service.createNewStatus(clusterState.metadata()); + ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt()); ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus( status, clusterState, @@ -438,7 +438,7 @@ public void testAllReplicasUnassigned() { ); var service = createShardsAvailabilityIndicatorService(projectId, clusterState); - ShardAllocationStatus status = service.createNewStatus(clusterState.metadata()); + ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt()); ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus( status, clusterState, @@ -477,7 +477,7 @@ public void testAllReplicasUnassigned() { List.of() ); var service = createShardsAvailabilityIndicatorService(projectId, clusterState); - ShardAllocationStatus status = service.createNewStatus(clusterState.metadata()); + ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt()); ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus( status, clusterState, @@ -508,7 +508,7 @@ public void testAllReplicasUnassigned() { ProjectId projectId = randomProjectIdOrDefault(); var clusterState = createClusterStateWith(projectId, List.of(routingTable), List.of()); var service = createShardsAvailabilityIndicatorService(projectId, clusterState); - ShardAllocationStatus status = service.createNewStatus(clusterState.metadata()); + ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt()); ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus( status, clusterState, @@ -534,7 +534,7 @@ public void testAllReplicasUnassigned() { List.of() ); var service = createShardsAvailabilityIndicatorService(projectId, clusterState); - ShardAllocationStatus status = service.createNewStatus(clusterState.metadata()); + ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt()); ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus( status, clusterState, @@ -1791,32 +1791,31 @@ public void testLimitNumberOfAffectedResources() { { // assert the full result to check that details, impacts, and symptoms use the correct count of affected indices (5) - assertThat( - service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO), - equalTo( - createExpectedResult( - RED, - "This cluster has 5 unavailable primary shards.", - Map.of("unassigned_primaries", 5), - List.of( - new HealthIndicatorImpact( - NAME, - ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID, - 1, - "Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might " - + "return incomplete results.", - List.of(ImpactArea.INGEST, ImpactArea.SEARCH) - ) - ), - List.of( - new Diagnosis( - ACTION_CHECK_ALLOCATION_EXPLAIN_API, - List.of(new Diagnosis.Resource(INDEX, List.of("red-index1", "red-index2"))) - ) - ) + // since we limit the number of allocation explanations while looping over the shards, we can't guarantee + // which indices end up in the affected resources list, but we can at least check that the size is correct + var calculatedResult = service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO); + assertEquals(RED, calculatedResult.status()); + assertEquals("This cluster has 5 unavailable primary shards.", calculatedResult.symptom()); + assertEquals(new SimpleHealthIndicatorDetails(addDefaults(Map.of("unassigned_primaries", 5))), calculatedResult.details()); + assertEquals( + List.of( + new HealthIndicatorImpact( + NAME, + ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID, + 1, + "Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might " + + "return incomplete results.", + List.of(ImpactArea.INGEST, ImpactArea.SEARCH) ) - ) + ), + calculatedResult.impacts() ); + assertEquals("Expected 1 diagnosis but got " + calculatedResult.diagnosisList(), 1, calculatedResult.diagnosisList().size()); + var diagnosis = calculatedResult.diagnosisList().get(0); + assertEquals(ACTION_CHECK_ALLOCATION_EXPLAIN_API, diagnosis.definition()); + assertEquals("Expected 1 affected resource but got " + diagnosis.affectedResources(), 1, diagnosis.affectedResources().size()); + var affectedResource = diagnosis.affectedResources().get(0); + assertEquals("Expected 2 indices but got " + affectedResource.getValues(), 2, affectedResource.getValues().size()); } { @@ -1838,11 +1837,8 @@ public void testLimitNumberOfAffectedResources() { } { - // 0 affected resources - assertThat( - service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(), - equalTo(List.of(new Diagnosis(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of(new Diagnosis.Resource(INDEX, List.of()))))) - ); + // 0 affected resources means we don't do any shard allocation explanation and thus do not report any diagnosis + assertThat(service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(), equalTo(List.of())); } } From b8eb086c7d1e6f32eb035749bd40604326fbf2e7 Mon Sep 17 00:00:00 2001 From: Niels Bauman <33722607+nielsbauman@users.noreply.github.com> Date: Mon, 6 Oct 2025 17:55:12 -0300 Subject: [PATCH 2/2] Update docs/changelog/136060.yaml --- docs/changelog/136060.yaml | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 docs/changelog/136060.yaml diff --git a/docs/changelog/136060.yaml b/docs/changelog/136060.yaml new file mode 100644 index 0000000000000..1b07c57cd351a --- /dev/null +++ b/docs/changelog/136060.yaml @@ -0,0 +1,5 @@ +pr: 136060 +summary: Limit number of allocation explanations in `shards_availability` health indicator +area: Health +type: enhancement +issues: []