Limit number of allocation explanations in shards_availability health indicator (#136060) (#136471)

nielsbauman · web-flow · commit 6f30995da99c · 2025-10-13T12:55:08.000+02:00
We currently compute the shard allocation explanation for every unassigned shard (primaries and replicas) in the health report API when `verbose` is `true`, which includes the periodic health logs. Computing the shard allocation explanation of a shard is quite expensive in large clusters. Therefore, when there are lots of unassigned shards, `ShardsAvailabilityHealthIndicatorService` can take a long time to complete - we've seen cases of 2 minutes with 40k unassigned shards. To avoid the runtime of `ShardsAvailabilityHealthIndicatorService` scaling linearly with the number of unassigned shards (times the size of the cluster), we limit the number of allocation explanations we compute to `maxAffectedResourcesCount`, which comes from the `size` parameter of the `_health_report` API and currently defaults to `1000` - a follow-up PR will address the high default size. The limit is applied separately to primaries and replicas, so at most `2 * maxAffectedResourcesCount` explanations are computed. This significantly reduces the runtime of this health indicator and prevents the periodic health logs from overlapping. A downside of this change is that the returned list of diagnoses may be incomplete. For example, if the `size` parameter is set to `10`, and the first 10 shards are unassigned due to reason `X` and the remaining unassigned shards due to reason `Y`, only reason `X` will be returned in the health API. We accept this downside as we expect that there are generally not many different relevant diagnoses - if more than `size` shards are unassigned, they're likely all unassigned due to the same reason. Users can always increase `size` and/or manually call the allocation explain API to get more detailed information. (cherry picked from commit ede1d06) # Conflicts: # server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java # server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java
diff --git a/docs/changelog/136060.yaml b/docs/changelog/136060.yaml
@@ -0,0 +1,5 @@
+pr: 136060
+summary: Limit number of allocation explanations in `shards_availability` health indicator
+area: Health
+type: bug
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java
@@ -161,17 +161,18 @@ public String name() {
      * primary and replica availability, providing the color, diagnosis, and
      * messages about the available or unavailable shards in the cluster.
      * @param metadata Metadata for the cluster
+     * @param maxAffectedResourcesCount Max number of affected resources to return
      * @return A new ShardAllocationStatus that has not yet been filled.
      */
-    public ShardAllocationStatus createNewStatus(Metadata metadata) {
-        return new ShardAllocationStatus(metadata);
+    public ShardAllocationStatus createNewStatus(Metadata metadata, int maxAffectedResourcesCount) {
+        return new ShardAllocationStatus(metadata, maxAffectedResourcesCount);
     }
 
     @Override
     public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
         var state = clusterService.state();
         var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY);
-        var status = createNewStatus(state.getMetadata());
+        var status = createNewStatus(state.getMetadata(), maxAffectedResourcesCount);
         updateShardAllocationStatus(status, state, shutdown, verbose, replicaUnassignedBufferTime);
         return createIndicator(
             status.getStatus(),
@@ -454,18 +455,33 @@ static void updateShardAllocationStatus(
         );
 
     public class ShardAllocationCounts {
-        int unassigned = 0;
-        int unassigned_new = 0;
-        int unassigned_restarting = 0;
-        int initializing = 0;
-        int started = 0;
-        int relocating = 0;
-        public final Set<String> indicesWithUnavailableShards = new HashSet<>();
-        public final Set<String> indicesWithAllShardsUnavailable = new HashSet<>();
+        final int maxAffectedResourcesCount;
+        int unassigned;
+        int unassigned_new;
+        int unassigned_restarting;
+        int initializing;
+        int started;
+        int relocating;
+        public final Set<String> indicesWithUnavailableShards;
+        public final Set<String> indicesWithAllShardsUnavailable;
         // We keep the searchable snapshots separately as long as the original index is still available
         // This is checked during the post-processing
-        public SearchableSnapshotsState searchableSnapshotsState = new SearchableSnapshotsState();
-        final Map<Diagnosis.Definition, Set<String>> diagnosisDefinitions = new HashMap<>();
+        public SearchableSnapshotsState searchableSnapshotsState;
+        final Map<Diagnosis.Definition, Set<String>> diagnosisDefinitions;
+
+        public ShardAllocationCounts(int maxAffectedResourcesCount) {
+            this.maxAffectedResourcesCount = maxAffectedResourcesCount;
+            unassigned = 0;
+            unassigned_new = 0;
+            unassigned_restarting = 0;
+            initializing = 0;
+            started = 0;
+            relocating = 0;
+            indicesWithUnavailableShards = new HashSet<>();
+            indicesWithAllShardsUnavailable = new HashSet<>();
+            searchableSnapshotsState = new SearchableSnapshotsState();
+            diagnosisDefinitions = new HashMap<>();
+        }
 
         public void increment(
             ShardRouting routing,
@@ -500,7 +516,15 @@ public void increment(
                         unassigned_restarting++;
                     } else {
                         unassigned++;
-                        if (verbose) {
+                        // Computing the diagnosis can be very expensive in large clusters, so we limit the number of
+                        // computations to the maxAffectedResourcesCount. The main negative side effect of this is that
+                        // we might miss some diagnoses. We are willing to take this risk, and users can always
+                        // use the allocation explain API for more details or increase the maxAffectedResourcesCount.
+                        // Since we have two ShardAllocationCounts instances (primaries and replicas), we technically
+                        // do up to 2 * maxAffectedResourcesCount computations, but the benefit of limiting the number
+                        // of calls exactly doesn't outweigh the added complexity, as the main goal is to limit
+                        // the number of computations to a constant rather than a number that grows with the cluster size.
+                        if (verbose && unassigned <= maxAffectedResourcesCount) {
                             diagnoseUnassignedShardRouting(routing, state).forEach(
                                 definition -> addDefinition(definition, routing.getIndexName())
                             );
@@ -942,12 +966,16 @@ public Diagnosis.Definition getIncreaseNodeWithRoleCapacityAction(String role) {
     }
 
     public class ShardAllocationStatus {
-        protected final ShardAllocationCounts primaries = new ShardAllocationCounts();
-        protected final ShardAllocationCounts replicas = new ShardAllocationCounts();
+        protected final ShardAllocationCounts primaries;
+        protected final ShardAllocationCounts replicas;
         protected final Metadata clusterMetadata;
+        protected final int maxAffectedResourcesCount;
 
-        public ShardAllocationStatus(Metadata clusterMetadata) {
+        public ShardAllocationStatus(Metadata clusterMetadata, int maxAffectedResourcesCount) {
             this.clusterMetadata = clusterMetadata;
+            this.maxAffectedResourcesCount = maxAffectedResourcesCount;
+            primaries = new ShardAllocationCounts(maxAffectedResourcesCount);
+            replicas = new ShardAllocationCounts(maxAffectedResourcesCount);
         }
 
         void addPrimary(ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) {
diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java
@@ -336,7 +336,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -359,7 +359,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -382,7 +382,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -407,7 +407,7 @@ public void testAllReplicasUnassigned() {
             );
 
             var service = createShardsAvailabilityIndicatorService(clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -444,7 +444,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -474,7 +474,7 @@ public void testAllReplicasUnassigned() {
             );
             ClusterState clusterState = createClusterStateWith(List.of(routingTable), List.of());
             var service = createShardsAvailabilityIndicatorService(clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -498,7 +498,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -1686,32 +1686,31 @@ public void testLimitNumberOfAffectedResources() {
 
         {
             // assert the full result to check that details, impacts, and symptoms use the correct count of affected indices (5)
-            assertThat(
-                service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO),
-                equalTo(
-                    createExpectedResult(
-                        RED,
-                        "This cluster has 5 unavailable primary shards.",
-                        Map.of("unassigned_primaries", 5),
-                        List.of(
-                            new HealthIndicatorImpact(
-                                NAME,
-                                ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
-                                1,
-                                "Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
-                                    + "return incomplete results.",
-                                List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
-                            )
-                        ),
-                        List.of(
-                            new Diagnosis(
-                                ACTION_CHECK_ALLOCATION_EXPLAIN_API,
-                                List.of(new Diagnosis.Resource(INDEX, List.of("red-index1", "red-index2")))
-                            )
-                        )
+            // since we limit the number of allocation explanations while looping over the shards, we can't guarantee
+            // which indices end up in the affected resources list, but we can at least check that the size is correct
+            var calculatedResult = service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO);
+            assertEquals(RED, calculatedResult.status());
+            assertEquals("This cluster has 5 unavailable primary shards.", calculatedResult.symptom());
+            assertEquals(new SimpleHealthIndicatorDetails(addDefaults(Map.of("unassigned_primaries", 5))), calculatedResult.details());
+            assertEquals(
+                List.of(
+                    new HealthIndicatorImpact(
+                        NAME,
+                        ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
+                        1,
+                        "Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
+                            + "return incomplete results.",
+                        List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
                     )
-                )
+                ),
+                calculatedResult.impacts()
             );
+            assertEquals("Expected 1 diagnosis but got " + calculatedResult.diagnosisList(), 1, calculatedResult.diagnosisList().size());
+            var diagnosis = calculatedResult.diagnosisList().get(0);
+            assertEquals(ACTION_CHECK_ALLOCATION_EXPLAIN_API, diagnosis.definition());
+            assertEquals("Expected 1 affected resource but got " + diagnosis.affectedResources(), 1, diagnosis.affectedResources().size());
+            var affectedResource = diagnosis.affectedResources().get(0);
+            assertEquals("Expected 2 indices but got " + affectedResource.getValues(), 2, affectedResource.getValues().size());
         }
 
         {
@@ -1733,11 +1732,8 @@ public void testLimitNumberOfAffectedResources() {
         }
 
         {
-            // 0 affected resources
-            assertThat(
-                service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(),
-                equalTo(List.of(new Diagnosis(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of(new Diagnosis.Resource(INDEX, List.of())))))
-            );
+            // 0 affected resources means we don't do any shard allocation explanation and thus do not report any diagnosis
+            assertThat(service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(), equalTo(List.of()));
         }
     }