From 4b433f49002520f3fe9d845737541acac15fc726 Mon Sep 17 00:00:00 2001
From: Niels Bauman <nielsbauman@gmail.com>
Date: Mon, 6 Oct 2025 15:36:37 -0300
Subject: [PATCH 1/2] Limit number of allocation explanations in
 `shards_availability` health indicator

We currently compute the shard allocation explanation for every
unassigned shard (primaries and replicas) in the health report API when
`verbose` is `true`, which includes the periodic health logs. Computing
the shard allocation explanation of a shard is quite expensive in large
clusters. Therefore, when there are lots of unassigned shards,
`ShardsAvailabilityHealthIndicatorService` can take a long time to
complete - we've seen cases of 2 minutes with 40k unassigned shards.

To avoid the runtime of `ShardsAvailabilityHealthIndicatorService`
scaling linearly with the number of unassigned shards (times the size of
the cluster), we limit the number of allocation explanations we compute
to `maxAffectedResourcesCount`, which comes from the `size` parameter of
the `_health_report` API and currently defaults to `1000` - a follow-up
PR will address the high default size. This significantly reduces the
runtime of this health indicator and avoids the periodic health logs
from overlapping.

A downside of this change is that the returned list of diagnoses may be
incomplete. For example, if the `size` parameter is set to `10`, and the
first 10 shards are unassigned due to reason `X` and the remaining
unassigned shards due to reason `Y`, only reason `X` will be returned in
the health API. We accept this downside as we expect that there are
generally not many different diagnoses relevant - if more than `size`
shards are unassigned, they're likely all unassigned due to the same
reason. Users can always increase `size` and/or manually call the
allocation explain API to get more detailed information.
---
 ...rdsAvailabilityHealthIndicatorService.java | 64 ++++++++++++-----
 ...ailabilityHealthIndicatorServiceTests.java | 68 +++++++++----------
 2 files changed, 78 insertions(+), 54 deletions(-)

diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java
index e642ab3bd18d0..55a57d32dfa84 100644
--- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java
+++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java
@@ -166,17 +166,18 @@ public String name() {
      * primary and replica availability, providing the color, diagnosis, and
      * messages about the available or unavailable shards in the cluster.
      * @param metadata Metadata for the cluster
+     * @param maxAffectedResourcesCount Max number of affect resources to return
      * @return A new ShardAllocationStatus that has not yet been filled.
      */
-    public ShardAllocationStatus createNewStatus(Metadata metadata) {
-        return new ShardAllocationStatus(metadata);
+    public ShardAllocationStatus createNewStatus(Metadata metadata, int maxAffectedResourcesCount) {
+        return new ShardAllocationStatus(metadata, maxAffectedResourcesCount);
     }
 
     @Override
     public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
         var state = clusterService.state();
         var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY);
-        var status = createNewStatus(state.getMetadata());
+        var status = createNewStatus(state.getMetadata(), maxAffectedResourcesCount);
         updateShardAllocationStatus(status, state, shutdown, verbose, replicaUnassignedBufferTime);
         return createIndicator(
             status.getStatus(),
@@ -464,18 +465,33 @@ static void updateShardAllocationStatus(
         );
 
     public class ShardAllocationCounts {
-        int unassigned = 0;
-        int unassigned_new = 0;
-        int unassigned_restarting = 0;
-        int initializing = 0;
-        int started = 0;
-        int relocating = 0;
-        public final Set<ProjectIndexName> indicesWithUnavailableShards = new HashSet<>();
-        public final Set<ProjectIndexName> indicesWithAllShardsUnavailable = new HashSet<>();
+        final int maxAffectedResourcesCount;
+        int unassigned;
+        int unassigned_new;
+        int unassigned_restarting;
+        int initializing;
+        int started;
+        int relocating;
+        public final Set<ProjectIndexName> indicesWithUnavailableShards;
+        public final Set<ProjectIndexName> indicesWithAllShardsUnavailable;
         // We keep the searchable snapshots separately as long as the original index is still available
         // This is checked during the post-processing
-        public SearchableSnapshotsState searchableSnapshotsState = new SearchableSnapshotsState();
-        final Map<Diagnosis.Definition, Set<ProjectIndexName>> diagnosisDefinitions = new HashMap<>();
+        public SearchableSnapshotsState searchableSnapshotsState;
+        final Map<Diagnosis.Definition, Set<ProjectIndexName>> diagnosisDefinitions;
+
+        public ShardAllocationCounts(int maxAffectedResourcesCount) {
+            this.maxAffectedResourcesCount = maxAffectedResourcesCount;
+            unassigned = 0;
+            unassigned_new = 0;
+            unassigned_restarting = 0;
+            initializing = 0;
+            started = 0;
+            relocating = 0;
+            indicesWithUnavailableShards = new HashSet<>();
+            indicesWithAllShardsUnavailable = new HashSet<>();
+            searchableSnapshotsState = new SearchableSnapshotsState();
+            diagnosisDefinitions = new HashMap<>();
+        }
 
         public void increment(
             ProjectId projectId,
@@ -512,14 +528,22 @@ public void increment(
                         unassigned_restarting++;
                     } else {
                         unassigned++;
-                        if (verbose) {
+                        // Computing the diagnosis can be very expensive in large clusters, so we limit the number of
+                        // computations to the maxAffectedResourcesCount. The main negative side effect of this is that
+                        // we might miss some diagnoses. We are willing to take this risk, and users can always
+                        // use the allocation explain API for more details or increase the maxAffectedResourcesCount.
+                        // Since we have two `SharAllocationCounts` instances (primaries and replicas), we technically
+                        // do 2 * maxAffectedResourcesCount computations, but the added complexity of accurately
+                        // limiting the number of calls doesn't outweigh the benefits, as the main goal is to limit
+                        // the number of computations to a constant rather than a number that grows with the cluster size.
+                        if (verbose && unassigned <= maxAffectedResourcesCount) {
                             diagnoseUnassignedShardRouting(routing, state).forEach(definition -> addDefinition(definition, projectIndex));
                         }
                     }
                 }
                 case INITIALIZING -> {
                     initializing++;
-                    if (verbose) {
+                    if (verbose && initializing <= maxAffectedResourcesCount) {
                         addDefinition(DIAGNOSIS_WAIT_FOR_INITIALIZATION, projectIndex);
                     }
                 }
@@ -957,12 +981,16 @@ public Diagnosis.Definition getIncreaseNodeWithRoleCapacityAction(String role) {
     }
 
     public class ShardAllocationStatus {
-        protected final ShardAllocationCounts primaries = new ShardAllocationCounts();
-        protected final ShardAllocationCounts replicas = new ShardAllocationCounts();
+        protected final ShardAllocationCounts primaries;
+        protected final ShardAllocationCounts replicas;
         protected final Metadata clusterMetadata;
+        protected final int maxAffectedResourcesCount;
 
-        public ShardAllocationStatus(Metadata clusterMetadata) {
+        public ShardAllocationStatus(Metadata clusterMetadata, int maxAffectedResourcesCount) {
             this.clusterMetadata = clusterMetadata;
+            this.maxAffectedResourcesCount = maxAffectedResourcesCount;
+            primaries = new ShardAllocationCounts(maxAffectedResourcesCount);
+            replicas = new ShardAllocationCounts(maxAffectedResourcesCount);
         }
 
         void addPrimary(ProjectId projectId, ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) {
diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java
index c3de0c954c49a..72fdc3e71fcb7 100644
--- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java
+++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java
@@ -361,7 +361,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -386,7 +386,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -411,7 +411,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -438,7 +438,7 @@ public void testAllReplicasUnassigned() {
             );
 
             var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -477,7 +477,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -508,7 +508,7 @@ public void testAllReplicasUnassigned() {
             ProjectId projectId = randomProjectIdOrDefault();
             var clusterState = createClusterStateWith(projectId, List.of(routingTable), List.of());
             var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -534,7 +534,7 @@ public void testAllReplicasUnassigned() {
                 List.of()
             );
             var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
-            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
+            ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
             ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
                 status,
                 clusterState,
@@ -1791,32 +1791,31 @@ public void testLimitNumberOfAffectedResources() {
 
         {
             // assert the full result to check that details, impacts, and symptoms use the correct count of affected indices (5)
-            assertThat(
-                service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO),
-                equalTo(
-                    createExpectedResult(
-                        RED,
-                        "This cluster has 5 unavailable primary shards.",
-                        Map.of("unassigned_primaries", 5),
-                        List.of(
-                            new HealthIndicatorImpact(
-                                NAME,
-                                ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
-                                1,
-                                "Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
-                                    + "return incomplete results.",
-                                List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
-                            )
-                        ),
-                        List.of(
-                            new Diagnosis(
-                                ACTION_CHECK_ALLOCATION_EXPLAIN_API,
-                                List.of(new Diagnosis.Resource(INDEX, List.of("red-index1", "red-index2")))
-                            )
-                        )
+            // since we limit the number of allocation explanations while looping over the shards, we can't guarantee
+            // which indices end up in the affected resources list, but we can at least check that the size is correct
+            var calculatedResult = service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO);
+            assertEquals(RED, calculatedResult.status());
+            assertEquals("This cluster has 5 unavailable primary shards.", calculatedResult.symptom());
+            assertEquals(new SimpleHealthIndicatorDetails(addDefaults(Map.of("unassigned_primaries", 5))), calculatedResult.details());
+            assertEquals(
+                List.of(
+                    new HealthIndicatorImpact(
+                        NAME,
+                        ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
+                        1,
+                        "Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
+                            + "return incomplete results.",
+                        List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
                     )
-                )
+                ),
+                calculatedResult.impacts()
             );
+            assertEquals("Expected 1 diagnosis but got " + calculatedResult.diagnosisList(), 1, calculatedResult.diagnosisList().size());
+            var diagnosis = calculatedResult.diagnosisList().get(0);
+            assertEquals(ACTION_CHECK_ALLOCATION_EXPLAIN_API, diagnosis.definition());
+            assertEquals("Expected 1 affected resource but got " + diagnosis.affectedResources(), 1, diagnosis.affectedResources().size());
+            var affectedResource = diagnosis.affectedResources().get(0);
+            assertEquals("Expected 2 indices but got " + affectedResource.getValues(), 2, affectedResource.getValues().size());
         }
 
         {
@@ -1838,11 +1837,8 @@ public void testLimitNumberOfAffectedResources() {
         }
 
         {
-            // 0 affected resources
-            assertThat(
-                service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(),
-                equalTo(List.of(new Diagnosis(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of(new Diagnosis.Resource(INDEX, List.of())))))
-            );
+            // 0 affected resources means we don't do any shard allocation explanation and thus do not report any diagnosis
+            assertThat(service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(), equalTo(List.of()));
         }
     }
 

From b8eb086c7d1e6f32eb035749bd40604326fbf2e7 Mon Sep 17 00:00:00 2001
From: Niels Bauman <33722607+nielsbauman@users.noreply.github.com>
Date: Mon, 6 Oct 2025 17:55:12 -0300
Subject: [PATCH 2/2] Update docs/changelog/136060.yaml

---
 docs/changelog/136060.yaml | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 docs/changelog/136060.yaml

diff --git a/docs/changelog/136060.yaml b/docs/changelog/136060.yaml
new file mode 100644
index 0000000000000..1b07c57cd351a
--- /dev/null
+++ b/docs/changelog/136060.yaml
@@ -0,0 +1,5 @@
+pr: 136060
+summary: Limit number of allocation explanations in `shards_availability` health indicator
+area: Health
+type: enhancement
+issues: []