Skip to content

Commit 4b433f4

Browse files
committed
Limit number of allocation explanations in shards_availability health indicator
We currently compute the shard allocation explanation for every unassigned shard (primaries and replicas) in the health report API when `verbose` is `true`, which includes the periodic health logs. Computing the shard allocation explanation of a shard is quite expensive in large clusters. Therefore, when there are lots of unassigned shards, `ShardsAvailabilityHealthIndicatorService` can take a long time to complete - we've seen cases of 2 minutes with 40k unassigned shards. To avoid the runtime of `ShardsAvailabilityHealthIndicatorService` scaling linearly with the number of unassigned shards (times the size of the cluster), we limit the number of allocation explanations we compute to `maxAffectedResourcesCount`, which comes from the `size` parameter of the `_health_report` API and currently defaults to `1000` - a follow-up PR will address the high default size. This significantly reduces the runtime of this health indicator and avoids the periodic health logs from overlapping. A downside of this change is that the returned list of diagnoses may be incomplete. For example, if the `size` parameter is set to `10`, and the first 10 shards are unassigned due to reason `X` and the remaining unassigned shards due to reason `Y`, only reason `X` will be returned in the health API. We accept this downside as we expect that there are generally not many different diagnoses relevant - if more than `size` shards are unassigned, they're likely all unassigned due to the same reason. Users can always increase `size` and/or manually call the allocation explain API to get more detailed information.
1 parent 2b91dd5 commit 4b433f4

File tree

2 files changed

+78
-54
lines changed

2 files changed

+78
-54
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java

Lines changed: 46 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -166,17 +166,18 @@ public String name() {
166166
* primary and replica availability, providing the color, diagnosis, and
167167
* messages about the available or unavailable shards in the cluster.
168168
* @param metadata Metadata for the cluster
169+
* @param maxAffectedResourcesCount Max number of affect resources to return
169170
* @return A new ShardAllocationStatus that has not yet been filled.
170171
*/
171-
public ShardAllocationStatus createNewStatus(Metadata metadata) {
172-
return new ShardAllocationStatus(metadata);
172+
public ShardAllocationStatus createNewStatus(Metadata metadata, int maxAffectedResourcesCount) {
173+
return new ShardAllocationStatus(metadata, maxAffectedResourcesCount);
173174
}
174175

175176
@Override
176177
public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
177178
var state = clusterService.state();
178179
var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY);
179-
var status = createNewStatus(state.getMetadata());
180+
var status = createNewStatus(state.getMetadata(), maxAffectedResourcesCount);
180181
updateShardAllocationStatus(status, state, shutdown, verbose, replicaUnassignedBufferTime);
181182
return createIndicator(
182183
status.getStatus(),
@@ -464,18 +465,33 @@ static void updateShardAllocationStatus(
464465
);
465466

466467
public class ShardAllocationCounts {
467-
int unassigned = 0;
468-
int unassigned_new = 0;
469-
int unassigned_restarting = 0;
470-
int initializing = 0;
471-
int started = 0;
472-
int relocating = 0;
473-
public final Set<ProjectIndexName> indicesWithUnavailableShards = new HashSet<>();
474-
public final Set<ProjectIndexName> indicesWithAllShardsUnavailable = new HashSet<>();
468+
final int maxAffectedResourcesCount;
469+
int unassigned;
470+
int unassigned_new;
471+
int unassigned_restarting;
472+
int initializing;
473+
int started;
474+
int relocating;
475+
public final Set<ProjectIndexName> indicesWithUnavailableShards;
476+
public final Set<ProjectIndexName> indicesWithAllShardsUnavailable;
475477
// We keep the searchable snapshots separately as long as the original index is still available
476478
// This is checked during the post-processing
477-
public SearchableSnapshotsState searchableSnapshotsState = new SearchableSnapshotsState();
478-
final Map<Diagnosis.Definition, Set<ProjectIndexName>> diagnosisDefinitions = new HashMap<>();
479+
public SearchableSnapshotsState searchableSnapshotsState;
480+
final Map<Diagnosis.Definition, Set<ProjectIndexName>> diagnosisDefinitions;
481+
482+
public ShardAllocationCounts(int maxAffectedResourcesCount) {
483+
this.maxAffectedResourcesCount = maxAffectedResourcesCount;
484+
unassigned = 0;
485+
unassigned_new = 0;
486+
unassigned_restarting = 0;
487+
initializing = 0;
488+
started = 0;
489+
relocating = 0;
490+
indicesWithUnavailableShards = new HashSet<>();
491+
indicesWithAllShardsUnavailable = new HashSet<>();
492+
searchableSnapshotsState = new SearchableSnapshotsState();
493+
diagnosisDefinitions = new HashMap<>();
494+
}
479495

480496
public void increment(
481497
ProjectId projectId,
@@ -512,14 +528,22 @@ public void increment(
512528
unassigned_restarting++;
513529
} else {
514530
unassigned++;
515-
if (verbose) {
531+
// Computing the diagnosis can be very expensive in large clusters, so we limit the number of
532+
// computations to the maxAffectedResourcesCount. The main negative side effect of this is that
533+
// we might miss some diagnoses. We are willing to take this risk, and users can always
534+
// use the allocation explain API for more details or increase the maxAffectedResourcesCount.
535+
// Since we have two `SharAllocationCounts` instances (primaries and replicas), we technically
536+
// do 2 * maxAffectedResourcesCount computations, but the added complexity of accurately
537+
// limiting the number of calls doesn't outweigh the benefits, as the main goal is to limit
538+
// the number of computations to a constant rather than a number that grows with the cluster size.
539+
if (verbose && unassigned <= maxAffectedResourcesCount) {
516540
diagnoseUnassignedShardRouting(routing, state).forEach(definition -> addDefinition(definition, projectIndex));
517541
}
518542
}
519543
}
520544
case INITIALIZING -> {
521545
initializing++;
522-
if (verbose) {
546+
if (verbose && initializing <= maxAffectedResourcesCount) {
523547
addDefinition(DIAGNOSIS_WAIT_FOR_INITIALIZATION, projectIndex);
524548
}
525549
}
@@ -957,12 +981,16 @@ public Diagnosis.Definition getIncreaseNodeWithRoleCapacityAction(String role) {
957981
}
958982

959983
public class ShardAllocationStatus {
960-
protected final ShardAllocationCounts primaries = new ShardAllocationCounts();
961-
protected final ShardAllocationCounts replicas = new ShardAllocationCounts();
984+
protected final ShardAllocationCounts primaries;
985+
protected final ShardAllocationCounts replicas;
962986
protected final Metadata clusterMetadata;
987+
protected final int maxAffectedResourcesCount;
963988

964-
public ShardAllocationStatus(Metadata clusterMetadata) {
989+
public ShardAllocationStatus(Metadata clusterMetadata, int maxAffectedResourcesCount) {
965990
this.clusterMetadata = clusterMetadata;
991+
this.maxAffectedResourcesCount = maxAffectedResourcesCount;
992+
primaries = new ShardAllocationCounts(maxAffectedResourcesCount);
993+
replicas = new ShardAllocationCounts(maxAffectedResourcesCount);
966994
}
967995

968996
void addPrimary(ProjectId projectId, ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) {

server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java

Lines changed: 32 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ public void testAllReplicasUnassigned() {
361361
List.of()
362362
);
363363
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
364-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
364+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
365365
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
366366
status,
367367
clusterState,
@@ -386,7 +386,7 @@ public void testAllReplicasUnassigned() {
386386
List.of()
387387
);
388388
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
389-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
389+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
390390
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
391391
status,
392392
clusterState,
@@ -411,7 +411,7 @@ public void testAllReplicasUnassigned() {
411411
List.of()
412412
);
413413
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
414-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
414+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
415415
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
416416
status,
417417
clusterState,
@@ -438,7 +438,7 @@ public void testAllReplicasUnassigned() {
438438
);
439439

440440
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
441-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
441+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
442442
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
443443
status,
444444
clusterState,
@@ -477,7 +477,7 @@ public void testAllReplicasUnassigned() {
477477
List.of()
478478
);
479479
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
480-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
480+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
481481
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
482482
status,
483483
clusterState,
@@ -508,7 +508,7 @@ public void testAllReplicasUnassigned() {
508508
ProjectId projectId = randomProjectIdOrDefault();
509509
var clusterState = createClusterStateWith(projectId, List.of(routingTable), List.of());
510510
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
511-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
511+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
512512
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
513513
status,
514514
clusterState,
@@ -534,7 +534,7 @@ public void testAllReplicasUnassigned() {
534534
List.of()
535535
);
536536
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
537-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
537+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
538538
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
539539
status,
540540
clusterState,
@@ -1791,32 +1791,31 @@ public void testLimitNumberOfAffectedResources() {
17911791

17921792
{
17931793
// assert the full result to check that details, impacts, and symptoms use the correct count of affected indices (5)
1794-
assertThat(
1795-
service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO),
1796-
equalTo(
1797-
createExpectedResult(
1798-
RED,
1799-
"This cluster has 5 unavailable primary shards.",
1800-
Map.of("unassigned_primaries", 5),
1801-
List.of(
1802-
new HealthIndicatorImpact(
1803-
NAME,
1804-
ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
1805-
1,
1806-
"Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
1807-
+ "return incomplete results.",
1808-
List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
1809-
)
1810-
),
1811-
List.of(
1812-
new Diagnosis(
1813-
ACTION_CHECK_ALLOCATION_EXPLAIN_API,
1814-
List.of(new Diagnosis.Resource(INDEX, List.of("red-index1", "red-index2")))
1815-
)
1816-
)
1794+
// since we limit the number of allocation explanations while looping over the shards, we can't guarantee
1795+
// which indices end up in the affected resources list, but we can at least check that the size is correct
1796+
var calculatedResult = service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO);
1797+
assertEquals(RED, calculatedResult.status());
1798+
assertEquals("This cluster has 5 unavailable primary shards.", calculatedResult.symptom());
1799+
assertEquals(new SimpleHealthIndicatorDetails(addDefaults(Map.of("unassigned_primaries", 5))), calculatedResult.details());
1800+
assertEquals(
1801+
List.of(
1802+
new HealthIndicatorImpact(
1803+
NAME,
1804+
ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
1805+
1,
1806+
"Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
1807+
+ "return incomplete results.",
1808+
List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
18171809
)
1818-
)
1810+
),
1811+
calculatedResult.impacts()
18191812
);
1813+
assertEquals("Expected 1 diagnosis but got " + calculatedResult.diagnosisList(), 1, calculatedResult.diagnosisList().size());
1814+
var diagnosis = calculatedResult.diagnosisList().get(0);
1815+
assertEquals(ACTION_CHECK_ALLOCATION_EXPLAIN_API, diagnosis.definition());
1816+
assertEquals("Expected 1 affected resource but got " + diagnosis.affectedResources(), 1, diagnosis.affectedResources().size());
1817+
var affectedResource = diagnosis.affectedResources().get(0);
1818+
assertEquals("Expected 2 indices but got " + affectedResource.getValues(), 2, affectedResource.getValues().size());
18201819
}
18211820

18221821
{
@@ -1838,11 +1837,8 @@ public void testLimitNumberOfAffectedResources() {
18381837
}
18391838

18401839
{
1841-
// 0 affected resources
1842-
assertThat(
1843-
service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(),
1844-
equalTo(List.of(new Diagnosis(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of(new Diagnosis.Resource(INDEX, List.of())))))
1845-
);
1840+
// 0 affected resources means we don't do any shard allocation explanation and thus do not report any diagnosis
1841+
assertThat(service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(), equalTo(List.of()));
18461842
}
18471843
}
18481844

0 commit comments

Comments
 (0)