Skip to content

Commit e02f8b4

Browse files
committed
Limit number of allocation explanations in shards_availability health indicator
1 parent 40f90fb commit e02f8b4

File tree

3 files changed

+79
-54
lines changed

3 files changed

+79
-54
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorService.java

Lines changed: 42 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -166,17 +166,18 @@ public String name() {
166166
* primary and replica availability, providing the color, diagnosis, and
167167
* messages about the available or unavailable shards in the cluster.
168168
* @param metadata Metadata for the cluster
169+
* @param maxAffectedResourcesCount Max number of affect resources to return
169170
* @return A new ShardAllocationStatus that has not yet been filled.
170171
*/
171-
public ShardAllocationStatus createNewStatus(Metadata metadata) {
172-
return new ShardAllocationStatus(metadata);
172+
public ShardAllocationStatus createNewStatus(Metadata metadata, int maxAffectedResourcesCount) {
173+
return new ShardAllocationStatus(metadata, maxAffectedResourcesCount);
173174
}
174175

175176
@Override
176177
public HealthIndicatorResult calculate(boolean verbose, int maxAffectedResourcesCount, HealthInfo healthInfo) {
177178
var state = clusterService.state();
178179
var shutdown = state.getMetadata().custom(NodesShutdownMetadata.TYPE, NodesShutdownMetadata.EMPTY);
179-
var status = createNewStatus(state.getMetadata());
180+
var status = createNewStatus(state.getMetadata(), maxAffectedResourcesCount);
180181
updateShardAllocationStatus(status, state, shutdown, verbose, replicaUnassignedBufferTime);
181182
return createIndicator(
182183
status.getStatus(),
@@ -464,18 +465,33 @@ static void updateShardAllocationStatus(
464465
);
465466

466467
public class ShardAllocationCounts {
467-
int unassigned = 0;
468-
int unassigned_new = 0;
469-
int unassigned_restarting = 0;
470-
int initializing = 0;
471-
int started = 0;
472-
int relocating = 0;
473-
public final Set<ProjectIndexName> indicesWithUnavailableShards = new HashSet<>();
474-
public final Set<ProjectIndexName> indicesWithAllShardsUnavailable = new HashSet<>();
468+
final int maxAffectedResourcesCount;
469+
int unassigned;
470+
int unassigned_new;
471+
int unassigned_restarting;
472+
int initializing;
473+
int started;
474+
int relocating;
475+
public final Set<ProjectIndexName> indicesWithUnavailableShards;
476+
public final Set<ProjectIndexName> indicesWithAllShardsUnavailable;
475477
// We keep the searchable snapshots separately as long as the original index is still available
476478
// This is checked during the post-processing
477-
public SearchableSnapshotsState searchableSnapshotsState = new SearchableSnapshotsState();
478-
final Map<Diagnosis.Definition, Set<ProjectIndexName>> diagnosisDefinitions = new HashMap<>();
479+
public SearchableSnapshotsState searchableSnapshotsState;
480+
final Map<Diagnosis.Definition, Set<ProjectIndexName>> diagnosisDefinitions;
481+
482+
public ShardAllocationCounts(int maxAffectedResourcesCount) {
483+
this.maxAffectedResourcesCount = maxAffectedResourcesCount;
484+
unassigned = 0;
485+
unassigned_new = 0;
486+
unassigned_restarting = 0;
487+
initializing = 0;
488+
started = 0;
489+
relocating = 0;
490+
indicesWithUnavailableShards = new HashSet<>();
491+
indicesWithAllShardsUnavailable = new HashSet<>();
492+
searchableSnapshotsState = new SearchableSnapshotsState();
493+
diagnosisDefinitions = new HashMap<>();
494+
}
479495

480496
public void increment(
481497
ProjectId projectId,
@@ -512,14 +528,18 @@ public void increment(
512528
unassigned_restarting++;
513529
} else {
514530
unassigned++;
515-
if (verbose) {
531+
// Computing the diagnosis can be very expensive in large clusters, so we limit the number of
532+
// computations to the maxAffectedResourcesCount. The main negative side effect of this is that
533+
// we might miss some diagnoses. We are willing to take this risk, and users can always
534+
// use the allocation explain API for more details or increase the maxAffectedResourcesCount.
535+
if (verbose && unassigned <= maxAffectedResourcesCount) {
516536
diagnoseUnassignedShardRouting(routing, state).forEach(definition -> addDefinition(definition, projectIndex));
517537
}
518538
}
519539
}
520540
case INITIALIZING -> {
521541
initializing++;
522-
if (verbose) {
542+
if (verbose && initializing <= maxAffectedResourcesCount) {
523543
addDefinition(DIAGNOSIS_WAIT_FOR_INITIALIZATION, projectIndex);
524544
}
525545
}
@@ -957,12 +977,16 @@ public Diagnosis.Definition getIncreaseNodeWithRoleCapacityAction(String role) {
957977
}
958978

959979
public class ShardAllocationStatus {
960-
protected final ShardAllocationCounts primaries = new ShardAllocationCounts();
961-
protected final ShardAllocationCounts replicas = new ShardAllocationCounts();
980+
protected final ShardAllocationCounts primaries;
981+
protected final ShardAllocationCounts replicas;
962982
protected final Metadata clusterMetadata;
983+
protected final int maxAffectedResourcesCount;
963984

964-
public ShardAllocationStatus(Metadata clusterMetadata) {
985+
public ShardAllocationStatus(Metadata clusterMetadata, int maxAffectedResourcesCount) {
965986
this.clusterMetadata = clusterMetadata;
987+
this.maxAffectedResourcesCount = maxAffectedResourcesCount;
988+
primaries = new ShardAllocationCounts(maxAffectedResourcesCount);
989+
replicas = new ShardAllocationCounts(maxAffectedResourcesCount);
966990
}
967991

968992
void addPrimary(ProjectId projectId, ShardRouting routing, ClusterState state, NodesShutdownMetadata shutdowns, boolean verbose) {

server/src/main/java/org/elasticsearch/health/Diagnosis.java

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,11 @@ public Collection<String> getValues() {
126126
public Collection<DiscoveryNode> getNodes() {
127127
return nodes;
128128
}
129+
130+
@Override
131+
public String toString() {
132+
return "Resource{" + "type=" + type + ", values=" + values + ", nodes=" + nodes + '}';
133+
}
129134
}
130135

131136
/**

server/src/test/java/org/elasticsearch/cluster/routing/allocation/shards/ShardsAvailabilityHealthIndicatorServiceTests.java

Lines changed: 32 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -361,7 +361,7 @@ public void testAllReplicasUnassigned() {
361361
List.of()
362362
);
363363
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
364-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
364+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
365365
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
366366
status,
367367
clusterState,
@@ -386,7 +386,7 @@ public void testAllReplicasUnassigned() {
386386
List.of()
387387
);
388388
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
389-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
389+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
390390
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
391391
status,
392392
clusterState,
@@ -411,7 +411,7 @@ public void testAllReplicasUnassigned() {
411411
List.of()
412412
);
413413
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
414-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
414+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
415415
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
416416
status,
417417
clusterState,
@@ -438,7 +438,7 @@ public void testAllReplicasUnassigned() {
438438
);
439439

440440
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
441-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
441+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
442442
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
443443
status,
444444
clusterState,
@@ -477,7 +477,7 @@ public void testAllReplicasUnassigned() {
477477
List.of()
478478
);
479479
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
480-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
480+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
481481
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
482482
status,
483483
clusterState,
@@ -508,7 +508,7 @@ public void testAllReplicasUnassigned() {
508508
ProjectId projectId = randomProjectIdOrDefault();
509509
var clusterState = createClusterStateWith(projectId, List.of(routingTable), List.of());
510510
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
511-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
511+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
512512
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
513513
status,
514514
clusterState,
@@ -534,7 +534,7 @@ public void testAllReplicasUnassigned() {
534534
List.of()
535535
);
536536
var service = createShardsAvailabilityIndicatorService(projectId, clusterState);
537-
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata());
537+
ShardAllocationStatus status = service.createNewStatus(clusterState.metadata(), randomNonNegativeInt());
538538
ShardsAvailabilityHealthIndicatorService.updateShardAllocationStatus(
539539
status,
540540
clusterState,
@@ -1791,32 +1791,31 @@ public void testLimitNumberOfAffectedResources() {
17911791

17921792
{
17931793
// assert the full result to check that details, impacts, and symptoms use the correct count of affected indices (5)
1794-
assertThat(
1795-
service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO),
1796-
equalTo(
1797-
createExpectedResult(
1798-
RED,
1799-
"This cluster has 5 unavailable primary shards.",
1800-
Map.of("unassigned_primaries", 5),
1801-
List.of(
1802-
new HealthIndicatorImpact(
1803-
NAME,
1804-
ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
1805-
1,
1806-
"Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
1807-
+ "return incomplete results.",
1808-
List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
1809-
)
1810-
),
1811-
List.of(
1812-
new Diagnosis(
1813-
ACTION_CHECK_ALLOCATION_EXPLAIN_API,
1814-
List.of(new Diagnosis.Resource(INDEX, List.of("red-index1", "red-index2")))
1815-
)
1816-
)
1794+
// since we limit the number of allocation explanations while looping over the shards, we can't guarantee
1795+
// which indices end up in the affected resources list, but we can at least check that the size is correct
1796+
var calculatedResult = service.calculate(true, 2, HealthInfo.EMPTY_HEALTH_INFO);
1797+
assertEquals(RED, calculatedResult.status());
1798+
assertEquals("This cluster has 5 unavailable primary shards.", calculatedResult.symptom());
1799+
assertEquals(new SimpleHealthIndicatorDetails(addDefaults(Map.of("unassigned_primaries", 5))), calculatedResult.details());
1800+
assertEquals(
1801+
List.of(
1802+
new HealthIndicatorImpact(
1803+
NAME,
1804+
ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
1805+
1,
1806+
"Cannot add data to 5 indices [red-index1, red-index2, red-index3, red-index4, red-index5]. Searches might "
1807+
+ "return incomplete results.",
1808+
List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
18171809
)
1818-
)
1810+
),
1811+
calculatedResult.impacts()
18191812
);
1813+
assertEquals("Expected 1 diagnosis but got " + calculatedResult.diagnosisList(), 1, calculatedResult.diagnosisList().size());
1814+
var diagnosis = calculatedResult.diagnosisList().get(0);
1815+
assertEquals(ACTION_CHECK_ALLOCATION_EXPLAIN_API, diagnosis.definition());
1816+
assertEquals("Expected 1 affected resource but got " + diagnosis.affectedResources(), 1, diagnosis.affectedResources().size());
1817+
var affectedResource = diagnosis.affectedResources().get(0);
1818+
assertEquals("Expected 2 indices but got " + affectedResource.getValues(), 2, affectedResource.getValues().size());
18201819
}
18211820

18221821
{
@@ -1838,11 +1837,8 @@ public void testLimitNumberOfAffectedResources() {
18381837
}
18391838

18401839
{
1841-
// 0 affected resources
1842-
assertThat(
1843-
service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(),
1844-
equalTo(List.of(new Diagnosis(ACTION_CHECK_ALLOCATION_EXPLAIN_API, List.of(new Diagnosis.Resource(INDEX, List.of())))))
1845-
);
1840+
// 0 affected resources means we don't do any shard allocation explanation and thus do not report any diagnosis
1841+
assertThat(service.calculate(true, 0, HealthInfo.EMPTY_HEALTH_INFO).diagnosisList(), equalTo(List.of()));
18461842
}
18471843
}
18481844

0 commit comments

Comments
 (0)