Skip to content

Commit 0d3bc04

Browse files
authored
Fix how the health API is reporting initializing shards (#93502) (#93586)
When a shard is initializing, it is not available for use, but it is also not unassigned. In this PR, we update the symptom and add a new diagnosis to guide the user through this situation. (cherry picked from commit db17d38)
1 parent 7552430 commit 0d3bc04

File tree

3 files changed

+171
-25
lines changed

3 files changed

+171
-25
lines changed

docs/changelog/93502.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
pr: 93502
2+
summary: Health API fix the reporting of initializing shards
3+
area: Health
4+
type: bug
5+
issues: [90327]

server/src/main/java/org/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorService.java

Lines changed: 32 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,18 @@ public HealthIndicatorResult calculate(boolean verbose, HealthInfo healthInfo) {
166166
FIX_DELAYED_SHARDS_GUIDE
167167
);
168168

169+
public static final String WAIT_FOR_INITIALIZATION_GUIDE = "https://ela.st/wait-for-shard-initialization";
170+
public static final Diagnosis.Definition DIAGNOSIS_WAIT_FOR_INITIALIZATION = new Diagnosis.Definition(
171+
NAME,
172+
"initializing_shards",
173+
"Elasticsearch is currently initializing the unavailable shards. Please wait for the initialization to finish.",
174+
"The shards will become available as long as the initialization completes. No action is required by the user, you can"
175+
+ " monitor the progress of the initializing shards at "
176+
+ WAIT_FOR_INITIALIZATION_GUIDE
177+
+ ".",
178+
WAIT_FOR_INITIALIZATION_GUIDE
179+
);
180+
169181
public static final String ENABLE_INDEX_ALLOCATION_GUIDE = "https://ela.st/fix-index-allocation";
170182
public static final Diagnosis.Definition ACTION_ENABLE_INDEX_ROUTING_ALLOCATION = new Diagnosis.Definition(
171183
NAME,
@@ -401,7 +413,12 @@ public void increment(ShardRouting routing, ClusterState state, NodesShutdownMet
401413
}
402414
}
403415
}
404-
case INITIALIZING -> initializing++;
416+
case INITIALIZING -> {
417+
initializing++;
418+
if (verbose) {
419+
addDefinition(DIAGNOSIS_WAIT_FOR_INITIALIZATION, routing.getIndexName());
420+
}
421+
}
405422
case STARTED -> started++;
406423
case RELOCATING -> relocating++;
407424
}
@@ -440,22 +457,16 @@ List<Diagnosis.Definition> diagnoseUnassignedShardRouting(ShardRouting shardRout
440457
List<Diagnosis.Definition> diagnosisDefs = new ArrayList<>();
441458
LOGGER.trace("Diagnosing unassigned shard [{}] due to reason [{}]", shardRouting.shardId(), shardRouting.unassignedInfo());
442459
switch (shardRouting.unassignedInfo().getLastAllocationStatus()) {
443-
case NO_VALID_SHARD_COPY:
444-
diagnosisDefs.add(ACTION_RESTORE_FROM_SNAPSHOT);
445-
break;
446-
case NO_ATTEMPT:
460+
case NO_VALID_SHARD_COPY -> diagnosisDefs.add(ACTION_RESTORE_FROM_SNAPSHOT);
461+
case NO_ATTEMPT -> {
447462
if (shardRouting.unassignedInfo().isDelayed()) {
448463
diagnosisDefs.add(DIAGNOSIS_WAIT_FOR_OR_FIX_DELAYED_SHARDS);
449464
} else {
450465
diagnosisDefs.addAll(explainAllocationsAndDiagnoseDeciders(shardRouting, state));
451466
}
452-
break;
453-
case DECIDERS_NO:
454-
diagnosisDefs.addAll(explainAllocationsAndDiagnoseDeciders(shardRouting, state));
455-
break;
456-
case DELAYED_ALLOCATION:
457-
diagnosisDefs.add(DIAGNOSIS_WAIT_FOR_OR_FIX_DELAYED_SHARDS);
458-
break;
467+
}
468+
case DECIDERS_NO -> diagnosisDefs.addAll(explainAllocationsAndDiagnoseDeciders(shardRouting, state));
469+
case DELAYED_ALLOCATION -> diagnosisDefs.add(DIAGNOSIS_WAIT_FOR_OR_FIX_DELAYED_SHARDS);
459470
}
460471
if (diagnosisDefs.isEmpty()) {
461472
diagnosisDefs.add(ACTION_CHECK_ALLOCATION_EXPLAIN_API);
@@ -545,10 +556,10 @@ List<Diagnosis.Definition> diagnoseAllocationResults(
545556
* @return A predicate that returns true if the decision exists and matches the expected outcome, false otherwise.
546557
*/
547558
private static Predicate<NodeAllocationResult> hasDeciderResult(String deciderName, Decision.Type outcome) {
548-
return (nodeResult) -> nodeResult.getCanAllocateDecision()
549-
.getDecisions()
550-
.stream()
551-
.anyMatch(decision -> deciderName.equals(decision.label()) && outcome == decision.type());
559+
return (nodeResult) -> {
560+
Decision decision = nodeResult.getCanAllocateDecision();
561+
return decision != null && decision.getDecisions().stream().anyMatch(d -> deciderName.equals(d.label()) && outcome == d.type());
562+
};
552563
}
553564

554565
/**
@@ -793,13 +804,17 @@ public String getSymptom() {
793804
|| primaries.unassigned_new > 0
794805
|| primaries.unassigned_restarting > 0
795806
|| replicas.unassigned > 0
796-
|| replicas.unassigned_restarting > 0) {
807+
|| replicas.unassigned_restarting > 0
808+
|| primaries.initializing > 0
809+
|| replicas.initializing > 0) {
797810
builder.append(
798811
Stream.of(
799812
createMessage(primaries.unassigned, "unavailable primary shard", "unavailable primary shards"),
800813
createMessage(primaries.unassigned_new, "creating primary shard", "creating primary shards"),
801814
createMessage(primaries.unassigned_restarting, "restarting primary shard", "restarting primary shards"),
802815
createMessage(replicas.unassigned, "unavailable replica shard", "unavailable replica shards"),
816+
createMessage(primaries.initializing, "initializing primary shard", "initializing primary shards"),
817+
createMessage(replicas.initializing, "initializing replica shard", "initializing replica shards"),
803818
createMessage(replicas.unassigned_restarting, "restarting replica shard", "restarting replica shards")
804819
).flatMap(Function.identity()).collect(joining(", "))
805820
).append(".");

server/src/test/java/org/elasticsearch/cluster/routing/allocation/ShardsAvailabilityHealthIndicatorServiceTests.java

Lines changed: 134 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,11 @@
7474
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorService.ACTION_MIGRATE_TIERS_AWAY_FROM_INCLUDE_DATA_LOOKUP;
7575
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorService.ACTION_MIGRATE_TIERS_AWAY_FROM_REQUIRE_DATA_LOOKUP;
7676
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorService.ACTION_RESTORE_FROM_SNAPSHOT;
77+
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorService.DIAGNOSIS_WAIT_FOR_INITIALIZATION;
7778
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorService.DIAGNOSIS_WAIT_FOR_OR_FIX_DELAYED_SHARDS;
7879
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorService.NAME;
7980
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorServiceTests.ShardState.AVAILABLE;
81+
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorServiceTests.ShardState.CREATING;
8082
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorServiceTests.ShardState.INITIALIZING;
8183
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorServiceTests.ShardState.RESTARTING;
8284
import static org.elasticsearch.cluster.routing.allocation.ShardsAvailabilityHealthIndicatorServiceTests.ShardState.UNAVAILABLE;
@@ -121,6 +123,98 @@ public void testShouldBeGreenWhenAllPrimariesAndReplicasAreStarted() {
121123
);
122124
}
123125

126+
public void testShouldBeYellowWhenReplicaIsInitializing() {
127+
var clusterState = createClusterStateWith(
128+
List.of(
129+
index("replicated-index", new ShardAllocation(randomNodeId(), AVAILABLE), new ShardAllocation(randomNodeId(), INITIALIZING))
130+
),
131+
List.of()
132+
);
133+
var service = createAllocationHealthIndicatorService(clusterState);
134+
135+
assertThat(
136+
service.calculate(true, HealthInfo.EMPTY_HEALTH_INFO),
137+
equalTo(
138+
createExpectedResult(
139+
YELLOW,
140+
"This cluster has 1 initializing replica shard.",
141+
Map.of("started_primaries", 1, "initializing_replicas", 1),
142+
List.of(
143+
new HealthIndicatorImpact(
144+
NAME,
145+
ShardsAvailabilityHealthIndicatorService.REPLICA_UNASSIGNED_IMPACT_ID,
146+
2,
147+
"Searches might be slower than usual. Fewer redundant copies of the data exist on 1 index [replicated-index].",
148+
List.of(ImpactArea.SEARCH)
149+
)
150+
),
151+
List.of(
152+
new Diagnosis(
153+
DIAGNOSIS_WAIT_FOR_INITIALIZATION,
154+
List.of(new Diagnosis.Resource(INDEX, List.of("replicated-index")))
155+
)
156+
)
157+
)
158+
)
159+
);
160+
}
161+
162+
public void testShouldBeRedWhenPrimaryIsInitializing() {
163+
var clusterState = createClusterStateWith(
164+
List.of(index("unreplicated-index", new ShardAllocation(randomNodeId(), INITIALIZING))),
165+
List.of()
166+
);
167+
var service = createAllocationHealthIndicatorService(clusterState);
168+
169+
HealthIndicatorResult calculate = service.calculate(true, HealthInfo.EMPTY_HEALTH_INFO);
170+
assertThat(
171+
calculate,
172+
equalTo(
173+
createExpectedResult(
174+
RED,
175+
"This cluster has 1 initializing primary shard.",
176+
Map.of("initializing_primaries", 1),
177+
List.of(
178+
new HealthIndicatorImpact(
179+
NAME,
180+
ShardsAvailabilityHealthIndicatorService.PRIMARY_UNASSIGNED_IMPACT_ID,
181+
1,
182+
"Cannot add data to 1 index [unreplicated-index]. Searches might return incomplete results.",
183+
List.of(ImpactArea.INGEST, ImpactArea.SEARCH)
184+
)
185+
),
186+
List.of(
187+
new Diagnosis(
188+
DIAGNOSIS_WAIT_FOR_INITIALIZATION,
189+
List.of(new Diagnosis.Resource(INDEX, List.of("unreplicated-index")))
190+
)
191+
)
192+
)
193+
)
194+
);
195+
}
196+
197+
public void testShouldBeGreenWhenAllPrimariesAreCreating() {
198+
var clusterState = createClusterStateWith(
199+
List.of(index("unreplicated-index", new ShardAllocation(randomNodeId(), CREATING))),
200+
List.of()
201+
);
202+
var service = createAllocationHealthIndicatorService(clusterState);
203+
204+
assertThat(
205+
service.calculate(true, HealthInfo.EMPTY_HEALTH_INFO),
206+
equalTo(
207+
createExpectedResult(
208+
GREEN,
209+
"This cluster has 1 creating primary shard.",
210+
Map.of("creating_primaries", 1),
211+
Collections.emptyList(),
212+
Collections.emptyList()
213+
)
214+
)
215+
);
216+
}
217+
124218
public void testShouldBeYellowWhenThereAreUnassignedReplicas() {
125219
var availableReplicas = randomList(0, 5, () -> new ShardAllocation(randomNodeId(), AVAILABLE));
126220
var unavailableReplicas = randomList(1, 5, () -> new ShardAllocation(randomNodeId(), UNAVAILABLE));
@@ -423,10 +517,7 @@ public void testShouldBeYellowWhenRestartingReplicasReachedAllocationDelay() {
423517
}
424518

425519
public void testShouldBeGreenWhenThereAreInitializingPrimaries() {
426-
var clusterState = createClusterStateWith(
427-
List.of(index("restarting-index", new ShardAllocation("node-0", INITIALIZING))),
428-
List.of()
429-
);
520+
var clusterState = createClusterStateWith(List.of(index("restarting-index", new ShardAllocation("node-0", CREATING))), List.of());
430521
var service = createAllocationHealthIndicatorService(clusterState);
431522

432523
assertThat(
@@ -643,6 +734,37 @@ public void testDiagnoseEnableIndexAllocation() {
643734
assertThat(actions, contains(ACTION_ENABLE_INDEX_ROUTING_ALLOCATION));
644735
}
645736

737+
public void testNodeAllocationResultWithNullDecision() {
738+
// Index definition, 1 primary no replicas, allocation is not allowed
739+
IndexMetadata indexMetadata = IndexMetadata.builder("red-index")
740+
.settings(
741+
Settings.builder()
742+
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
743+
.put(EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), "none")
744+
.build()
745+
)
746+
.numberOfShards(1)
747+
.numberOfReplicas(0)
748+
.build();
749+
750+
var service = createAllocationHealthIndicatorService();
751+
752+
// Get the list of user actions that are generated for this unassigned index shard
753+
List<Diagnosis.Definition> actions = service.checkIsAllocationDisabled(
754+
indexMetadata,
755+
List.of(
756+
new NodeAllocationResult(
757+
// Shard allocation is disabled on index
758+
new DiscoveryNode(randomNodeId(), buildNewFakeTransportAddress(), Version.CURRENT),
759+
new NodeAllocationResult.ShardStoreInfo(10),
760+
null
761+
)
762+
)
763+
);
764+
765+
assertThat(actions, hasSize(0));
766+
}
767+
646768
public void testDiagnoseEnableClusterAllocation() {
647769
// Index definition, 1 primary no replicas
648770
IndexMetadata indexMetadata = IndexMetadata.builder("red-index")
@@ -1336,10 +1458,13 @@ private static ShardRouting createShardRouting(ShardId shardId, boolean primary,
13361458
getSource(primary, allocation.state),
13371459
new UnassignedInfo(UnassignedInfo.Reason.INDEX_CREATED, null)
13381460
);
1339-
if (allocation.state == INITIALIZING) {
1461+
if (allocation.state == CREATING) {
13401462
return routing;
13411463
}
13421464
routing = routing.initialize(allocation.nodeId, null, 0);
1465+
if (allocation.state == INITIALIZING) {
1466+
return routing;
1467+
}
13431468
routing = routing.moveToStarted(ShardRouting.UNAVAILABLE_EXPECTED_SHARD_SIZE);
13441469
if (allocation.state == AVAILABLE) {
13451470
return routing;
@@ -1369,7 +1494,7 @@ private static ShardRouting createShardRouting(ShardId shardId, boolean primary,
13691494

13701495
private static RecoverySource getSource(boolean primary, ShardState state) {
13711496
if (primary) {
1372-
return state == INITIALIZING
1497+
return state == CREATING
13731498
? RecoverySource.EmptyStoreRecoverySource.INSTANCE
13741499
: RecoverySource.ExistingStoreRecoverySource.INSTANCE;
13751500
} else {
@@ -1379,9 +1504,10 @@ private static RecoverySource getSource(boolean primary, ShardState state) {
13791504

13801505
public enum ShardState {
13811506
UNAVAILABLE,
1382-
INITIALIZING,
1507+
CREATING,
13831508
AVAILABLE,
1384-
RESTARTING
1509+
RESTARTING,
1510+
INITIALIZING,
13851511
}
13861512

13871513
private record ShardAllocation(String nodeId, ShardState state, Long unassignedTimeNanos, @Nullable UnassignedInfo unassignedInfo) {

0 commit comments

Comments
 (0)