Skip to content

Commit db5026b

Browse files
nicktindall and ywangd authored
Undesired allocation tracking improvements (#137940)
Co-authored-by: Yang Wang <[email protected]>
1 parent af1c8b1 commit db5026b

File tree

2 files changed

+326
-8
lines changed

2 files changed

+326
-8
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/UndesiredAllocationsTracker.java

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,8 @@
99

1010
package org.elasticsearch.cluster.routing.allocation.allocator;
1111

12+
import org.elasticsearch.cluster.node.DiscoveryNode;
13+
import org.elasticsearch.cluster.node.DiscoveryNodeRole;
1214
import org.elasticsearch.cluster.routing.RoutingNodes;
1315
import org.elasticsearch.cluster.routing.ShardRouting;
1416
import org.elasticsearch.cluster.routing.allocation.RoutingAllocation;
@@ -17,6 +19,7 @@
1719
import org.elasticsearch.common.settings.Setting;
1820
import org.elasticsearch.common.time.TimeProvider;
1921
import org.elasticsearch.common.util.FeatureFlag;
22+
import org.elasticsearch.core.Releasable;
2023
import org.elasticsearch.core.TimeValue;
2124
import org.elasticsearch.index.shard.ShardId;
2225
import org.elasticsearch.logging.LogManager;
@@ -82,6 +85,7 @@ public class UndesiredAllocationsTracker {
8285
private final FrequencyCappedAction undesiredAllocationDurationLogInterval;
8386
private volatile TimeValue undesiredAllocationDurationLoggingThreshold;
8487
private volatile int maxUndesiredAllocationsToTrack;
88+
private boolean missingAllocationAssertionsEnabled = true;
8589

8690
UndesiredAllocationsTracker(ClusterSettings clusterSettings, TimeProvider timeProvider) {
8791
this.timeProvider = timeProvider;
@@ -161,6 +165,14 @@ public void maybeLogUndesiredShardsWarning(
161165
}
162166
}
163167

168+
private boolean shardTierMatchesNodeTier(ShardRouting shardRouting, DiscoveryNode discoveryNode) {
169+
return switch (shardRouting.role()) {
170+
case INDEX_ONLY -> discoveryNode.getRoles().contains(DiscoveryNodeRole.INDEX_ROLE);
171+
case SEARCH_ONLY -> discoveryNode.getRoles().contains(DiscoveryNodeRole.SEARCH_ROLE);
172+
default -> true;
173+
};
174+
}
175+
164176
private void logDecisionsForUndesiredShardsOverThreshold(
165177
RoutingNodes routingNodes,
166178
RoutingAllocation routingAllocation,
@@ -199,10 +211,21 @@ private void logUndesiredShardDetails(
199211
allocation.setDebugMode(RoutingAllocation.DebugMode.EXCLUDE_YES_DECISIONS);
200212
try {
201213
final var assignment = desiredBalance.getAssignment(shardRouting.shardId());
202-
logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), undesiredDuration);
203-
for (final var nodeId : assignment.nodeIds()) {
204-
final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation);
205-
logger.warn("Shard {} allocation decision for node [{}]: {}", shardRouting.shardId(), nodeId, decision);
214+
if (assignment != null) {
215+
logger.warn("Shard {} has been in an undesired allocation for {}", shardRouting.shardId(), undesiredDuration);
216+
for (final var nodeId : assignment.nodeIds()) {
217+
if (allocation.nodes().nodeExists(nodeId)) {
218+
if (shardTierMatchesNodeTier(shardRouting, allocation.nodes().get(nodeId))) {
219+
final var decision = allocation.deciders().canAllocate(shardRouting, routingNodes.node(nodeId), allocation);
220+
logger.warn("Shard {} allocation decision for node [{}]: {}", shardRouting.shardId(), nodeId, decision);
221+
}
222+
} else {
223+
logger.warn("Shard {} desired node [{}] has left the cluster", shardRouting.shardId(), nodeId);
224+
}
225+
}
226+
} else {
227+
assert missingAllocationAssertionsEnabled == false
228+
: "Shard " + shardRouting + " was missing an assignment, this shouldn't be possible. " + desiredBalance;
206229
}
207230
} finally {
208231
allocation.setDebugMode(originalDebugMode);
@@ -239,4 +262,10 @@ Map<String, UndesiredAllocation> getUndesiredAllocations() {
239262
* @param undesiredSince The timestamp when the shard was first observed in an undesired allocation
240263
*/
241264
record UndesiredAllocation(ShardId shardId, long undesiredSince) {}
265+
266+
// Exposed for testing
267+
public Releasable disableMissingAllocationAssertions() {
268+
missingAllocationAssertionsEnabled = false;
269+
return () -> missingAllocationAssertionsEnabled = true;
270+
}
242271
}

0 commit comments

Comments
 (0)