Skip to content

Commit 93410cf

Browse files
authored
Log NOT_PREFERRED shard movements (elastic#138069)
1 parent e0fcab7 commit 93410cf

File tree

4 files changed

+66
-2
lines changed

4 files changed

+66
-2
lines changed

server/src/main/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocator.java

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -77,6 +77,7 @@
7777
public class BalancedShardsAllocator implements ShardsAllocator {
7878

7979
private static final Logger logger = LogManager.getLogger(BalancedShardsAllocator.class);
80+
private static final Logger notPreferredLogger = LogManager.getLogger(BalancedShardsAllocator.class.getName() + ".not-preferred");
8081

8182
public static final Setting<Float> SHARD_BALANCE_FACTOR_SETTING = Setting.floatSetting(
8283
"cluster.routing.allocation.balance.shard",
@@ -866,6 +867,14 @@ public boolean moveShards() {
866867
// can use the cached decision.
867868
final var moveDecision = shardMoved ? decideMove(index, shardRouting) : storedShardMovement.moveDecision();
868869
if (moveDecision.isDecisionTaken() && moveDecision.cannotRemainAndCanMove()) {
870+
if (notPreferredLogger.isDebugEnabled()) {
871+
notPreferredLogger.debug(
872+
"Moving shard [{}] to [{}] from a NOT_PREFERRED allocation, explanation is [{}]",
873+
shardRouting,
874+
moveDecision.getTargetNode().getName(),
875+
moveDecision.getCanRemainDecision().getExplanation()
876+
);
877+
}
869878
executeMove(shardRouting, index, moveDecision, "move-non-preferred");
870879
// Return after a single move so that the change can be simulated before further moves are made.
871880
return true;

server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/WriteLoadConstraintDecider.java

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
import org.elasticsearch.cluster.routing.allocation.WriteLoadConstraintSettings;
2121
import org.elasticsearch.common.FrequencyCappedAction;
2222
import org.elasticsearch.common.settings.ClusterSettings;
23+
import org.elasticsearch.core.Nullable;
2324
import org.elasticsearch.core.Strings;
2425
import org.elasticsearch.core.TimeValue;
2526
import org.elasticsearch.threadpool.ThreadPool;
@@ -154,14 +155,16 @@ public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting
154155
var nodeWriteThreadPoolQueueLatencyThreshold = writeLoadConstraintSettings.getQueueLatencyThreshold();
155156
if (nodeWriteThreadPoolStats.maxThreadPoolQueueLatencyMillis() >= nodeWriteThreadPoolQueueLatencyThreshold.millis()) {
156157
if (logger.isDebugEnabled() || allocation.debugDecision()) {
158+
final Double shardWriteLoad = getShardWriteLoad(allocation, shardRouting);
157159
final String explain = Strings.format(
158160
"""
159161
Node [%s] has a queue latency of [%d] millis that exceeds the queue latency threshold of [%s]. This node is \
160-
hot-spotting. Current thread pool utilization [%f]. Moving shard(s) away.""",
162+
hot-spotting. Current thread pool utilization [%f]. Shard write load [%s]. Moving shard(s) away.""",
161163
node.nodeId(),
162164
nodeWriteThreadPoolStats.maxThreadPoolQueueLatencyMillis(),
163165
nodeWriteThreadPoolQueueLatencyThreshold.toHumanReadableString(2),
164-
nodeWriteThreadPoolStats.averageThreadPoolUtilization()
166+
nodeWriteThreadPoolStats.averageThreadPoolUtilization(),
167+
shardWriteLoad == null ? "unknown" : shardWriteLoad
165168
);
166169
if (logger.isDebugEnabled()) {
167170
logCanRemainMessage.maybeExecute(() -> logger.debug(explain));
@@ -182,6 +185,11 @@ public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting
182185
);
183186
}
184187

188+
@Nullable
189+
private Double getShardWriteLoad(RoutingAllocation allocation, ShardRouting shardRouting) {
190+
return allocation.clusterInfo().getShardWriteLoads().get(shardRouting.shardId());
191+
}
192+
185193
/**
186194
* Calculates the change to the node's write thread pool utilization percentage if the shard is added to the node.
187195
* Returns the percent thread pool utilization change.

server/src/test/java/org/elasticsearch/cluster/routing/allocation/allocator/BalancedShardsAllocatorTests.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
package org.elasticsearch.cluster.routing.allocation.allocator;
1111

12+
import org.apache.logging.log4j.Level;
1213
import org.elasticsearch.action.ActionListener;
1314
import org.elasticsearch.action.support.replication.ClusterStateCreationUtils;
1415
import org.elasticsearch.cluster.ClusterInfo;
@@ -55,7 +56,9 @@
5556
import org.elasticsearch.index.IndexVersion;
5657
import org.elasticsearch.index.shard.ShardId;
5758
import org.elasticsearch.snapshots.SnapshotShardSizeInfo;
59+
import org.elasticsearch.test.MockLog;
5860
import org.elasticsearch.test.gateway.TestGatewayAllocator;
61+
import org.elasticsearch.test.junit.annotations.TestLogging;
5962
import org.hamcrest.Matchers;
6063

6164
import java.util.ArrayList;
@@ -1004,6 +1007,43 @@ public void testReturnEarlyOnShardAssignmentChanges() {
10041007
applyStartedShardsUntilNoChange(clusterState, allocationService);
10051008
}
10061009

1010+
@TestLogging(
1011+
value = "org.elasticsearch.cluster.routing.allocation.allocator.BalancedShardsAllocator.not-preferred:DEBUG",
1012+
reason = "debug logging for test"
1013+
)
1014+
public void testNotPreferredMovementIsLoggedAtDebugLevel() {
1015+
final var clusterState = ClusterStateCreationUtils.state(randomIdentifier(), 3, 3);
1016+
final var balancedShardsAllocator = new BalancedShardsAllocator(
1017+
BalancerSettings.DEFAULT,
1018+
TEST_WRITE_LOAD_FORECASTER,
1019+
new GlobalBalancingWeightsFactory(BalancerSettings.DEFAULT)
1020+
);
1021+
1022+
final var allocation = new RoutingAllocation(new AllocationDeciders(List.<AllocationDecider>of(new AllocationDecider() {
1023+
@Override
1024+
public Decision canRemain(
1025+
IndexMetadata indexMetadata,
1026+
ShardRouting shardRouting,
1027+
RoutingNode node,
1028+
RoutingAllocation allocation
1029+
) {
1030+
return new Decision.Single(Decision.Type.NOT_PREFERRED, "test_decider", "Always NOT_PREFERRED");
1031+
}
1032+
})), clusterState.getRoutingNodes().mutableCopy(), clusterState, ClusterInfo.EMPTY, SnapshotShardSizeInfo.EMPTY, 0L);
1033+
1034+
final var notPreferredLoggerName = BalancedShardsAllocator.class.getName() + ".not-preferred";
1035+
MockLog.assertThatLogger(
1036+
() -> balancedShardsAllocator.allocate(allocation),
1037+
notPreferredLoggerName,
1038+
new MockLog.SeenEventExpectation(
1039+
"moved a NOT_PREFERRED allocation",
1040+
notPreferredLoggerName,
1041+
Level.DEBUG,
1042+
"Moving shard [*] to [*] from a NOT_PREFERRED allocation, explanation is [Always NOT_PREFERRED]"
1043+
)
1044+
);
1045+
}
1046+
10071047
/**
10081048
* Test for {@link PrioritiseByShardWriteLoadComparator}. See Comparator Javadoc for expected
10091049
* ordering.

test/framework/src/main/java/org/elasticsearch/test/MockLog.java

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -447,6 +447,13 @@ private static void addToMockLogs(MockLog mockLog, List<String> loggers) {
447447
* Executes an action and verifies expectations against the provided logger
448448
*/
449449
public static void assertThatLogger(Runnable action, Class<?> loggerOwner, MockLog.LoggingExpectation... expectations) {
450+
assertThatLogger(action, loggerOwner.getCanonicalName(), expectations);
451+
}
452+
453+
/**
454+
* Executes an action and verifies expectations against the provided logger
455+
*/
456+
public static void assertThatLogger(Runnable action, String loggerOwner, MockLog.LoggingExpectation... expectations) {
450457
try (var mockLog = MockLog.capture(loggerOwner)) {
451458
for (var expectation : expectations) {
452459
mockLog.addExpectation(expectation);

0 commit comments

Comments
 (0)