Commit dd2fb5b

Log local hot-threads when shard creation fails due to shardLock (#120495)
Closes: ES-10493
1 parent 682e46a commit dd2fb5b
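
The change threads a RunOnce-wrapped hot-threads dump through the shard-creation retry path, so a node that cannot acquire the shard lock also logs its own hot threads, at most once per attempt, alongside the existing WARN and timeout messages. As a rough illustration of the run-once idiom only: RunOnceSketch below is a hypothetical, self-contained stand-in, not the org.elasticsearch.common.util.concurrent.RunOnce class used in the diff.

import java.util.concurrent.atomic.AtomicBoolean;

// Illustrative only: a simplified stand-in for org.elasticsearch.common.util.concurrent.RunOnce.
// The real class is used in the diff below so that the hot-threads dump fires at most once per
// shard-creation attempt, however many retries escalate to WARN or eventually time out.
final class RunOnceSketch implements Runnable {
    private final Runnable delegate;
    private final AtomicBoolean hasRun = new AtomicBoolean();

    RunOnceSketch(Runnable delegate) {
        this.delegate = delegate;
    }

    @Override
    public void run() {
        // Only the first caller flips the flag and executes the delegate; later calls are no-ops.
        if (hasRun.compareAndSet(false, true)) {
            delegate.run();
        }
    }

    public static void main(String[] args) {
        RunOnceSketch dumpHotThreads = new RunOnceSketch(() -> System.out.println("dumping hot threads"));
        dumpHotThreads.run(); // prints once
        dumpHotThreads.run(); // no-op on subsequent calls
    }
}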

File tree

2 files changed: +27 −2 lines changed

  server/src/internalClusterTest/java/org/elasticsearch/indices/cluster/ShardLockFailureIT.java
  server/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java

server/src/internalClusterTest/java/org/elasticsearch/indices/cluster/ShardLockFailureIT.java

Lines changed: 16 additions & 1 deletion
@@ -72,7 +72,14 @@ public void testShardLockFailure() throws Exception {
             var ignored1 = internalCluster().getInstance(NodeEnvironment.class, node).shardLock(shardId, "blocked for test");
             var mockLog = MockLog.capture(IndicesClusterStateService.class);
         ) {
-
+            mockLog.addExpectation(
+                new MockLog.SeenEventExpectation(
+                    "hot threads",
+                    "org.elasticsearch.indices.cluster.IndicesClusterStateService",
+                    Level.WARN,
+                    "[testindex][0]: acquire shard lock for create"
+                )
+            );
             mockLog.addExpectation(new MockLog.LoggingExpectation() {
                 private final CountDownLatch countDownLatch = new CountDownLatch(1);
                 int debugMessagesSeen = 0;
@@ -147,6 +154,14 @@ public void testShardLockTimeout() throws Exception {
             var ignored1 = internalCluster().getInstance(NodeEnvironment.class, node).shardLock(shardId, "blocked for test");
             var mockLog = MockLog.capture(IndicesClusterStateService.class);
         ) {
+            mockLog.addExpectation(
+                new MockLog.SeenEventExpectation(
+                    "hot threads",
+                    "org.elasticsearch.indices.cluster.IndicesClusterStateService",
+                    Level.WARN,
+                    "[testindex][0]: acquire shard lock for create"
+                )
+            );
             mockLog.addExpectation(
                 new MockLog.SeenEventExpectation(
                     "timeout message",

server/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java

Lines changed: 11 additions & 1 deletion
@@ -42,6 +42,7 @@
 import org.elasticsearch.common.util.concurrent.AbstractRunnable;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
 import org.elasticsearch.common.util.concurrent.EsExecutors;
+import org.elasticsearch.common.util.concurrent.RunOnce;
 import org.elasticsearch.common.util.concurrent.ThreadContext;
 import org.elasticsearch.common.util.concurrent.ThrottledTaskRunner;
 import org.elasticsearch.core.Nullable;
@@ -74,6 +75,7 @@
 import org.elasticsearch.indices.recovery.RecoveryFailedException;
 import org.elasticsearch.indices.recovery.RecoveryState;
 import org.elasticsearch.injection.guice.Inject;
+import org.elasticsearch.monitor.jvm.HotThreads;
 import org.elasticsearch.repositories.RepositoriesService;
 import org.elasticsearch.search.SearchService;
 import org.elasticsearch.snapshots.SnapshotShardsService;
@@ -688,6 +690,7 @@ private void createShard(ShardRouting shardRouting, ClusterState state) {
             primaryTerm,
             0,
             0L,
+            new RunOnce(() -> HotThreads.logLocalCurrentThreads(logger, Level.WARN, shardId + ": acquire shard lock for create")),
             ActionListener.runBefore(new ActionListener<>() {
                 @Override
                 public void onResponse(Boolean success) {
@@ -740,6 +743,7 @@ private void createShardWhenLockAvailable(
         long primaryTerm,
         int iteration,
         long delayMillis,
+        RunOnce dumpHotThreads,
         ActionListener<Boolean> listener
     ) {
         try {
@@ -763,8 +767,9 @@ private void createShardWhenLockAvailable(
                 listener.onFailure(e);
                 return;
             }
+            final Level level = (iteration + 25) % 30 == 0 ? Level.WARN : Level.DEBUG;
             logger.log(
-                (iteration + 25) % 30 == 0 ? Level.WARN : Level.DEBUG,
+                level,
                 """
                     shard lock for [{}] has been unavailable for at least [{}/{}ms], \
                     attempting to create shard while applying cluster state [version={},uuid={}], will retry in [{}]: [{}]""",
@@ -776,6 +781,9 @@ private void createShardWhenLockAvailable(
                 shardLockRetryInterval,
                 e.getMessage()
             );
+            if (level == Level.WARN) {
+                dumpHotThreads.run();
+            }
             // TODO could we instead subscribe to the shard lock and trigger the retry exactly when it is released rather than polling?
             threadPool.scheduleUnlessShuttingDown(
                 shardLockRetryInterval,
@@ -813,6 +821,7 @@ private void createShardWhenLockAvailable(
                 shardLockRetryTimeout.millis(),
                 shardRouting
             );
+            dumpHotThreads.run();
             listener.onFailure(
                 new ElasticsearchTimeoutException("timed out while waiting to acquire shard lock for " + shardRouting)
             );
@@ -841,6 +850,7 @@ private void createShardWhenLockAvailable(
                 primaryTerm,
                 iteration + 1,
                 newDelayMillis,
+                dumpHotThreads,
                 listener
             );
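
For orientation on when the dump actually fires: the pre-existing escalation check (iteration + 25) % 30 == 0 sends the retry log to WARN on iterations 5, 35, 65, ... and to DEBUG otherwise, and the new code dumps hot threads only on those WARN iterations or when the retry loop finally times out; because the dump is wrapped in a RunOnce, it is emitted at most once per shard-creation attempt. A small standalone check of that cadence (the class name here is just for illustration):

// Standalone arithmetic check of the escalation cadence used in createShardWhenLockAvailable:
// (iteration + 25) % 30 == 0 holds for iterations 5, 35, 65, ... so the WARN log (and, with this
// change, the hot-threads dump) fires on the 5th retry and then every 30 retries thereafter.
public class WarnCadenceCheck {
    public static void main(String[] args) {
        for (int iteration = 0; iteration <= 95; iteration++) {
            if ((iteration + 25) % 30 == 0) {
                System.out.println("iteration " + iteration + " -> WARN + hot-threads dump");
            }
        }
        // prints iterations 5, 35, 65, 95
    }
}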
