Skip to content

Commit ad17198

Browse files
original-brownbearhenningandersen
authored andcommitted
Fix Stuck IO Thread Logging Time Precision (#42882)
* The precision of the timestamps we get from the cached time thread is only 200ms by default resulting in a number of needless ~200ms slow network thread execution logs * Fixed by making the warn threshold a function of the precision of the cached time thread found in the settings
1 parent 6262626 commit ad17198

File tree

2 files changed

+9
-9
lines changed

2 files changed

+9
-9
lines changed

test/framework/src/main/java/org/elasticsearch/transport/nio/MockNioTransport.java

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -84,7 +84,7 @@ public MockNioTransport(Settings settings, Version version, ThreadPool threadPoo
8484
PageCacheRecycler pageCacheRecycler, NamedWriteableRegistry namedWriteableRegistry,
8585
CircuitBreakerService circuitBreakerService) {
8686
super(settings, version, threadPool, pageCacheRecycler, circuitBreakerService, namedWriteableRegistry, networkService);
87-
this.transportThreadWatchdog = new TransportThreadWatchdog(threadPool);
87+
this.transportThreadWatchdog = new TransportThreadWatchdog(threadPool, settings);
8888
}
8989

9090
@Override
@@ -330,21 +330,20 @@ public void sendMessage(BytesReference reference, ActionListener<Void> listener)
330330
}
331331

332332
static final class TransportThreadWatchdog {
333-
334-
private static final long WARN_THRESHOLD = TimeUnit.MILLISECONDS.toNanos(150);
335-
336333
// Only check every 2s to not flood the logs on a blocked thread.
337334
// We mostly care about long blocks and not random slowness anyway and in tests would randomly catch slow operations that block for
338335
// less than 2s eventually.
339336
private static final TimeValue CHECK_INTERVAL = TimeValue.timeValueSeconds(2);
340337

338+
private final long warnThreshold;
341339
private final ThreadPool threadPool;
342340
private final ConcurrentHashMap<Thread, Long> registry = new ConcurrentHashMap<>();
343341

344342
private volatile boolean stopped;
345343

346-
TransportThreadWatchdog(ThreadPool threadPool) {
344+
TransportThreadWatchdog(ThreadPool threadPool, Settings settings) {
347345
this.threadPool = threadPool;
346+
warnThreshold = ThreadPool.ESTIMATED_TIME_INTERVAL_SETTING.get(settings).nanos() + TimeValue.timeValueMillis(100L).nanos();
348347
threadPool.schedule(this::logLongRunningExecutions, CHECK_INTERVAL, ThreadPool.Names.GENERIC);
349348
}
350349

@@ -361,7 +360,7 @@ public void unregister() {
361360

362361
private void maybeLogElapsedTime(long startTime) {
363362
long elapsedTime = threadPool.relativeTimeInNanos() - startTime;
364-
if (elapsedTime > WARN_THRESHOLD) {
363+
if (elapsedTime > warnThreshold) {
365364
logger.warn(
366365
new ParameterizedMessage("Slow execution on network thread [{} milliseconds]",
367366
TimeUnit.NANOSECONDS.toMillis(elapsedTime)),
@@ -372,7 +371,7 @@ private void maybeLogElapsedTime(long startTime) {
372371
private void logLongRunningExecutions() {
373372
for (Map.Entry<Thread, Long> entry : registry.entrySet()) {
374373
final long elapsedTimeInNanos = threadPool.relativeTimeInNanos() - entry.getValue();
375-
if (elapsedTimeInNanos > WARN_THRESHOLD) {
374+
if (elapsedTimeInNanos > warnThreshold) {
376375
final Thread thread = entry.getKey();
377376
logger.warn("Potentially blocked execution on network thread [{}] [{} milliseconds]: \n{}", thread.getName(),
378377
TimeUnit.NANOSECONDS.toMillis(elapsedTimeInNanos),

test/framework/src/test/java/org/elasticsearch/transport/nio/TestEventHandlerTests.java

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
import org.apache.logging.log4j.LogManager;
2424
import org.elasticsearch.common.CheckedRunnable;
2525
import org.elasticsearch.common.logging.Loggers;
26+
import org.elasticsearch.common.settings.Settings;
2627
import org.elasticsearch.nio.ServerChannelContext;
2728
import org.elasticsearch.nio.SocketChannelContext;
2829
import org.elasticsearch.test.ESTestCase;
@@ -57,7 +58,7 @@ public void tearDown() throws Exception {
5758

5859
public void testLogOnElapsedTime() throws Exception {
5960
long start = System.nanoTime();
60-
long end = start + TimeUnit.MILLISECONDS.toNanos(200);
61+
long end = start + TimeUnit.MILLISECONDS.toNanos(400);
6162
AtomicBoolean isStart = new AtomicBoolean(true);
6263
LongSupplier timeSupplier = () -> {
6364
if (isStart.compareAndSet(true, false)) {
@@ -70,7 +71,7 @@ public void testLogOnElapsedTime() throws Exception {
7071
final ThreadPool threadPool = mock(ThreadPool.class);
7172
doAnswer(i -> timeSupplier.getAsLong()).when(threadPool).relativeTimeInNanos();
7273
TestEventHandler eventHandler =
73-
new TestEventHandler((e) -> {}, () -> null, new MockNioTransport.TransportThreadWatchdog(threadPool));
74+
new TestEventHandler(e -> {}, () -> null, new MockNioTransport.TransportThreadWatchdog(threadPool, Settings.EMPTY));
7475

7576
ServerChannelContext serverChannelContext = mock(ServerChannelContext.class);
7677
SocketChannelContext socketChannelContext = mock(SocketChannelContext.class);

0 commit comments

Comments
 (0)