Skip to content

Commit f62df52

Browse files
Work around JVM Bug in LongGCDisruptionTests (#50731) (#50975)
There is a JVM bug causing `Thread#suspend` calls to randomly take multiple seconds breaking these tests that call the method numerous times in a loop. Increasing the timeout would will not work since we may call `suspend` tens if not hundreds of times and even a small number of them experiencing the blocking will lead to multiple minutes of waiting. This PR detects the specific issue by timing the `Thread#suspend` calls and skips the remainder of the test if it timed out because of the JVM bug. Closes #50047
1 parent 1a96cb9 commit f62df52

File tree

2 files changed

+27
-1
lines changed

2 files changed

+27
-1
lines changed

test/framework/src/main/java/org/elasticsearch/test/disruption/LongGCDisruption.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@
3232
import java.util.Random;
3333
import java.util.Set;
3434
import java.util.concurrent.ConcurrentHashMap;
35+
import java.util.concurrent.TimeUnit;
36+
import java.util.concurrent.atomic.AtomicBoolean;
3537
import java.util.concurrent.atomic.AtomicReference;
3638
import java.util.regex.Pattern;
3739
import java.util.stream.Collectors;
@@ -56,11 +58,23 @@ public class LongGCDisruption extends SingleNodeDisruption {
5658
private Set<Thread> suspendedThreads;
5759
private Thread blockDetectionThread;
5860

61+
private final AtomicBoolean sawSlowSuspendBug = new AtomicBoolean(false);
62+
5963
public LongGCDisruption(Random random, String disruptedNode) {
6064
super(random);
6165
this.disruptedNode = disruptedNode;
6266
}
6367

68+
/**
69+
* Checks if during disruption we ran into a known JVM issue that makes {@link Thread#suspend()} calls block for multiple seconds
70+
* was observed.
71+
* @see <a href=https://bugs.openjdk.java.net/browse/JDK-8218446>JDK-8218446</a>
72+
* @return true if during thread suspending a call to {@link Thread#suspend()} took more than 3s
73+
*/
74+
public boolean sawSlowSuspendBug() {
75+
return sawSlowSuspendBug.get();
76+
}
77+
6478
@Override
6579
public synchronized void startDisrupting() {
6680
if (suspendedThreads == null) {
@@ -251,7 +265,11 @@ protected boolean suspendThreads(Set<Thread> nodeThreads) {
251265
* assuming that it is safe.
252266
*/
253267
boolean definitelySafe = true;
268+
final long startTime = System.nanoTime();
254269
thread.suspend();
270+
if (System.nanoTime() - startTime > TimeUnit.SECONDS.toNanos(3L)) {
271+
sawSlowSuspendBug.set(true);
272+
}
255273
// double check the thread is not in a shared resource like logging; if so, let it go and come back
256274
safe:
257275
for (StackTraceElement stackElement : thread.getStackTrace()) {

test/framework/src/test/java/org/elasticsearch/test/disruption/LongGCDisruptionTests.java

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -146,14 +146,22 @@ protected Pattern[] getUnsafeClasses() {
146146
threads[i].start();
147147
}
148148
// make sure some threads are under lock
149-
disruption.startDisrupting();
149+
try {
150+
disruption.startDisrupting();
151+
} catch (RuntimeException e) {
152+
if (e.getMessage().contains("suspending node threads took too long") && disruption.sawSlowSuspendBug()) {
153+
return;
154+
}
155+
throw new AssertionError(e);
156+
}
150157
long first = ops.get();
151158
assertThat(lockedExecutor.lock.isLocked(), equalTo(false)); // no threads should own the lock
152159
Thread.sleep(100);
153160
assertThat(ops.get(), equalTo(first));
154161
disruption.stopDisrupting();
155162
assertBusy(() -> assertThat(ops.get(), greaterThan(first)));
156163
} finally {
164+
disruption.stopDisrupting();
157165
stop.set(true);
158166
for (final Thread thread : threads) {
159167
thread.join();

0 commit comments

Comments
 (0)