Skip to content

Commit 0b4b741

Browse files
authored
Reinstate disruption-related stateless masters tests (#95409)
Ensures that nodes in the `CoordinatorTests` which are disconnected or blackholed are also unable to access the shared object store, and reinstates the tests which verify that we react appropriately to this kind of disruption.
1 parent 4d6e451 commit 0b4b741

File tree

2 files changed

+65
-93
lines changed

2 files changed

+65
-93
lines changed

server/src/test/java/org/elasticsearch/cluster/coordination/AtomicRegisterCoordinatorTests.java

Lines changed: 59 additions & 90 deletions
Original file line numberDiff line numberDiff line change
@@ -28,10 +28,13 @@
2828
import org.elasticsearch.test.junit.annotations.TestLogging;
2929
import org.elasticsearch.threadpool.ThreadPool;
3030

31+
import java.io.IOException;
3132
import java.util.HashMap;
3233
import java.util.Map;
3334
import java.util.OptionalLong;
3435
import java.util.Set;
36+
import java.util.concurrent.atomic.AtomicLong;
37+
import java.util.concurrent.atomic.AtomicReference;
3538
import java.util.function.BooleanSupplier;
3639
import java.util.function.Function;
3740
import java.util.function.LongSupplier;
@@ -48,31 +51,12 @@ public void testLeaderDisconnectionWithDisconnectEventDetectedQuickly() {
4851
// In this test the leader still has access to the register, therefore it is still considered as a leader.
4952
}
5053

51-
@Override
52-
@AwaitsFix(bugUrl = "ES-5645")
53-
public void testLeaderDisconnectionWithoutDisconnectEventDetectedQuickly() {
54-
// In this test the leader still has access to the register, therefore it is still considered as a leader.
55-
}
56-
57-
@Override
58-
@AwaitsFix(bugUrl = "ES-5645")
59-
public void testMasterStatsOnFailedUpdate() {
60-
// In this test the leader still has access to the register, therefore it is still considered as a leader, and it can perform
61-
// updates.
62-
}
63-
6454
@Override
6555
@AwaitsFix(bugUrl = "ES-5645")
6656
public void testUnhealthyLeaderIsReplaced() {
6757
// In this test the leader still has access to the register, therefore it is still considered as a leader.
6858
}
6959

70-
@Override
71-
@AwaitsFix(bugUrl = "ES-5645")
72-
public void testUnresponsiveLeaderDetectedEventually() {
73-
// In this test the leader still has access to the register, therefore it is still considered as a leader.
74-
}
75-
7660
@Override
7761
@AwaitsFix(bugUrl = "ES-5645")
7862
public void testLogsWarningPeriodicallyIfClusterNotFormed() {
@@ -91,47 +75,12 @@ public void testAckListenerReceivesNacksIfPublicationTimesOut() {
9175
// The leader still has access to the register, therefore it acknowledges the state update
9276
}
9377

94-
@Override
95-
@AwaitsFix(bugUrl = "ES-5645")
96-
public void testAppliesNoMasterBlockWritesByDefault() {
97-
// If the disconnected node is the leader it will continue to have connectivity
98-
// into the register and therefore the no master block won't be applied
99-
}
100-
101-
@Override
102-
@AwaitsFix(bugUrl = "ES-5645")
103-
public void testAppliesNoMasterBlockWritesIfConfigured() {
104-
// If the disconnected node is the leader it will continue to have connectivity
105-
// into the register and therefore the no master block won't be applied
106-
}
107-
108-
@Override
109-
@AwaitsFix(bugUrl = "ES-5645")
110-
public void testAppliesNoMasterBlockAllIfConfigured() {
111-
// If the disconnected node is the leader it will continue to have connectivity
112-
// into the register and therefore the no master block won't be applied
113-
}
114-
115-
@Override
116-
@AwaitsFix(bugUrl = "ES-5645")
117-
public void testAppliesNoMasterBlockMetadataWritesIfConfigured() {
118-
// If the disconnected node is the leader it will continue to have connectivity
119-
// into the register and therefore the no master block won't be applied
120-
}
121-
12278
@Override
12379
@AwaitsFix(bugUrl = "ES-5645")
12480
public void testClusterCannotFormWithFailingJoinValidation() {
12581
// A single node can form a cluster in this case
12682
}
12783

128-
@Override
129-
@AwaitsFix(bugUrl = "ES-5645")
130-
public void testReportsConnectBackProblemsDuringJoining() {
131-
// If the partitioned node is the leader, it still has access
132-
// to the store, therefore the test fail
133-
}
134-
13584
@Override
13685
@AwaitsFix(bugUrl = "ES-5645")
13786
public void testCannotJoinClusterWithDifferentUUID() {
@@ -165,34 +114,30 @@ public void testJoiningNodeReceivesFullState() {
165114

166115
@Override
167116
protected CoordinatorStrategy getCoordinatorStrategy() {
168-
var atomicRegister = new AtomicRegister();
169-
var sharedStore = new SharedStore();
170-
return new AtomicRegisterCoordinatorStrategy(atomicRegister, sharedStore);
117+
return new AtomicRegisterCoordinatorStrategy();
171118
}
172119

173120
class AtomicRegisterCoordinatorStrategy implements CoordinatorStrategy {
174-
private final AtomicRegister atomicRegister;
175-
private final SharedStore sharedStore;
176-
177-
AtomicRegisterCoordinatorStrategy(AtomicRegister atomicRegister, SharedStore sharedStore) {
178-
this.atomicRegister = atomicRegister;
179-
this.sharedStore = sharedStore;
180-
}
121+
private final AtomicLong currentTermRef = new AtomicLong();
122+
private final AtomicReference<Heartbeat> heartBeatRef = new AtomicReference<>();
123+
private final SharedStore sharedStore = new SharedStore();
181124

182125
@Override
183126
public CoordinationServices getCoordinationServices(
184127
ThreadPool threadPool,
185128
Settings settings,
186129
ClusterSettings clusterSettings,
187-
CoordinationState.PersistedState persistedState
130+
CoordinationState.PersistedState persistedState,
131+
BooleanSupplier isDisruptedSupplier
188132
) {
189133
final TimeValue heartbeatFrequency = HEARTBEAT_FREQUENCY.get(settings);
190-
var atomicHeartbeat = new StoreHeartbeatService(
191-
sharedStore,
134+
final var atomicRegister = new AtomicRegister(currentTermRef, isDisruptedSupplier);
135+
final var atomicHeartbeat = new StoreHeartbeatService(
136+
new SharedHeartbeatStore(heartBeatRef, isDisruptedSupplier),
192137
threadPool,
193138
heartbeatFrequency,
194139
TimeValue.timeValueMillis(heartbeatFrequency.millis() * MAX_MISSED_HEARTBEATS.get(settings)),
195-
listener -> listener.onResponse(OptionalLong.of(atomicRegister.readCurrentTerm()))
140+
listener -> ActionListener.completeWith(listener, () -> OptionalLong.of(atomicRegister.readCurrentTerm()))
196141
);
197142
var reconfigurator = new SingleNodeReconfigurator(settings, clusterSettings);
198143
var electionStrategy = new AtomicRegisterElectionStrategy(atomicRegister);
@@ -324,30 +269,29 @@ public boolean isInvalidReconfiguration(
324269
@Override
325270
public void beforeCommit(long term, long version, ActionListener<Void> listener) {
326271
// TODO: add a test to ensure that this gets called
327-
final var currentTerm = register.readCurrentTerm();
328-
if (currentTerm == term) {
329-
listener.onResponse(null);
330-
} else {
331-
assert term < currentTerm : term + " vs " + currentTerm;
332-
listener.onFailure(
333-
new CoordinationStateRejectedException(
272+
ActionListener.completeWith(listener, () -> {
273+
final var currentTerm = register.readCurrentTerm();
274+
if (currentTerm == term) {
275+
return null;
276+
} else {
277+
assert term < currentTerm : term + " vs " + currentTerm;
278+
throw new CoordinationStateRejectedException(
334279
Strings.format(
335280
"could not commit cluster state version %d in term %d, current term is now %d",
336281
version,
337282
term,
338283
currentTerm
339284
)
340-
)
341-
);
342-
}
285+
);
286+
}
287+
});
343288
}
344289
}
345290

346291
record PersistentClusterState(long term, long version, Metadata state) {}
347292

348-
private static class SharedStore implements HeartbeatStore {
293+
private static class SharedStore {
349294
private final Map<Long, PersistentClusterState> clusterStateByTerm = new HashMap<>();
350-
private Heartbeat heartbeat;
351295

352296
private void writeClusterState(ClusterState clusterState) {
353297
clusterStateByTerm.put(
@@ -367,32 +311,57 @@ void getClusterStateForTerm(long termGoal, ActionListener<PersistentClusterState
367311
return null;
368312
});
369313
}
314+
}
315+
316+
private static class SharedHeartbeatStore implements HeartbeatStore {
317+
318+
private final AtomicReference<Heartbeat> hearbeatRef;
319+
private final BooleanSupplier isDisruptedSupplier;
320+
321+
SharedHeartbeatStore(AtomicReference<Heartbeat> hearbeatRef, BooleanSupplier isDisruptedSupplier) {
322+
this.hearbeatRef = hearbeatRef;
323+
this.isDisruptedSupplier = isDisruptedSupplier;
324+
}
370325

371326
@Override
372327
public void writeHeartbeat(Heartbeat newHeartbeat, ActionListener<Void> listener) {
373-
this.heartbeat = newHeartbeat;
328+
if (isDisruptedSupplier.getAsBoolean()) {
329+
listener.onFailure(new IOException("simulating disrupted access to shared store"));
330+
}
331+
hearbeatRef.set(newHeartbeat);
374332
listener.onResponse(null);
375333
}
376334

377335
@Override
378336
public void readLatestHeartbeat(ActionListener<Heartbeat> listener) {
379-
listener.onResponse(heartbeat);
337+
if (isDisruptedSupplier.getAsBoolean()) {
338+
listener.onFailure(new IOException("simulating disrupted access to shared store"));
339+
}
340+
listener.onResponse(hearbeatRef.get());
380341
}
381342
}
382343

383344
private static class AtomicRegister {
384-
private long currentTerm;
345+
private final AtomicLong currentTermRef;
346+
private final BooleanSupplier isDisruptedSupplier;
385347

386-
long readCurrentTerm() {
387-
return currentTerm;
348+
AtomicRegister(AtomicLong currentTermRef, BooleanSupplier isDisruptedSupplier) {
349+
this.currentTermRef = currentTermRef;
350+
this.isDisruptedSupplier = isDisruptedSupplier;
351+
}
352+
353+
long readCurrentTerm() throws IOException {
354+
if (isDisruptedSupplier.getAsBoolean()) {
355+
throw new IOException("simulating disrupted access to shared store");
356+
}
357+
return currentTermRef.get();
388358
}
389359

390-
long compareAndExchange(long expected, long updated) {
391-
final var witness = currentTerm;
392-
if (currentTerm == expected) {
393-
currentTerm = updated;
360+
long compareAndExchange(long expected, long updated) throws IOException {
361+
if (isDisruptedSupplier.getAsBoolean()) {
362+
throw new IOException("simulating disrupted access to shared store");
394363
}
395-
return witness;
364+
return currentTermRef.compareAndExchange(expected, updated);
396365
}
397366
}
398367

test/framework/src/main/java/org/elasticsearch/cluster/coordination/AbstractCoordinatorTestCase.java

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1086,7 +1086,8 @@ public RecyclerBytesStreamOutput newNetworkBytesStream() {
10861086
threadPool,
10871087
settings,
10881088
clusterSettings,
1089-
persistedState
1089+
persistedState,
1090+
() -> disconnectedNodes.contains(localNode.getId()) || blackholedNodes.contains(localNode.getId())
10901091
);
10911092
coordinator = new Coordinator(
10921093
"test_node",
@@ -1480,7 +1481,8 @@ CoordinationServices getCoordinationServices(
14801481
ThreadPool threadPool,
14811482
Settings settings,
14821483
ClusterSettings clusterSettings,
1483-
CoordinationState.PersistedState persistedState
1484+
CoordinationState.PersistedState persistedState,
1485+
BooleanSupplier isDisruptedSupplier
14841486
);
14851487

14861488
CoordinationState.PersistedState createFreshPersistedState(
@@ -1529,7 +1531,8 @@ public CoordinationServices getCoordinationServices(
15291531
ThreadPool threadPool,
15301532
Settings settings,
15311533
ClusterSettings clusterSettings,
1532-
CoordinationState.PersistedState persistedState
1534+
CoordinationState.PersistedState persistedState,
1535+
BooleanSupplier isDisruptedSupplier
15331536
) {
15341537
return new CoordinationServices() {
15351538
@Override

0 commit comments

Comments
 (0)