Skip to content

Commit e2c867d

Browse files
authored
RATIS-2345. Leader stepDown could cause a deadlock. (#1300)
1 parent fd9ca3a commit e2c867d

File tree

3 files changed

+17
-7
lines changed

3 files changed

+17
-7
lines changed

ratis-server/src/main/java/org/apache/ratis/server/impl/LeaderStateImpl.java

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -703,10 +703,12 @@ void submitStepDownEvent(long term, StepDownReason reason) {
703703
private void stepDown(long term, StepDownReason reason) {
704704
try {
705705
lease.getAndSetEnabled(false);
706-
server.changeToFollowerAndPersistMetadata(term, false, reason).join();
706+
server.changeToFollowerAndPersistMetadata(term, false, reason)
707+
.get(5, TimeUnit.SECONDS);
707708
pendingStepDown.complete(server::newSuccessReply);
708-
} catch(IOException e) {
709-
final String s = this + ": Failed to persist metadata for term " + term;
709+
} catch(Exception e) {
710+
pendingStepDown.completeExceptionally(e);
711+
final String s = this + ": Failed to step down for term " + term;
710712
LOG.warn(s, e);
711713
// the failure should happen while changing the state to follower
712714
// thus the in-memory state should have been updated

ratis-server/src/main/java/org/apache/ratis/server/impl/PendingStepDown.java

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -56,8 +56,12 @@ void complete(Function<TransferLeadershipRequest, RaftClientReply> newSuccessRep
5656
replyFuture.complete(newSuccessReply.apply(request));
5757
}
5858

59+
void completeExceptionally(Exception e) {
60+
replyFuture.completeExceptionally(e);
61+
}
62+
5963
void timeout() {
60-
replyFuture.completeExceptionally(new TimeoutIOException(
64+
completeExceptionally(new TimeoutIOException(
6165
": Failed to step down leader on " + leader + "request " + request.getTimeoutMs() + "ms"));
6266
}
6367

@@ -105,6 +109,10 @@ void complete(Function<TransferLeadershipRequest, RaftClientReply> newSuccessRep
105109
pending.getAndSetNull().ifPresent(p -> p.complete(newSuccessReply));
106110
}
107111

112+
void completeExceptionally(Exception e) {
113+
pending.getAndSetNull().ifPresent(p -> p.completeExceptionally(e));
114+
}
115+
108116
void timeout() {
109117
pending.getAndSetNull().ifPresent(PendingRequest::timeout);
110118
}

ratis-server/src/test/java/org/apache/ratis/RaftBasicTests.java

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -360,7 +360,7 @@ static void testWithLoad(final int numClients, final int numMessages,
360360

361361
final Timer timer = new Timer();
362362
timer.schedule(new TimerTask() {
363-
private int previousLastStep = lastStep.get();
363+
private final AtomicInteger previousLastStep = new AtomicInteger(lastStep.get());
364364

365365
@Override
366366
public void run() {
@@ -371,8 +371,8 @@ public void run() {
371371
JavaUtils.dumpAllThreads(s -> log.info(s));
372372

373373
final int last = lastStep.get();
374-
if (last != previousLastStep) {
375-
previousLastStep = last;
374+
if (last != previousLastStep.get()) {
375+
previousLastStep.set(last);
376376
} else {
377377
final RaftServer.Division leader = cluster.getLeader();
378378
log.info("NO PROGRESS at " + last + ", try to restart leader=" + leader);

0 commit comments

Comments
 (0)