Skip to content

Commit 326fc81

Browse files
authored
[IOTDB-3642] Add retry mechanism when resource unavailable (#7240)
1 parent 403f5a9 commit 326fc81

File tree

2 files changed

+108
-4
lines changed

2 files changed

+108
-4
lines changed

consensus/src/main/java/org/apache/iotdb/consensus/config/RatisConfig.java

Lines changed: 57 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,20 +34,23 @@ public class RatisConfig {
3434
private final ThreadPool threadPool;
3535
private final Log log;
3636
private final Grpc grpc;
37+
private final RatisConsensus ratisConsensus;
3738

3839
private RatisConfig(
3940
Rpc rpc,
4041
LeaderElection leaderElection,
4142
Snapshot snapshot,
4243
ThreadPool threadPool,
4344
Log log,
44-
Grpc grpc) {
45+
Grpc grpc,
46+
RatisConsensus ratisConsensus) {
4547
this.rpc = rpc;
4648
this.leaderElection = leaderElection;
4749
this.snapshot = snapshot;
4850
this.threadPool = threadPool;
4951
this.log = log;
5052
this.grpc = grpc;
53+
this.ratisConsensus = ratisConsensus;
5154
}
5255

5356
public Rpc getRpc() {
@@ -74,6 +77,10 @@ public Grpc getGrpc() {
7477
return grpc;
7578
}
7679

80+
public RatisConsensus getRatisConsensus() {
81+
return ratisConsensus;
82+
}
83+
7784
public static Builder newBuilder() {
7885
return new Builder();
7986
}
@@ -85,6 +92,7 @@ public static class Builder {
8592
private ThreadPool threadPool;
8693
private Log log;
8794
private Grpc grpc;
95+
private RatisConsensus ratisConsensus;
8896

8997
public RatisConfig build() {
9098
return new RatisConfig(
@@ -93,7 +101,8 @@ public RatisConfig build() {
93101
snapshot != null ? snapshot : Snapshot.newBuilder().build(),
94102
threadPool != null ? threadPool : ThreadPool.newBuilder().build(),
95103
log != null ? log : Log.newBuilder().build(),
96-
grpc != null ? grpc : Grpc.newBuilder().build());
104+
grpc != null ? grpc : Grpc.newBuilder().build(),
105+
ratisConsensus != null ? ratisConsensus : RatisConsensus.newBuilder().build());
97106
}
98107

99108
public Builder setRpc(Rpc rpc) {
@@ -125,6 +134,11 @@ public Builder setGrpc(Grpc grpc) {
125134
this.grpc = grpc;
126135
return this;
127136
}
137+
138+
public Builder setRatisConsensus(RatisConsensus ratisConsensus) {
139+
this.ratisConsensus = ratisConsensus;
140+
return this;
141+
}
128142
}
129143

130144
/** server rpc timeout related */
@@ -692,4 +706,45 @@ public Grpc.Builder setLeaderOutstandingAppendsMax(int leaderOutstandingAppendsM
692706
}
693707
}
694708
}
709+
710+
public static class RatisConsensus {
711+
private final int retryTimesMax;
712+
private final long retryWaitMillis;
713+
714+
private RatisConsensus(int retryTimesMax, long retryWaitMillis) {
715+
this.retryTimesMax = retryTimesMax;
716+
this.retryWaitMillis = retryWaitMillis;
717+
}
718+
719+
public int getRetryTimesMax() {
720+
return retryTimesMax;
721+
}
722+
723+
public long getRetryWaitMillis() {
724+
return retryWaitMillis;
725+
}
726+
727+
public static RatisConsensus.Builder newBuilder() {
728+
return new Builder();
729+
}
730+
731+
public static class Builder {
732+
private int retryTimesMax = 3;
733+
private long retryWaitMillis = 500;
734+
735+
public RatisConsensus build() {
736+
return new RatisConsensus(retryTimesMax, retryWaitMillis);
737+
}
738+
739+
public RatisConsensus.Builder setRetryTimesMax(int retryTimesMax) {
740+
this.retryTimesMax = retryTimesMax;
741+
return this;
742+
}
743+
744+
public RatisConsensus.Builder setRetryWaitMillis(long retryWaitMillis) {
745+
this.retryWaitMillis = retryWaitMillis;
746+
return this;
747+
}
748+
}
749+
}
695750
}

consensus/src/main/java/org/apache/iotdb/consensus/ratis/RatisConsensus.java

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
import org.apache.iotdb.consensus.common.response.ConsensusReadResponse;
3939
import org.apache.iotdb.consensus.common.response.ConsensusWriteResponse;
4040
import org.apache.iotdb.consensus.config.ConsensusConfig;
41+
import org.apache.iotdb.consensus.config.RatisConfig;
4142
import org.apache.iotdb.consensus.exception.ConsensusException;
4243
import org.apache.iotdb.consensus.exception.ConsensusGroupNotExistException;
4344
import org.apache.iotdb.consensus.exception.NodeReadOnlyException;
@@ -62,9 +63,11 @@
6263
import org.apache.ratis.protocol.RaftPeerId;
6364
import org.apache.ratis.protocol.SnapshotManagementRequest;
6465
import org.apache.ratis.protocol.exceptions.NotLeaderException;
66+
import org.apache.ratis.protocol.exceptions.ResourceUnavailableException;
6567
import org.apache.ratis.server.DivisionInfo;
6668
import org.apache.ratis.server.RaftServer;
6769
import org.apache.ratis.server.RaftServerConfigKeys;
70+
import org.apache.ratis.util.function.CheckedSupplier;
6871
import org.apache.thrift.TException;
6972
import org.slf4j.Logger;
7073
import org.slf4j.LoggerFactory;
@@ -111,6 +114,8 @@ class RatisConsensus implements IConsensus {
111114
// TODO make it configurable
112115
private static final int DEFAULT_WAIT_LEADER_READY_TIMEOUT = (int) TimeUnit.SECONDS.toMillis(20);
113116

117+
private final RatisConfig config;
118+
114119
public RatisConsensus(ConsensusConfig config, IStateMachine.Registry registry)
115120
throws IOException {
116121
myself = Utils.fromTEndPointAndPriorityToRaftPeer(config.getThisNode(), DEFAULT_PRIORITY);
@@ -122,6 +127,7 @@ public RatisConsensus(ConsensusConfig config, IStateMachine.Registry registry)
122127
GrpcConfigKeys.Server.setPort(properties, config.getThisNode().getPort());
123128

124129
Utils.initRatisConfig(properties, config.getRatisConfig());
130+
this.config = config.getRatisConfig();
125131

126132
clientRpc = new GrpcFactory(new Parameters()).newRaftClientRpc(ClientId.randomId(), properties);
127133

@@ -148,6 +154,49 @@ public void stop() throws IOException {
148154
server.close();
149155
}
150156

157+
private boolean shouldRetry(RaftClientReply reply) {
158+
// currently, we only retry when ResourceUnavailableException is caught
159+
return !reply.isSuccess()
160+
&& (reply.getException() != null
161+
&& reply.getException() instanceof ResourceUnavailableException);
162+
}
163+
/** launch a consensus write with retry mechanism */
164+
private RaftClientReply writeWithRetry(CheckedSupplier<RaftClientReply, IOException> caller)
165+
throws IOException {
166+
167+
final int maxRetryTimes = config.getRatisConsensus().getRetryTimesMax();
168+
final long waitMillis = config.getRatisConsensus().getRetryWaitMillis();
169+
170+
int retry = 0;
171+
RaftClientReply reply = null;
172+
while (retry < maxRetryTimes) {
173+
retry++;
174+
175+
reply = caller.get();
176+
if (!shouldRetry(reply)) {
177+
return reply;
178+
}
179+
logger.debug("{} sending write request with retry = {} and reply = {}", this, retry, reply);
180+
181+
try {
182+
Thread.sleep(waitMillis);
183+
} catch (InterruptedException e) {
184+
logger.warn("{} retry write sleep is interrupted: {}", this, e);
185+
Thread.currentThread().interrupt();
186+
}
187+
}
188+
return reply;
189+
}
190+
191+
private RaftClientReply writeLocallyWithRetry(RaftClientRequest request) throws IOException {
192+
return writeWithRetry(() -> server.submitClientRequest(request));
193+
}
194+
195+
private RaftClientReply writeRemotelyWithRetry(RatisClient client, Message message)
196+
throws IOException {
197+
return writeWithRetry(() -> client.getRaftClient().io().send(message));
198+
}
199+
151200
/**
152201
* write first submits the request to the local server directly; if that server is not the leader, it
153202
* will use RaftClient to send an RPC to the real leader
@@ -183,7 +232,7 @@ public ConsensusWriteResponse write(
183232
RaftPeer suggestedLeader = null;
184233
if (isLeader(consensusGroupId) && waitUntilLeaderReady(raftGroupId)) {
185234
try {
186-
localServerReply = server.submitClientRequest(clientRequest);
235+
localServerReply = writeLocallyWithRetry(clientRequest);
187236
if (localServerReply.isSuccess()) {
188237
ResponseMessage responseMessage = (ResponseMessage) localServerReply.getMessage();
189238
TSStatus writeStatus = (TSStatus) responseMessage.getContentHolder();
@@ -203,7 +252,7 @@ public ConsensusWriteResponse write(
203252
RatisClient client = null;
204253
try {
205254
client = getRaftClient(raftGroup);
206-
RaftClientReply reply = client.getRaftClient().io().send(message);
255+
RaftClientReply reply = writeRemotelyWithRetry(client, message);
207256
if (!reply.isSuccess()) {
208257
return failedWrite(new RatisRequestFailedException(reply.getException()));
209258
}

0 commit comments

Comments
 (0)