Skip to content

Commit 1d79acd

Browse files
committed
HDDS-14425. Implement Ratis follower read exception handling
1 parent a69f4d8 commit 1d79acd

File tree

8 files changed

+244
-7
lines changed

8 files changed

+244
-7
lines changed
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.hadoop.ozone.om.exceptions;
19+
20+
import java.io.IOException;
21+
import org.apache.ratis.protocol.RaftPeerId;
22+
import org.apache.ratis.protocol.exceptions.ReadException;
23+
24+
/**
25+
* Exceptions thrown by
26+
* {@link org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB} when
27+
* Ratis read fails due reasons such as timeout.
28+
*/
29+
public class OMReadException extends IOException {
30+
31+
public OMReadException(RaftPeerId currentPeerId, String cause) {
32+
super("OM: " + currentPeerId + " read failed. " +
33+
(cause != null ? "Cause: " + cause : ""));
34+
}
35+
36+
/**
37+
* Convert {@link ReadException} to {@link OMReadException}.
38+
* @param readException Ratis Read exception.
39+
* @param currentPeer Current peer.
40+
* @return OMReadException.
41+
*/
42+
public static OMReadException convertToOMReadException(
43+
ReadException readException, RaftPeerId currentPeer) {
44+
Throwable cause = readException.getCause();
45+
return new OMReadException(currentPeer,
46+
cause != null ? cause.getMessage() : null);
47+
}
48+
}
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
18+
package org.apache.hadoop.ozone.om.exceptions;
19+
20+
import java.io.IOException;
21+
import org.apache.ratis.protocol.RaftPeerId;
22+
import org.apache.ratis.protocol.exceptions.ReadIndexException;
23+
24+
/**
25+
* Exceptions thrown by
26+
* {@link org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB} when
27+
* ReadIndex call fails to complete.
28+
*/
29+
public class OMReadIndexException extends IOException {
30+
31+
public OMReadIndexException(RaftPeerId currentPeerId, String cause) {
32+
super("OM: " + currentPeerId + " read index failed. " +
33+
(cause != null ? "Cause: " + cause : ""));
34+
}
35+
36+
/**
37+
* Convert {@link ReadIndexException} to {@link OMReadIndexException}.
38+
* @param readIndexException Ratis ReadIndex exception.
39+
* @param currentPeer Current peer.
40+
* @return OMReadIndexException
41+
*/
42+
public static OMReadIndexException convertToOMReadIndexException(
43+
ReadIndexException readIndexException, RaftPeerId currentPeer) {
44+
Throwable cause = readIndexException.getCause();
45+
return new OMReadIndexException(currentPeer,
46+
cause != null ? cause.getMessage() : null);
47+
}
48+
}

hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/HadoopRpcOMFollowerReadFailoverProxyProvider.java

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@
1919

2020
import static org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getLeaderNotReadyException;
2121
import static org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getNotLeaderException;
22+
import static org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getReadException;
23+
import static org.apache.hadoop.ozone.om.ha.OMFailoverProxyProviderBase.getReadIndexException;
2224

2325
import com.google.common.annotations.VisibleForTesting;
2426
import com.google.protobuf.RpcController;
@@ -38,6 +40,8 @@
3840
import org.apache.hadoop.ozone.OmUtils;
3941
import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException;
4042
import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException;
43+
import org.apache.hadoop.ozone.om.exceptions.OMReadException;
44+
import org.apache.hadoop.ozone.om.exceptions.OMReadIndexException;
4145
import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB;
4246
import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.OMRequest;
4347
import org.slf4j.Logger;
@@ -263,6 +267,18 @@ public Object invoke(Object proxy, final Method method, final Object[] args)
263267
// If we break here instead, we will retry the same leader again without waiting
264268
throw e;
265269
}
270+
271+
OMReadIndexException readIndexException = getReadIndexException(e);
272+
if (readIndexException != null) {
273+
// This should trigger failover in the following shouldFailover
274+
LOG.debug("Encountered OMReadIndexException from {}. ", current.proxyInfo);
275+
}
276+
277+
OMReadException readException = getReadException(e);
278+
if (readException != null) {
279+
// This should trigger failover in the following shouldFailover
280+
LOG.debug("Encountered OMReadException from {}. ", current.proxyInfo);
281+
}
266282
}
267283

268284
if (!failoverProxy.shouldFailover(e)) {

hadoop-ozone/common/src/main/java/org/apache/hadoop/ozone/om/ha/OMFailoverProxyProviderBase.java

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
import org.apache.hadoop.ozone.om.exceptions.OMException;
4343
import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException;
4444
import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException;
45+
import org.apache.hadoop.ozone.om.exceptions.OMReadException;
46+
import org.apache.hadoop.ozone.om.exceptions.OMReadIndexException;
4547
import org.apache.hadoop.security.AccessControlException;
4648
import org.apache.hadoop.security.UserGroupInformation;
4749
import org.apache.hadoop.security.token.SecretManager;
@@ -434,6 +436,44 @@ public static OMNotLeaderException getNotLeaderException(
434436
return null;
435437
}
436438

439+
/**
440+
* Unwrap the exception and return the wrapped ReadIndexException if any.
441+
*
442+
* @param exception exception to unwrap.
443+
* @return the unwrapped OMReadIndexException or null if the wrapped
444+
* exception is not OMReadIndexException.
445+
*/
446+
public static OMReadIndexException getReadIndexException(Exception exception) {
447+
Throwable cause = exception.getCause();
448+
if (cause instanceof RemoteException) {
449+
IOException ioException =
450+
((RemoteException) cause).unwrapRemoteException();
451+
if (ioException instanceof OMReadIndexException) {
452+
return (OMReadIndexException) ioException;
453+
}
454+
}
455+
return null;
456+
}
457+
458+
/**
459+
* Unwrap the exception and return the wrapped ReadException if any.
460+
*
461+
* @param exception exception to unwrap.
462+
* @return the unwrapped OMReadException or null if the wrapped
463+
* exception is not OMReadException.
464+
*/
465+
public static OMReadException getReadException(Exception exception) {
466+
Throwable cause = exception.getCause();
467+
if (cause instanceof RemoteException) {
468+
IOException ioException =
469+
((RemoteException) cause).unwrapRemoteException();
470+
if (ioException instanceof OMReadException) {
471+
return (OMReadException) ioException;
472+
}
473+
}
474+
return null;
475+
}
476+
437477
protected ConfigurationSource getConf() {
438478
return conf;
439479
}

hadoop-ozone/common/src/test/java/org/apache/hadoop/ozone/om/ha/TestHadoopRpcOMFollowerReadFailoverProxyProvider.java

Lines changed: 45 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,8 @@
5858
import org.apache.hadoop.ozone.ha.ConfUtils;
5959
import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException;
6060
import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException;
61+
import org.apache.hadoop.ozone.om.exceptions.OMReadException;
62+
import org.apache.hadoop.ozone.om.exceptions.OMReadIndexException;
6163
import org.apache.hadoop.ozone.om.protocolPB.OzoneManagerProtocolPB;
6264
import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.CreateKeyRequest;
6365
import org.apache.hadoop.ozone.protocol.proto.OzoneManagerProtocolProtos.GetKeyInfoRequest;
@@ -337,6 +339,22 @@ void testNullRequest() throws Exception {
337339
assertInstanceOf(RpcNoSuchProtocolException.class, exception.getCause());
338340
}
339341

342+
@Test
343+
void testReadIndexException() throws Exception {
344+
setupProxyProvider(3);
345+
omNodeAnswers[0].isThrowReadIndexException = true;
346+
doRead();
347+
assertHandledBy(1);
348+
}
349+
350+
@Test
351+
void testReadException() throws Exception {
352+
setupProxyProvider(3);
353+
omNodeAnswers[0].isThrowReadException = true;
354+
doRead();
355+
assertHandledBy(1);
356+
}
357+
340358
private void setupProxyProvider(int omNodeCount) throws Exception {
341359
setupProxyProvider(omNodeCount, new OzoneConfiguration());
342360
}
@@ -489,6 +507,8 @@ private static class OMAnswer {
489507
private volatile boolean isLeader = false;
490508
private volatile boolean isLeaderReady = true;
491509
private volatile boolean isFollowerReadSupported = true;
510+
private volatile boolean isThrowReadIndexException = false;
511+
private volatile boolean isThrowReadException = false;
492512

493513
private OMProtocolAnswer clientAnswer = new OMProtocolAnswer();
494514

@@ -524,13 +544,31 @@ public OMResponse answer(InvocationOnMock invocationOnMock) throws Throwable {
524544
}
525545
break;
526546
case GetKeyInfo:
527-
if (!isLeader && !isFollowerReadSupported) {
528-
throw new ServiceException(
529-
new RemoteException(
530-
OMNotLeaderException.class.getCanonicalName(),
531-
"OM follower read is not supported"
532-
)
533-
);
547+
if (!isLeader) {
548+
if (!isFollowerReadSupported) {
549+
throw new ServiceException(
550+
new RemoteException(
551+
OMNotLeaderException.class.getCanonicalName(),
552+
"OM follower read is not supported"
553+
)
554+
);
555+
}
556+
if (isThrowReadIndexException) {
557+
throw new ServiceException(
558+
new RemoteException(
559+
OMReadIndexException.class.getCanonicalName(),
560+
"ReadIndex exception"
561+
)
562+
);
563+
}
564+
if (isThrowReadException) {
565+
throw new ServiceException(
566+
new RemoteException(
567+
OMReadException.class.getCanonicalName(),
568+
"ReadException"
569+
)
570+
);
571+
}
534572
}
535573
if (isLeader && !isLeaderReady) {
536574
throw new ServiceException(

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerRead.java

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -428,6 +428,31 @@ protected void createKeyTest(boolean checkSuccess) throws Exception {
428428
}
429429
}
430430

431+
protected void listVolumes(boolean checkSuccess)
432+
throws Exception {
433+
try {
434+
getObjectStore().getClientProxy().listVolumes(null, null, 100);
435+
} catch (IOException e) {
436+
if (!checkSuccess) {
437+
// If the last OM to be tried by the RetryProxy is down, we would get
438+
// ConnectException. Otherwise, we would get a RemoteException from the
439+
// last running OM as it would fail to get a quorum.
440+
if (e instanceof RemoteException) {
441+
// Linearizable read will fail with OMReadIndexException if the follower does not recognize any leader
442+
// or leader is uncontactable. It will throw OMReadException if the read submitted to Ratis encounters
443+
// timeout.
444+
assertThat(e).hasMessageFindingMatch("OMRead(Index)?Exception");
445+
} else if (e instanceof ConnectException) {
446+
assertThat(e).hasMessageContaining("Connection refused");
447+
} else {
448+
assertThat(e).hasMessageContaining("Could not determine or connect to OM Leader");
449+
}
450+
} else {
451+
throw e;
452+
}
453+
}
454+
}
455+
431456
protected void waitForLeaderToBeReady()
432457
throws InterruptedException, TimeoutException {
433458
// Wait for Leader Election timeout

hadoop-ozone/integration-test/src/test/java/org/apache/hadoop/ozone/om/TestOzoneManagerHAFollowerReadWithStoppedNodes.java

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,8 +111,12 @@ void twoOMDown() throws Exception {
111111
getCluster().stopOzoneManager(2);
112112
Thread.sleep(NODE_FAILURE_TIMEOUT * 4);
113113

114+
// Write requests will fail with OMNotLeaderException
114115
createVolumeTest(false);
115116
createKeyTest(false);
117+
118+
// Read requests will fail with either OMReadIndexException or OMReadException
119+
listVolumes(false);
116120
}
117121

118122
@Test

hadoop-ozone/ozone-manager/src/main/java/org/apache/hadoop/ozone/om/ratis/OzoneManagerRatisServer.java

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@
6262
import org.apache.hadoop.ozone.om.exceptions.OMException;
6363
import org.apache.hadoop.ozone.om.exceptions.OMLeaderNotReadyException;
6464
import org.apache.hadoop.ozone.om.exceptions.OMNotLeaderException;
65+
import org.apache.hadoop.ozone.om.exceptions.OMReadException;
66+
import org.apache.hadoop.ozone.om.exceptions.OMReadIndexException;
6567
import org.apache.hadoop.ozone.om.helpers.OMNodeDetails;
6668
import org.apache.hadoop.ozone.om.helpers.OMRatisHelper;
6769
import org.apache.hadoop.ozone.om.ratis.utils.OzoneManagerRatisUtils;
@@ -88,6 +90,8 @@
8890
import org.apache.ratis.protocol.exceptions.LeaderNotReadyException;
8991
import org.apache.ratis.protocol.exceptions.LeaderSteppingDownException;
9092
import org.apache.ratis.protocol.exceptions.NotLeaderException;
93+
import org.apache.ratis.protocol.exceptions.ReadException;
94+
import org.apache.ratis.protocol.exceptions.ReadIndexException;
9195
import org.apache.ratis.protocol.exceptions.StateMachineException;
9296
import org.apache.ratis.rpc.RpcType;
9397
import org.apache.ratis.rpc.SupportedRpcType;
@@ -588,6 +592,20 @@ private OMResponse createOmResponseImpl(OMRequest omRequest,
588592
throw new ServiceException(new OMNotLeaderException(leaderSteppingDownException.getMessage()));
589593
}
590594

595+
ReadIndexException readIndexException = reply.getReadIndexException();
596+
if (readIndexException != null) {
597+
throw new ServiceException(
598+
OMReadIndexException.convertToOMReadIndexException(
599+
readIndexException, getRaftPeerId()));
600+
}
601+
602+
ReadException readException = reply.getReadException();
603+
if (readException != null) {
604+
throw new ServiceException(
605+
OMReadException.convertToOMReadException(
606+
readException, getRaftPeerId()));
607+
}
608+
591609
StateMachineException stateMachineException =
592610
reply.getStateMachineException();
593611
if (stateMachineException != null) {

0 commit comments

Comments
 (0)