2121import java .sql .SQLException ;
2222import java .sql .Statement ;
2323import java .util .*;
24- import java .util .concurrent .ConcurrentHashMap ;
25- import java .util .concurrent .Executors ;
26- import java .util .concurrent .ScheduledExecutorService ;
27- import java .util .concurrent .TimeUnit ;
24+ import java .util .concurrent .*;
2825import java .util .concurrent .locks .Lock ;
2926import java .util .concurrent .locks .ReentrantLock ;
3027
3330import com .alipay .oceanbase .rpc .exception .ObTableTryLockTimeoutException ;
3431import com .alipay .oceanbase .rpc .exception .ObTableUnexpectedException ;
3532import com .alipay .oceanbase .rpc .location .LocationUtil ;
33+ import com .alipay .oceanbase .rpc .table .ObTable ;
3634import org .slf4j .Logger ;
3735
3836import static com .alipay .oceanbase .rpc .util .TableClientLoggerFactory .getLogger ;
@@ -41,6 +39,8 @@ public class RouteTableRefresher {
4139
4240 private static final Logger logger = getLogger (RouteTableRefresher .class );
4341
42+ private static final int failureLimit = 3 ;
43+
4444 private static final String sql = "select 'detect server alive' from dual" ;
4545
4646 private final ObTableClient tableClient ;
@@ -136,7 +136,7 @@ private void doCheckAliveTask() {
136136 checkAlive (entry .getKey ());
137137 } catch (Exception e ) {
138138 // silence resolving
139- logger .warn ("RouteTableRefresher::doCheckAliveTask fail, failed server: {}" , entry .getKey ().toString ());
139+ logger .warn ("RouteTableRefresher::doCheckAliveTask fail, failed server: {}" , entry .getKey ().toString (), e );
140140 }
141141 }
142142 }
@@ -198,7 +198,7 @@ private void checkAlive(ObServerAddr addr) {
198198 }
199199 }
200200
201- public static void addIntoSuspectIPs (SuspectObServer server ) throws Exception {
201+ public static void addIntoSuspectIPs (SuspectObServer server ) throws InterruptedException {
202202 ObServerAddr addr = server .getAddr ();
203203 if (suspectServers .get (addr ) != null ) {
204204 // already in the list, directly return
@@ -264,7 +264,16 @@ private void removeFromSuspectIPs(ObServerAddr addr) {
264264 throw new ObTableTryLockTimeoutException ("try to get suspect server lock timeout, timeout: 1s" );
265265 }
266266 // no need to remove lock
267- suspectServers .remove (addr );
267+ SuspectObServer server = suspectServers .remove (addr );
268+ if (server != null ) {
269+ int failure = server .getFailure ();
270+ if (failure < failureLimit ) {
271+ ObTable obTable = tableClient .getTableRoute ().getTableRoster ().getTable (addr );
272+ if (obTable != null && !obTable .isValid ()) {
273+ obTable .setValid ();
274+ }
275+ }
276+ }
268277 logger .debug ("removed server from suspect list: {}" , addr );
269278 break ;
270279 } catch (ObTableTryLockTimeoutException e ) {
@@ -286,7 +295,6 @@ private void removeFromSuspectIPs(ObServerAddr addr) {
286295 }
287296
288297 private void calcFailureOrClearCache (ObServerAddr addr ) {
289- int failureLimit = 3 ;
290298 TableRoute tableRoute = tableClient .getTableRoute ();
291299 SuspectObServer server = suspectServers .get (addr );
292300 server .incrementFailure ();
0 commit comments