1717package com .alipay .oceanbase .rpc .location .model ;
1818
1919import java .sql .Connection ;
20+ import java .sql .ResultSet ;
2021import java .sql .SQLException ;
2122import java .sql .Statement ;
2223import java .util .*;
2829import java .util .concurrent .locks .ReentrantLock ;
2930
3031import com .alipay .oceanbase .rpc .ObTableClient ;
32+ import com .alipay .oceanbase .rpc .exception .ObTableEntryRefreshException ;
3133import com .alipay .oceanbase .rpc .exception .ObTableTryLockTimeoutException ;
3234import com .alipay .oceanbase .rpc .exception .ObTableUnexpectedException ;
3335import com .alipay .oceanbase .rpc .location .LocationUtil ;
@@ -142,33 +144,51 @@ private void doCheckAliveTask() {
142144 private void checkAlive (ObServerAddr addr ) {
143145 long connectTimeout = 1000L ; // 1s
144146 long socketTimeout = 5000L ; // 5s
145- int failureLimit = 3 ;
146147 String url = LocationUtil .formatObServerUrl (addr , connectTimeout , socketTimeout );
147- TableRoute tableRoute = tableClient .getTableRoute ();
148148 Connection connection = null ;
149149 Statement statement = null ;
150+ ResultSet rs = null ;
150151 try {
152+ logger .debug ("[background keep alive] check alive, server: {}" , addr );
151153 connection = LocationUtil .getMetaRefreshConnection (url , sysUA );
152154 statement = connection .createStatement ();
153- statement .execute (sql );
154- removeFromSuspectIPs (addr );
155- } catch (Exception e ) {
156- if (e instanceof SQLException ) {
157- SuspectObServer server = suspectServers .get (addr );
158- server .incrementFailure ();
159- int failure = server .getFailure ();
160- if (failure >= failureLimit ) {
161- tableRoute .removeObServer (addr );
155+ rs = statement .executeQuery (sql );
156+ boolean alive = false ;
157+ while (rs .next ()) {
158+ String res = rs .getString ("detect server alive" );
159+ logger .debug ("[background keep alive] result: {}" , res );
160+ alive = res .equalsIgnoreCase ("detect server alive" );
161+ }
162+ if (alive ) {
163+ logger .debug ("[background keep alive] alive, remove server: {}" , addr );
164+ removeFromSuspectIPs (addr );
165+ } else {
166+ calcFailureOrClearCache (addr );
167+ }
168+ } catch (Throwable t ) {
169+ logger .debug ("check alive failed, server: {}" , addr , t );
170+ if (t instanceof SQLException ) {
171+ // occurred during query
172+ calcFailureOrClearCache (addr );
173+ } if (t instanceof ObTableEntryRefreshException ) {
174+ // occurred during connection construction
175+ ObTableEntryRefreshException e = (ObTableEntryRefreshException ) t ;
176+ if (e .isConnectInactive ()) {
177+ calcFailureOrClearCache (addr );
178+ } else {
179+ logger .warn ("background check-alive mechanic meet ObTableEntryRefreshException, server: {}" , addr .toString (), t );
162180 removeFromSuspectIPs (addr );
163181 }
164- logger .debug ("background keep-alive mechanic failed to receive response, server: {}, failure: {}" ,
165- addr , failure , e );
166182 } else {
167183 // silence resolving
168- logger .warn ("background check-alive mechanic meet exception, server: {}" , addr .toString (), e );
184+ logger .warn ("background check-alive mechanic meet exception, server: {}" , addr .toString (), t );
185+ removeFromSuspectIPs (addr );
169186 }
170187 } finally {
171188 try {
189+ if (rs != null ) {
190+ rs .close ();
191+ }
172192 if (statement != null ) {
173193 statement .close ();
174194 }
@@ -182,6 +202,7 @@ private void checkAlive(ObServerAddr addr) {
182202 }
183203
184204 public void addIntoSuspectIPs (SuspectObServer server ) throws Exception {
205+ logger .debug ("[background keep alive] enter addInto" );
185206 ObServerAddr addr = server .getAddr ();
186207 if (suspectServers .get (addr ) != null ) {
187208 // already in the list, directly return
@@ -212,6 +233,7 @@ public void addIntoSuspectIPs(SuspectObServer server) throws Exception {
212233 break ;
213234 }
214235 }
236+ logger .debug ("[background keep alive] add into ips, server: {}" , addr );
215237 suspectServers .put (addr , server );
216238 serverLastAccessTimestamps .put (addr , server .getAccessTimestamp ());
217239 break ;
@@ -230,6 +252,7 @@ public void addIntoSuspectIPs(SuspectObServer server) throws Exception {
230252 }
231253
232254 private void removeFromSuspectIPs (ObServerAddr addr ) {
255+ logger .debug ("[background keep alive] remove server, server:{}" , addr );
233256 Lock lock = suspectLocks .get (addr );
234257 if (lock == null ) {
235258 // lock must have been added before remove
@@ -247,6 +270,7 @@ private void removeFromSuspectIPs(ObServerAddr addr) {
247270 }
248271 // no need to remove lock
249272 suspectServers .remove (addr );
273+ logger .debug ("[background keep alive] removed server: {}" , addr );
250274 break ;
251275 } catch (ObTableTryLockTimeoutException e ) {
252276 // if try lock timeout, need to retry
@@ -266,6 +290,20 @@ private void removeFromSuspectIPs(ObServerAddr addr) {
266290 }
267291 }
268292
293+ private void calcFailureOrClearCache (ObServerAddr addr ) {
294+ int failureLimit = 3 ;
295+ TableRoute tableRoute = tableClient .getTableRoute ();
296+ SuspectObServer server = suspectServers .get (addr );
297+ server .incrementFailure ();
298+ int failure = server .getFailure ();
299+ if (failure >= failureLimit ) {
300+ tableRoute .removeObServer (addr );
301+ removeFromSuspectIPs (addr );
302+ }
303+ logger .debug ("background keep-alive mechanic failed to receive response, server: {}, failure: {}" ,
304+ addr , failure );
305+ }
306+
269307 public static class SuspectObServer {
270308 private final ObServerAddr addr ;
271309 private final long accessTimestamp ;
0 commit comments