|
16 | 16 | */ |
17 | 17 | package com.alipay.oceanbase.rpc.location.model; |
18 | 18 |
|
| 19 | +import java.sql.Connection; |
| 20 | +import java.sql.SQLException; |
| 21 | +import java.sql.Statement; |
19 | 22 | import java.util.*; |
| 23 | +import java.util.concurrent.ConcurrentHashMap; |
20 | 24 | import java.util.concurrent.Executors; |
21 | 25 | import java.util.concurrent.ScheduledExecutorService; |
22 | 26 | import java.util.concurrent.TimeUnit; |
| 27 | +import java.util.concurrent.locks.Lock; |
| 28 | +import java.util.concurrent.locks.ReentrantLock; |
23 | 29 |
|
24 | 30 | import com.alipay.oceanbase.rpc.ObTableClient; |
| 31 | +import com.alipay.oceanbase.rpc.exception.ObTableTryLockTimeoutException; |
| 32 | +import com.alipay.oceanbase.rpc.exception.ObTableUnexpectedException; |
25 | 33 | import com.alipay.oceanbase.rpc.location.LocationUtil; |
26 | 34 | import org.slf4j.Logger; |
| 35 | + |
27 | 36 | import static com.alipay.oceanbase.rpc.util.TableClientLoggerFactory.getLogger; |
28 | 37 |
|
29 | 38 | public class RouteTableRefresher { |
30 | 39 |
|
31 | | - private static final Logger logger = getLogger(RouteTableRefresher.class); |
| 40 | + private static final Logger logger = getLogger(RouteTableRefresher.class); |
| 41 | + |
| 42 | + private static final String sql = "select 'detect server alive' from dual"; |
| 43 | + |
| 44 | + private final ObTableClient tableClient; |
| 45 | + |
| 46 | + private final ObUserAuth sysUA; |
| 47 | + |
| 48 | + private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(2); |
32 | 49 |
|
33 | | - private final ObTableClient tableClient; |
| 50 | + private final ConcurrentHashMap<ObServerAddr, Lock> suspectLocks = new ConcurrentHashMap<>(); // ObServer -> access lock |
34 | 51 |
|
35 | | - private final ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); |
| 52 | + private final ConcurrentHashMap<ObServerAddr, SuspectObServer> suspectServers = new ConcurrentHashMap<>(); // ObServer -> information structure |
36 | 53 |
|
37 | | - public RouteTableRefresher(ObTableClient tableClient) { |
| 54 | + private final HashMap<ObServerAddr, Long> serverLastAccessTimestamps = new HashMap<>(); // ObServer -> last access timestamp |
| 55 | + |
/**
 * Builds a refresher for the given client. Nothing is scheduled here;
 * the background tasks only begin once {@link #start()} is called.
 *
 * @param tableClient client stored for later use by this refresher
 * @param sysUA       credentials stored for the alive-probe connections
 */
public RouteTableRefresher(ObTableClient tableClient, ObUserAuth sysUA) {
    this.sysUA = sysUA;
    this.tableClient = tableClient;
}
| 60 | + |
| 61 | + /** |
| 62 | + * check whether observers have changed every 30 seconds |
| 63 | + * if changed, refresh in the background |
| 64 | + * */ |
| 65 | + public void start() { |
| 66 | + scheduler.scheduleAtFixedRate(this::doRsListCheck, 30, 30, TimeUnit.SECONDS); |
| 67 | + scheduler.scheduleWithFixedDelay(this::doCheckAliveTask, 1, 1, TimeUnit.SECONDS); |
| 68 | + } |
| 69 | + |
| 70 | + public void close() { |
| 71 | + try { |
| 72 | + scheduler.shutdown(); |
| 73 | + // wait at most 1 seconds to close the scheduler |
| 74 | + if (!scheduler.awaitTermination(1, TimeUnit.SECONDS)) { |
| 75 | + scheduler.shutdownNow(); |
| 76 | + } |
| 77 | + } catch (InterruptedException e) { |
| 78 | + logger.warn("scheduler await for terminate interrupted: {}.", e.getMessage()); |
| 79 | + scheduler.shutdownNow(); |
| 80 | + } |
39 | 81 | } |
40 | 82 |
|
41 | 83 | /** |
@@ -86,25 +128,164 @@ private void doRsListCheck() { |
86 | 128 | } |
87 | 129 | } |
88 | 130 |
|
89 | | - /** |
90 | | - * check whether observers have changed every 30 seconds |
91 | | - * if changed, refresh in the background |
92 | | - * */ |
93 | | - public void start() { |
94 | | - scheduler.scheduleAtFixedRate(this::doRsListCheck, 30, 30, TimeUnit.SECONDS); |
| 131 | + private void doCheckAliveTask() { |
| 132 | + for (Map.Entry<ObServerAddr, SuspectObServer> entry : suspectServers.entrySet()) { |
| 133 | + try { |
| 134 | + checkAlive(entry.getKey()); |
| 135 | + } catch (Exception e) { |
| 136 | + // silence resolving |
| 137 | + logger.warn("RouteTableRefresher::doCheckAliveTask fail, failed server: {}", entry.getKey().toString()); |
| 138 | + } |
| 139 | + } |
95 | 140 | } |
96 | 141 |
|
97 | | - public void close() { |
| 142 | + private void checkAlive(ObServerAddr addr) { |
| 143 | + long connectTimeout = 1000L; // 1s |
| 144 | + long socketTimeout = 5000L; // 5s |
| 145 | + int failureLimit = 3; |
| 146 | + String url = LocationUtil.formatObServerUrl(addr, connectTimeout, socketTimeout); |
| 147 | + TableRoute tableRoute = tableClient.getTableRoute(); |
| 148 | + Connection connection = null; |
| 149 | + Statement statement = null; |
98 | 150 | try { |
99 | | - scheduler.shutdown(); |
100 | | - // wait at most 1 seconds to close the scheduler |
101 | | - if (!scheduler.awaitTermination(1, TimeUnit.SECONDS)) { |
102 | | - scheduler.shutdownNow(); |
| 151 | + connection = LocationUtil.getMetaRefreshConnection(url, sysUA); |
| 152 | + statement = connection.createStatement(); |
| 153 | + statement.execute(sql); |
| 154 | + removeFromSuspectIPs(addr); |
| 155 | + } catch (Exception e) { |
| 156 | + if (e instanceof SQLException) { |
| 157 | + SuspectObServer server = suspectServers.get(addr); |
| 158 | + server.incrementFailure(); |
| 159 | + int failure = server.getFailure(); |
| 160 | + if (failure >= failureLimit) { |
| 161 | + tableRoute.removeObServer(addr); |
| 162 | + removeFromSuspectIPs(addr); |
| 163 | + } |
| 164 | + logger.debug("background keep-alive mechanic failed to receive response, server: {}, failure: {}", |
| 165 | + addr, failure, e); |
| 166 | + } else { |
| 167 | + // silence resolving |
| 168 | + logger.warn("background check-alive mechanic meet exception, server: {}", addr.toString(), e); |
| 169 | + } |
| 170 | + } finally { |
| 171 | + try { |
| 172 | + if (statement != null) { |
| 173 | + statement.close(); |
| 174 | + } |
| 175 | + if (connection != null) { |
| 176 | + connection.close(); |
| 177 | + } |
| 178 | + } catch (SQLException e) { |
| 179 | + // ignore |
| 180 | + } |
| 181 | + } |
| 182 | + } |
| 183 | + |
| 184 | + public void addIntoSuspectIPs(SuspectObServer server) throws Exception { |
| 185 | + ObServerAddr addr = server.getAddr(); |
| 186 | + if (suspectServers.get(addr) != null) { |
| 187 | + // already in the list, directly return |
| 188 | + return; |
| 189 | + } |
| 190 | + long addInterval = 20000L; // 20s |
| 191 | + Lock tempLock = new ReentrantLock(); |
| 192 | + Lock lock = suspectLocks.putIfAbsent(addr, tempLock); |
| 193 | + lock = (lock == null) ? tempLock : lock; |
| 194 | + boolean acquired = false; |
| 195 | + try { |
| 196 | + int retryTimes = 0; |
| 197 | + while (true) { |
| 198 | + try { |
| 199 | + acquired = lock.tryLock(1, TimeUnit.SECONDS); |
| 200 | + if (!acquired) { |
| 201 | + throw new ObTableTryLockTimeoutException("try to get suspect server lock timeout, timeout: 1s"); |
| 202 | + } |
| 203 | + if (suspectServers.get(addr) != null) { |
| 204 | + // already in the list, directly break |
| 205 | + break; |
| 206 | + } |
| 207 | + Long lastServerAccessTs = serverLastAccessTimestamps.get(addr); |
| 208 | + if (lastServerAccessTs != null) { |
| 209 | + long interval = System.currentTimeMillis() - lastServerAccessTs; |
| 210 | + if (interval < addInterval) { |
| 211 | + // do not repeatedly add within 20 seconds since last adding |
| 212 | + break; |
| 213 | + } |
| 214 | + } |
| 215 | + suspectServers.put(addr, server); |
| 216 | + serverLastAccessTimestamps.put(addr, server.getAccessTimestamp()); |
| 217 | + break; |
| 218 | + } catch (ObTableTryLockTimeoutException e) { |
| 219 | + // if try lock timeout, need to retry |
| 220 | + ++retryTimes; |
| 221 | + logger.warn("wait to try lock to timeout 1s when add observer into suspect ips, server: {}, tryTimes: {}", |
| 222 | + addr.toString(), retryTimes, e); |
| 223 | + } |
| 224 | + } // end while |
| 225 | + } finally { |
| 226 | + if (acquired) { |
| 227 | + lock.unlock(); |
103 | 228 | } |
104 | | - } catch (InterruptedException e) { |
105 | | - logger.warn("scheduler await for terminate interrupted: {}.", e.getMessage()); |
106 | | - scheduler.shutdownNow(); |
107 | 229 | } |
108 | 230 | } |
109 | 231 |
|
| 232 | + private void removeFromSuspectIPs(ObServerAddr addr) { |
| 233 | + Lock lock = suspectLocks.get(addr); |
| 234 | + if (lock == null) { |
| 235 | + // lock must have been added before remove |
| 236 | + throw new ObTableUnexpectedException(String.format("ObServer [%s:%d] need to be add into suspect ips before remove", |
| 237 | + addr.getIp(), addr.getSvrPort())); |
| 238 | + } |
| 239 | + boolean acquired = false; |
| 240 | + try { |
| 241 | + int retryTimes = 0; |
| 242 | + while (true) { |
| 243 | + try { |
| 244 | + acquired = lock.tryLock(1, TimeUnit.SECONDS); |
| 245 | + if (!acquired) { |
| 246 | + throw new ObTableTryLockTimeoutException("try to get suspect server lock timeout, timeout: 1s"); |
| 247 | + } |
| 248 | + // no need to remove lock |
| 249 | + suspectServers.remove(addr); |
| 250 | + break; |
| 251 | + } catch (ObTableTryLockTimeoutException e) { |
| 252 | + // if try lock timeout, need to retry |
| 253 | + ++retryTimes; |
| 254 | + logger.warn("wait to try lock to timeout when add observer into suspect ips, server: {}, tryTimes: {}", |
| 255 | + addr.toString(), retryTimes, e); |
| 256 | + } catch (InterruptedException e) { |
| 257 | + // do not throw exception to user layer |
| 258 | + // next background task will continue to remove it |
| 259 | + logger.warn("waiting to get lock while interrupted by other threads", e); |
| 260 | + } |
| 261 | + } |
| 262 | + } finally { |
| 263 | + if (acquired) { |
| 264 | + lock.unlock(); |
| 265 | + } |
| 266 | + } |
| 267 | + } |
| 268 | + |
| 269 | + public static class SuspectObServer { |
| 270 | + private final ObServerAddr addr; |
| 271 | + private final long accessTimestamp; |
| 272 | + private int failure; |
| 273 | + public SuspectObServer(ObServerAddr addr) { |
| 274 | + this.addr = addr; |
| 275 | + accessTimestamp = System.currentTimeMillis(); |
| 276 | + failure = 0; |
| 277 | + } |
| 278 | + public ObServerAddr getAddr() { |
| 279 | + return this.addr; |
| 280 | + } |
| 281 | + public long getAccessTimestamp() { |
| 282 | + return this.accessTimestamp; |
| 283 | + } |
| 284 | + public int getFailure() { |
| 285 | + return this.failure; |
| 286 | + } |
| 287 | + public void incrementFailure() { |
| 288 | + ++failure; |
| 289 | + } |
| 290 | + } |
110 | 291 | } |
0 commit comments