Skip to content

Commit dce57a2

Browse files
authored
Fix ob down ls timeout (#299)
* on SQLException, refresh the obTable roster * refresh tablet address after refreshing the obTable roster * refresh table roster if obTable is null * use force renew to update the roster * add log * update addrExpired after refresh
1 parent b466014 commit dce57a2

File tree

4 files changed

+96
-49
lines changed

4 files changed

+96
-49
lines changed

src/main/java/com/alipay/oceanbase/rpc/ObTableClient.java

Lines changed: 68 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -924,9 +924,9 @@ public void resetExecuteContinuousFailureCount(String tableName) {
924924
*
925925
* @throws Exception if fail
926926
*/
927-
public void syncRefreshMetadata() throws Exception {
927+
public void syncRefreshMetadata(boolean forceRenew) throws Exception {
928928

929-
if (System.currentTimeMillis() - lastRefreshMetadataTimestamp < metadataRefreshInterval) {
929+
if (!forceRenew && System.currentTimeMillis() - lastRefreshMetadataTimestamp < metadataRefreshInterval) {
930930
logger
931931
.warn(
932932
"try to lock metadata refreshing, it has refresh at: {}, dataSourceName: {}, url: {}",
@@ -947,7 +947,7 @@ public void syncRefreshMetadata() throws Exception {
947947

948948
try {
949949

950-
if (System.currentTimeMillis() - lastRefreshMetadataTimestamp < metadataRefreshInterval) {
950+
if (!forceRenew && System.currentTimeMillis() - lastRefreshMetadataTimestamp < metadataRefreshInterval) {
951951
logger.warn("it has refresh metadata at: {}, dataSourceName: {}, url: {}",
952952
lastRefreshMetadataTimestamp, dataSourceName, paramURL);
953953
return;
@@ -1295,7 +1295,7 @@ public TableEntry getOrRefreshTableEntry(final String tableName, final boolean r
12951295
if (logger.isInfoEnabled()) {
12961296
logger.info("server addr is expired and it will refresh metadata.");
12971297
}
1298-
syncRefreshMetadata();
1298+
syncRefreshMetadata(false);
12991299
tableEntryRefreshContinuousFailureCount.set(0);
13001300
} catch (ObTableEntryRefreshException e) {
13011301
RUNTIME.error("getOrRefreshTableEntry meet exception", e);
@@ -1307,11 +1307,11 @@ public TableEntry getOrRefreshTableEntry(final String tableName, final boolean r
13071307
if (tableEntryRefreshContinuousFailureCount.incrementAndGet() > tableEntryRefreshContinuousFailureCeiling) {
13081308
logger.error(LCD.convert("01-00019"),
13091309
tableEntryRefreshContinuousFailureCeiling);
1310-
syncRefreshMetadata();
1310+
syncRefreshMetadata(false);
13111311
tableEntryRefreshContinuousFailureCount.set(0);
13121312
} else if (e.isConnectInactive()) {
13131313
// getMetaRefreshConnection failed, maybe the server is down, so we need to refresh metadata directly
1314-
syncRefreshMetadata();
1314+
syncRefreshMetadata(false);
13151315
tableEntryRefreshContinuousFailureCount.set(0);
13161316
}
13171317
} catch (Throwable t) {
@@ -1326,7 +1326,7 @@ public TableEntry getOrRefreshTableEntry(final String tableName, final boolean r
13261326
"refresh table entry has tried {}-times failure and will sync refresh metadata",
13271327
refreshTryTimes);
13281328
}
1329-
syncRefreshMetadata();
1329+
syncRefreshMetadata(false);
13301330
return refreshTableEntry(tableEntry, tableName);
13311331
}
13321332
return tableEntry;
@@ -1405,7 +1405,7 @@ public TableEntry refreshTableLocationByTabletId(TableEntry tableEntry, String t
14051405
throw e;
14061406
} catch (ObTableServerCacheExpiredException e) {
14071407
RUNTIME.warn("RefreshTableEntry encountered an exception", e);
1408-
syncRefreshMetadata();
1408+
syncRefreshMetadata(false);
14091409
tableEntryRefreshContinuousFailureCount.set(0);
14101410
} catch (ObTableEntryRefreshException e) {
14111411
RUNTIME.error("getOrRefreshTableEntry meet exception", e);
@@ -1416,11 +1416,11 @@ public TableEntry refreshTableLocationByTabletId(TableEntry tableEntry, String t
14161416
if (tableEntryRefreshContinuousFailureCount.incrementAndGet() > tableEntryRefreshContinuousFailureCeiling) {
14171417
logger.error(LCD.convert("01-00019"),
14181418
tableEntryRefreshContinuousFailureCeiling);
1419-
syncRefreshMetadata();
1419+
syncRefreshMetadata(false);
14201420
tableEntryRefreshContinuousFailureCount.set(0);
14211421
} else if (e.isConnectInactive()) {
14221422
// getMetaRefreshConnection failed, maybe the server is down, so we need to refresh metadata directly
1423-
syncRefreshMetadata();
1423+
syncRefreshMetadata(false);
14241424
tableEntryRefreshContinuousFailureCount.set(0);
14251425
}
14261426
} catch (Throwable t) {
@@ -2022,38 +2022,49 @@ public ObPair<Long, ObTableParam> getTableInternal(String tableName, TableEntry
20222022
RUNTIME.error("Cannot get replica by partId: " + partId);
20232023
throw new ObTableGetException("Cannot get replica by partId: " + partId);
20242024
}
2025+
int retryTimes = 0;
20252026
ObServerAddr addr = replica.getAddr();
20262027
ObTable obTable = tableRoster.get(addr);
20272028
boolean addrExpired = addr.isExpired(serverAddressCachingTimeout);
2028-
if (obTable == null || addrExpired) {
2029-
if (obTable == null) {
2030-
logger.warn("Cannot get ObTable by addr {}, refreshing metadata.", addr);
2031-
syncRefreshMetadata();
2032-
}
2033-
if (addr.isExpired(serverAddressCachingTimeout)) {
2029+
while ((obTable == null || addrExpired) && retryTimes < 2) {
2030+
++retryTimes;
2031+
if (addrExpired) {
20342032
logger.info("Server addr {} is expired, refreshing tableEntry.", addr);
20352033
if (ObGlobal.obVsnMajor() >= 4) {
20362034
refreshTableLocationByTabletId(tableEntry, tableName, tabletId);
20372035
} else {
20382036
tableEntry = getOrRefreshTableEntry(tableName, true, waitForRefresh, false);
20392037
}
2038+
addrExpired = addr.isExpired(serverAddressCachingTimeout);
20402039
}
2041-
2042-
if (ObGlobal.obVsnMajor() >= 4) {
2043-
obPartitionLocationInfo = getOrRefreshPartitionInfo(tableEntry, tableName, tabletId);
2044-
replica = getPartitionLocation(obPartitionLocationInfo, route);
2045-
} else {
2046-
replica = getPartitionReplica(tableEntry, partitionId, route).getRight();
2047-
}
2048-
2049-
addr = replica.getAddr();
2050-
obTable = tableRoster.get(addr);
2051-
20522040
if (obTable == null) {
2053-
RUNTIME.error("Cannot get table by addr: " + addr);
2054-
throw new ObTableGetException("Cannot get table by addr: " + addr);
2041+
// need to refresh table roster to ensure the current roster is the latest
2042+
syncRefreshMetadata(true);
2043+
// the addr is wrong, need to refresh location
2044+
if (logger.isInfoEnabled()) {
2045+
logger.info("Cannot get ObTable by addr {}, refreshing metadata.", addr);
2046+
}
2047+
// refresh tablet location based on the latest roster, in case some of the observers have been killed
2048+
// and used the old location
2049+
tableEntry = refreshTableLocationByTabletId(tableEntry, tableName, tabletId);
2050+
if (ObGlobal.obVsnMajor() >= 4) {
2051+
obPartitionLocationInfo = getOrRefreshPartitionInfo(tableEntry, tableName, tabletId);
2052+
replica = getPartitionLocation(obPartitionLocationInfo, route);
2053+
} else {
2054+
replica = getPartitionReplica(tableEntry, partitionId, route).getRight();
2055+
}
2056+
if (replica == null) {
2057+
RUNTIME.error("Cannot get replica by partId: " + partId);
2058+
throw new ObTableGetException("Cannot get replica by partId: " + partId);
2059+
}
2060+
addr = replica.getAddr();
2061+
obTable = tableRoster.get(addr);
20552062
}
20562063
}
2064+
if (obTable == null) {
2065+
RUNTIME.error("cannot get table by addr: " + addr);
2066+
throw new ObTableGetException("obTable is null, addr is: " + addr.getIp() + ":" + addr.getSvrPort());
2067+
}
20572068
ObTableParam param = createTableParam(obTable, tableEntry, obPartitionLocationInfo, partId, tabletId);
20582069
if (ObGlobal.obVsnMajor() >= 4) {
20592070
} else {
@@ -2317,33 +2328,46 @@ private List<ObPair<Long, ObTableParam>> getTables(String tableName, ObTableQuer
23172328
ReplicaLocation replica = partIdWithReplica.getRight();
23182329
ObServerAddr addr = replica.getAddr();
23192330
ObTable obTable = tableRoster.get(addr);
2331+
int retryTimes = 0;
23202332
boolean addrExpired = addr.isExpired(serverAddressCachingTimeout);
2321-
if (addrExpired || obTable == null) {
2322-
if (obTable == null) {
2323-
logger.warn("Cannot get ObTable by addr {}, refreshing metadata.", addr);
2324-
syncRefreshMetadata();
2325-
}
2333+
while ((obTable == null || addrExpired) && retryTimes < 2) {
2334+
++retryTimes;
23262335
if (addrExpired) {
23272336
logger.info("Server addr {} is expired, refreshing tableEntry.", addr);
23282337
if (ObGlobal.obVsnMajor() >= 4) {
23292338
refreshTableLocationByTabletId(tableEntry, tableName, tabletId);
23302339
} else {
23312340
tableEntry = getOrRefreshTableEntry(tableName, true, waitForRefresh, false);
23322341
}
2342+
addrExpired = addr.isExpired(serverAddressCachingTimeout);
23332343
}
2334-
if (ObGlobal.obVsnMajor() >= 4) {
2335-
ObPartitionLocationInfo locationInfo = getOrRefreshPartitionInfo(tableEntry, tableName, tabletId);
2336-
replica = getPartitionLocation(locationInfo, route);
2337-
} else {
2338-
replica = getPartitionLocation(tableEntry, partId, route);
2344+
if (obTable == null) {
2345+
// need to refresh table roster to ensure the current roster is the latest
2346+
syncRefreshMetadata(true);
2347+
// the addr is wrong, need to refresh location
2348+
if (logger.isInfoEnabled()) {
2349+
logger.info("Cannot get ObTable by addr {}, refreshing metadata.", addr);
2350+
}
2351+
// refresh tablet location based on the latest roster, in case some of the observers have been killed
2352+
// and used the old location
2353+
tableEntry = refreshTableLocationByTabletId(tableEntry, tableName, tabletId);
2354+
if (ObGlobal.obVsnMajor() >= 4) {
2355+
ObPartitionLocationInfo locationInfo = getOrRefreshPartitionInfo(tableEntry, tableName, tabletId);
2356+
replica = getPartitionLocation(locationInfo, route);
2357+
} else {
2358+
replica = getPartitionLocation(tableEntry, partId, route);
2359+
}
2360+
if (replica == null) {
2361+
RUNTIME.error("Cannot get replica by partId: " + partId);
2362+
throw new ObTableGetException("Cannot get replica by partId: " + partId);
2363+
}
2364+
addr = replica.getAddr();
2365+
obTable = tableRoster.get(addr);
23392366
}
2340-
addr = replica.getAddr();
2341-
obTable = tableRoster.get(addr);
23422367
}
2343-
23442368
if (obTable == null) {
23452369
RUNTIME.error("cannot get table by addr: " + addr);
2346-
throw new ObTableGetException("cannot get table by addr: " + addr);
2370+
throw new ObTableGetException("obTable is null, addr is: " + addr.getIp() + ":" + addr.getSvrPort());
23472371
}
23482372

23492373
ObTableParam param = new ObTableParam(obTable);
@@ -2487,7 +2511,7 @@ public String tryGetTableNameFromTableGroupCache(final String tableGroupName,
24872511
if (logger.isInfoEnabled()) {
24882512
logger.info("server addr is expired and it will refresh metadata.");
24892513
}
2490-
syncRefreshMetadata();
2514+
syncRefreshMetadata(false);
24912515
} catch (Throwable t) {
24922516
RUNTIME.error("getOrRefreshTableName from TableGroup meet exception", t);
24932517
throw t;

src/main/java/com/alipay/oceanbase/rpc/location/LocationUtil.java

Lines changed: 26 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -782,12 +782,19 @@ private static TableEntry getTableEntryFromRemote(Connection connection, TableEn
782782
}
783783
}
784784
}
785+
} catch (SQLException e) {
786+
// cannot execute sql, maybe some of the observers have been killed
787+
RUNTIME.error(LCD.convert("01-00010"), key, e.getMessage());
788+
throw new ObTableEntryRefreshException("fail to get partition location entry from remote", e, true);
785789
} catch (ObTableNotExistException e) {
786790
// avoid to refresh meta for ObTableNotExistException
787791
RUNTIME.error("getTableEntryFromRemote meet exception", e);
788792
throw e;
789793
} catch (Exception e) {
790794
RUNTIME.error(LCD.convert("01-00009"), key, e);
795+
if (e instanceof ObTableEntryRefreshException) {
796+
throw e;
797+
}
791798
throw new ObTableEntryRefreshException(format(
792799
"fail to get table entry from remote, key=%s", key), e);
793800
} finally {
@@ -897,6 +904,10 @@ public static TableEntry getTableEntryLocationFromRemote(Connection connection,
897904
ps.setString(5, key.getTableName());
898905
rs = ps.executeQuery();
899906
getPartitionLocationFromResultSetByTablet(tableEntry, rs, partitionEntry, tabletId);
907+
} catch (SQLException e) {
908+
// cannot execute sql, maybe some of the observers have been killed
909+
RUNTIME.error(LCD.convert("01-00010"), key, tableEntry, e.getMessage());
910+
throw new ObTableEntryRefreshException("fail to get partition location entry from remote", e, true);
900911
} catch (Exception e) {
901912
RUNTIME.error(LCD.convert("01-00010"), key, tableEntry, e);
902913
throw new ObTablePartitionLocationRefreshException(format(
@@ -946,6 +957,9 @@ public static TableEntry getTableEntryLocationFromRemote(Connection connection,
946957
}
947958
rs = ps.executeQuery();
948959
partitionEntry = getPartitionLocationFromResultSet(tableEntry, rs, partitionEntry);
960+
} catch (SQLException e) {
961+
RUNTIME.error(LCD.convert("01-00010"), key, partitionNum, tableEntry, e);
962+
throw new ObTableEntryRefreshException("fail to get partition location entry from remote", e, true);
949963
} catch (Exception e) {
950964
RUNTIME.error(LCD.convert("01-00010"), key, partitionNum, tableEntry, e);
951965
throw new ObTablePartitionLocationRefreshException(format(
@@ -1061,7 +1075,8 @@ public static ObIndexInfo getIndexInfoFromRemote(ObServerAddr obServerAddr, ObUs
10611075

10621076
private static void fetchFirstPart(Connection connection, TableEntry tableEntry,
10631077
ObPartFuncType obPartFuncType)
1064-
throws ObTablePartitionInfoRefreshException {
1078+
throws ObTablePartitionInfoRefreshException,
1079+
SQLException {
10651080
String tableName = "";
10661081
TableEntryKey key = tableEntry.getTableEntryKey();
10671082
if (key != null) {
@@ -1109,6 +1124,8 @@ private static void fetchFirstPart(Connection connection, TableEntry tableEntry,
11091124
tableEntry.getPartitionInfo().setPartTabletIdMap(
11101125
parseFirstPartKeyHash(rs, tableEntry));
11111126
}
1127+
} catch (SQLException e) {
1128+
throw e;
11121129
} catch (Exception e) {
11131130
RUNTIME.error(LCD.convert("01-00011"), tableEntry, obPartFuncType, e);
11141131

@@ -1131,7 +1148,8 @@ private static void fetchFirstPart(Connection connection, TableEntry tableEntry,
11311148

11321149
private static void fetchSubPart(Connection connection, TableEntry tableEntry,
11331150
ObPartFuncType subPartFuncType)
1134-
throws ObTablePartitionInfoRefreshException {
1151+
throws ObTablePartitionInfoRefreshException,
1152+
SQLException {
11351153
String tableName = "";
11361154
TableEntryKey key = tableEntry.getTableEntryKey();
11371155
if (key != null) {
@@ -1178,6 +1196,8 @@ private static void fetchSubPart(Connection connection, TableEntry tableEntry,
11781196
tableEntry.getPartitionInfo().setPartTabletIdMap(
11791197
parseSubPartKeyHash(rs, tableEntry));
11801198
}
1199+
} catch (SQLException e) {
1200+
throw e;
11811201
} catch (Exception e) {
11821202
RUNTIME.error(LCD.convert("01-00012"), tableEntry, subPartFuncType, e);
11831203
throw new ObTablePartitionInfoRefreshException(format(
@@ -1454,7 +1474,8 @@ private static ReplicaLocation buildReplicaLocation(ResultSet rs) throws SQLExce
14541474
}
14551475

14561476
private static void fetchPartitionInfo(Connection connection, TableEntry tableEntry)
1457-
throws ObTablePartitionInfoRefreshException {
1477+
throws ObTablePartitionInfoRefreshException,
1478+
SQLException {
14581479
PreparedStatement pstmt = null;
14591480
ResultSet rs = null;
14601481
ObPartitionInfo info = null;
@@ -1477,6 +1498,8 @@ private static void fetchPartitionInfo(Connection connection, TableEntry tableEn
14771498
logger.info("get part info from remote info:{}", JSON.toJSON(info));
14781499
}
14791500
tableEntry.setPartitionInfo(info);
1501+
} catch (SQLException e) {
1502+
throw e;
14801503
} catch (Exception e) {
14811504
RUNTIME.error(LCD.convert("01-00014"), tableEntry);
14821505
RUNTIME.error("fail to get part info from remote");

src/test/java/com/alipay/oceanbase/rpc/bolt/ObTableClientTestBase.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1315,7 +1315,7 @@ public void syncRefreshMetaHelper(final ObTableClient obTableClient) {
13151315
public void run() {
13161316
for (int i = 0; i < 10; i++) {
13171317
try {
1318-
obTableClient.syncRefreshMetadata();
1318+
obTableClient.syncRefreshMetadata(false);
13191319
} catch (Exception e) {
13201320
e.printStackTrace();
13211321
Assert.fail();

src/test/java/com/alipay/oceanbase/rpc/util/ObTableHotkeyThrottleUtil.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ public void syncRefreshMetaHelper(final ObTableClient obTableClient) {
407407
public void run() {
408408
for (int i = 0; i < 10; i++) {
409409
try {
410-
obTableClient.syncRefreshMetadata();
410+
obTableClient.syncRefreshMetadata(false);
411411
} catch (Exception e) {
412412
e.printStackTrace();
413413
Assert.fail();

0 commit comments

Comments
 (0)