Skip to content

Commit b621455

Browse files
authored
[fix](cloud) Fix the issue where it takes a long time to come alive on first boot (apache#58152)
Fix the abort-txn call in the heartbeat path being too slow. This caused the first startup to take a very long time to come alive.
1 parent 112a299 commit b621455

File tree

3 files changed

+92
-6
lines changed

3 files changed

+92
-6
lines changed

fe/fe-core/src/main/java/org/apache/doris/cloud/transaction/CloudGlobalTransactionMgr.java

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2091,6 +2091,14 @@ public void abortTxnWhenCoordinateBeRestart(long coordinateBeId, String coordina
20912091
response = MetaServiceProxy
20922092
.getInstance().abortTxnWithCoordinator(request);
20932093
LOG.info("AbortTxnWithCoordinatorResponse: {}", response);
2094+
if (DebugPointUtil.isEnable("FE.abortTxnWhenCoordinateBeRestart.slow")) {
2095+
LOG.info("debug point FE.abortTxnWhenCoordinateBeRestart.slow enabled, sleep 15s");
2096+
try {
2097+
Thread.sleep(15 * 1000);
2098+
} catch (InterruptedException ie) {
2099+
LOG.info("error ", ie);
2100+
}
2101+
}
20942102
} catch (RpcException e) {
20952103
LOG.warn("Abort txn on coordinate BE {} failed, msg={}", coordinateHost, e.getMessage());
20962104
}

fe/fe-core/src/main/java/org/apache/doris/system/HeartbeatMgr.java

Lines changed: 32 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,7 @@ public class HeartbeatMgr extends MasterDaemon {
7878
private final ExecutorService executor;
7979
private SystemInfoService nodeMgr;
8080
private HeartbeatFlags heartbeatFlags;
81+
private final ExecutorService abortTxnExecutor;
8182

8283
private static volatile AtomicReference<TMasterInfo> masterInfo = new AtomicReference<>();
8384

@@ -86,6 +87,8 @@ public HeartbeatMgr(SystemInfoService nodeMgr, boolean needRegisterMetric) {
8687
this.nodeMgr = nodeMgr;
8788
this.executor = ThreadPoolManager.newDaemonFixedThreadPool(Config.heartbeat_mgr_threads_num,
8889
Config.heartbeat_mgr_blocking_queue_size, "heartbeat-mgr-pool", needRegisterMetric);
90+
this.abortTxnExecutor = ThreadPoolManager.newDaemonFixedThreadPool(1,
91+
Config.heartbeat_mgr_blocking_queue_size, "abort-txn-executor", needRegisterMetric);
8992
this.heartbeatFlags = new HeartbeatFlags();
9093
}
9194

@@ -192,18 +195,21 @@ private boolean handleHbResponse(HeartbeatResponse response, boolean isReplay) {
192195
boolean isChanged = be.handleHbResponse(hbResponse, isReplay);
193196
if (hbResponse.getStatus() == HbStatus.OK) {
194197
long newStartTime = be.getLastStartTime();
198+
// oldStartTime > 0 means it is not the first heartbeat
195199
if (!isReplay && Config.enable_abort_txn_by_checking_coordinator_be
196-
&& oldStartTime != newStartTime) {
197-
Env.getCurrentGlobalTransactionMgr().abortTxnWhenCoordinateBeRestart(
198-
be.getId(), be.getHost(), newStartTime);
200+
&& oldStartTime != newStartTime && oldStartTime > 0) {
201+
submitAbortTxnTaskByExecutor(() -> Env.getCurrentGlobalTransactionMgr()
202+
.abortTxnWhenCoordinateBeRestart(be.getId(), be.getHost(), newStartTime),
203+
"restart");
199204
}
200205
} else {
201206
// invalid all connections cached in ClientPool
202207
ClientPool.backendPool.clearPool(new TNetworkAddress(be.getHost(), be.getBePort()));
203208
if (!isReplay && System.currentTimeMillis() - be.getLastUpdateMs()
204-
>= Config.abort_txn_after_lost_heartbeat_time_second * 1000L) {
205-
Env.getCurrentGlobalTransactionMgr().abortTxnWhenCoordinateBeDown(
206-
be.getId(), be.getHost(), 100);
209+
>= Config.abort_txn_after_lost_heartbeat_time_second * 1000L
210+
&& be.getLastUpdateMs() > 0) {
211+
submitAbortTxnTaskByExecutor(() -> Env.getCurrentGlobalTransactionMgr()
212+
.abortTxnWhenCoordinateBeDown(be.getId(), be.getHost(), 100), "down");
207213
}
208214
}
209215
return isChanged;
@@ -230,6 +236,26 @@ private boolean handleHbResponse(HeartbeatResponse response, boolean isReplay) {
230236
return false;
231237
}
232238

239+
private void submitAbortTxnTaskByExecutor(Runnable task, String reason) {
240+
long start = System.currentTimeMillis();
241+
try {
242+
abortTxnExecutor.submit(() -> {
243+
LOG.info("start abort txn task, reason={}, start_ts={}", reason, start);
244+
try {
245+
task.run();
246+
long duration = System.currentTimeMillis() - start;
247+
LOG.info("finish abort txn task, reason={}, start_ts={}, cost_ms={}", reason, start, duration);
248+
} catch (Exception e) {
249+
long duration = System.currentTimeMillis() - start;
250+
LOG.warn("abort txn task({}) failed, start_ts={}, cost_ms={}", reason, start, duration, e);
251+
}
252+
});
253+
} catch (Exception e) {
254+
long duration = System.currentTimeMillis() - start;
255+
LOG.warn("failed to submit abort txn task({}), start_ts={}, cost_ms={}", reason, start, duration, e);
256+
}
257+
}
258+
233259
// backend heartbeat
234260
private class BackendHeartbeatHandler implements Callable<HeartbeatResponse> {
235261
private Backend backend;
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
import groovy.json.JsonSlurper
18+
import groovy.json.JsonOutput
19+
import org.apache.doris.regression.suite.ClusterOptions
20+
21+
// Regression suite "test_cloud_add_backend_heartbeat" (groups: p0, docker).
// It starts a docker cluster in cloud mode, enables the FE debug point
// "FE.abortTxnWhenCoordinateBeRestart.slow", then adds a backend and restarts the backends.
// The suite body contains no explicit assertions.
suite("test_cloud_add_backend_heartbeat", 'p0, docker') {
22+
// Only meaningful in cloud mode; do nothing otherwise.
if (!isCloudMode()) {
23+
return
24+
}
25+
26+
def options = new ClusterOptions()
27+
// Extra FE configs; judging by their names they shorten the cluster-check and heartbeat
// intervals to 1s and turn on verbose logging for "org" -- TODO confirm against FE Config.
options.feConfigs += [
28+
'cloud_cluster_check_interval_second=1',
29+
'sys_log_verbose_modules=org',
30+
'heartbeat_interval_second=1'
31+
]
32+
// One FE and one BE, with debug points enabled and cloud mode on.
options.setFeNum(1)
33+
options.setBeNum(1)
34+
options.enableDebugPoints()
35+
options.cloudMode = true
36+
37+
docker(options) {
38+
// Log the first meta-service's host and HTTP port for reference.
def ms = cluster.getAllMetaservices().get(0)
39+
def msHttpPort = ms.host + ":" + ms.httpPort
40+
logger.info("ms1 addr={}, port={}, ms endpoint={}", ms.host, ms.httpPort, msHttpPort)
41+
42+
// Enable the debug point on all FEs before the backend is added and restarted.
GetDebugPoint().enableDebugPointForAllFEs("FE.abortTxnWhenCoordinateBeRestart.slow")
43+
44+
// NOTE(review): the meaning of the first argument (10) is not visible here -- confirm
// against the regression framework's addBackend signature.
cluster.addBackend(10, "new_cluster")
45+
46+
sql """admin set frontend config("cloud_tablet_rebalancer_interval_second"="3");"""
47+
48+
cluster.restartBackends();
49+
50+
}
51+
52+
}

0 commit comments

Comments
 (0)