Skip to content

Commit ad1047e

Browse files
branch-3.1: [fix](cloud)Support cloud_tablet_rebalancer_interval_second config dynamic modification #58198 (#58376)
Cherry-picked from #58198 Co-authored-by: deardeng <dengxin@selectdb.com>
1 parent dfe2e8a commit ad1047e

File tree

3 files changed

+130
-3
lines changed

3 files changed

+130
-3
lines changed

fe/fe-common/src/main/java/org/apache/doris/common/Config.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3178,7 +3178,7 @@ public static int metaServiceRpcRetryTimes() {
31783178
public static int drop_user_notify_ms_max_times = 86400;
31793179

31803180
@ConfField(mutable = true, masterOnly = true)
3181-
public static long cloud_tablet_rebalancer_interval_second = 20;
3181+
public static long cloud_tablet_rebalancer_interval_second = 1;
31823182

31833183
@ConfField(mutable = true, masterOnly = true)
31843184
public static boolean enable_cloud_partition_balance = true;

fe/fe-core/src/main/java/org/apache/doris/cloud/catalog/CloudTabletRebalancer.java

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -316,7 +316,23 @@ protected void runAfterCatalogReady() {
316316

317317
checkDecommissionState(clusterToBes);
318318
inited = true;
319-
LOG.info("finished to rebalancer. cost: {} ms", (System.currentTimeMillis() - start));
319+
long sleepSeconds = Config.cloud_tablet_rebalancer_interval_second;
320+
if (sleepSeconds < 0L) {
321+
LOG.warn("cloud tablet rebalance interval second is negative, change it to default 1s");
322+
sleepSeconds = 1L;
323+
}
324+
long balanceEnd = System.currentTimeMillis();
325+
if (DebugPointUtil.isEnable("CloudTabletRebalancer.balanceEnd.tooLong")) {
326+
LOG.info("debug pointCloudTabletRebalancer.balanceEnd.tooLong");
327+
// slower the balance end time to trigger next balance immediately
328+
balanceEnd += (Config.cloud_tablet_rebalancer_interval_second + 10L) * 1000L;
329+
}
330+
if (balanceEnd - start > Config.cloud_tablet_rebalancer_interval_second * 1000L) {
331+
sleepSeconds = 0L;
332+
}
333+
setInterval(sleepSeconds * 1000L);
334+
LOG.info("finished to rebalancer. cost: {} ms, rebalancer sche interval {} s",
335+
(System.currentTimeMillis() - start), sleepSeconds);
320336
}
321337

322338
private void buildClusterToBackendMap() {
@@ -907,7 +923,7 @@ private Map<Long, Boolean> sendCheckWarmUpCacheAsyncRpc(List<Long> tabletIds, lo
907923
LOG.warn("check pre tablets {} cache status {} {}", tabletIds, result.getStatus().getStatusCode(),
908924
result.getStatus().getErrorMsgs());
909925
} else {
910-
LOG.info("check pre tablets {} cache succ status {} {}", tabletIds, result.getStatus().getStatusCode(),
926+
LOG.debug("check pre tablets {} cache succ status {} {}", tabletIds, result.getStatus().getStatusCode(),
911927
result.getStatus().getErrorMsgs());
912928
}
913929
return result.getTaskDone();
Lines changed: 111 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,111 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
import org.apache.doris.regression.suite.ClusterOptions
19+
import org.codehaus.groovy.runtime.IOGroovyMethods
20+
21+
suite('test_expanding_node_balance', 'docker') {
22+
if (!isCloudMode()) {
23+
return;
24+
}
25+
26+
def clusterOptions = [
27+
new ClusterOptions(),
28+
new ClusterOptions(),
29+
new ClusterOptions(),
30+
]
31+
32+
for (options in clusterOptions) {
33+
options.feConfigs += [
34+
'cloud_cluster_check_interval_second=1',
35+
'cloud_tablet_rebalancer_interval_second=20',
36+
'sys_log_verbose_modules=org',
37+
'heartbeat_interval_second=1',
38+
'rehash_tablet_after_be_dead_seconds=3600',
39+
'cloud_warm_up_for_rebalance_type=peer_read_async_warmup',
40+
// disable Auto Analysis Job Executor
41+
'auto_check_statistics_in_minutes=60',
42+
]
43+
options.cloudMode = true
44+
options.setFeNum(1)
45+
options.setBeNum(1)
46+
options.enableDebugPoints()
47+
}
48+
49+
50+
def testCase = { command, expectCost ->
51+
sql """
52+
CREATE TABLE `fact_sales` (
53+
`order_id` varchar(255) NOT NULL,
54+
`order_line_id` varchar(255) NOT NULL,
55+
`order_date` date NOT NULL,
56+
`time_of_day` varchar(50) NOT NULL,
57+
`season` varchar(50) NOT NULL,
58+
`month` int NOT NULL,
59+
`location_id` varchar(255) NOT NULL,
60+
`region` varchar(100) NOT NULL,
61+
`product_name` varchar(255) NOT NULL,
62+
`quantity` int NOT NULL,
63+
`sales_amount` double NOT NULL,
64+
`discount_percentage` int NOT NULL,
65+
`product_id` varchar(255) NOT NULL
66+
) ENGINE=OLAP
67+
DUPLICATE KEY(`order_id`, `order_line_id`)
68+
DISTRIBUTED BY HASH(`order_id`) BUCKETS 256
69+
PROPERTIES (
70+
"replication_allocation" = "tag.location.default: 1"
71+
)
72+
"""
73+
74+
cluster.addBackend(15, "compute_cluster")
75+
76+
sql """
77+
$command
78+
"""
79+
def begin = System.currentTimeMillis();
80+
awaitUntil(1000, 10) {
81+
def showRet = sql_return_maparray """ADMIN SHOW REPLICA DISTRIBUTION FROM fact_sales"""
82+
logger.info("show result {}", showRet)
83+
showRet.any { row ->
84+
Integer.valueOf((String) row.ReplicaNum) == 16
85+
}
86+
}
87+
def cost = (System.currentTimeMillis() - begin) / 1000;
88+
log.info("exec command: {}\n time cost: {}s", command, cost)
89+
assertTrue(cost < expectCost, "cost assert wrong")
90+
}
91+
92+
docker(clusterOptions[0]) {
93+
def command = 'admin set frontend config("cloud_min_balance_tablet_num_per_run"="16");'
94+
// assert < 300s
95+
testCase(command, 300)
96+
}
97+
98+
docker(clusterOptions[1]) {
99+
def command = 'admin set frontend config("cloud_tablet_rebalancer_interval_second"="0");'
100+
// assert < 50s
101+
testCase(command, 50)
102+
}
103+
104+
docker(clusterOptions[2]) {
105+
GetDebugPoint().enableDebugPointForAllFEs("CloudTabletRebalancer.balanceEnd.tooLong")
106+
// do nothing
107+
def command = 'select 1'
108+
// assert < 50s
109+
testCase(command, 50)
110+
}
111+
}

0 commit comments

Comments
 (0)