From 40d18df66ff65bfa6b4a8535c26957a05245038f Mon Sep 17 00:00:00 2001 From: hui lai Date: Sun, 28 Dec 2025 02:56:23 +0800 Subject: [PATCH] [enhance](job) optimize auto resume rule to adapt VCG failover (#59421) ### What problem does this PR solve? https://github.com/apache/doris/pull/52515 introduced VCG (Virtual Compute Group) for multi-availability-zone disaster recovery. However, routine load jobs do not fully adapt to it: if a cluster in an availability zone crashes, VCG provides disaster recovery capabilities, but the job will not be automatically resumed. This PR therefore removes the `dead BE count` check when evaluating `isNeedAutoSchedule`. ### Release note None --- .../src/main/java/org/apache/doris/common/Config.java | 6 ------ .../apache/doris/load/routineload/ScheduleRule.java | 10 ---------- 2 files changed, 16 deletions(-) diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java index c91faab96615bf..73d7a1b39ab4ab 100644 --- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java +++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java @@ -1381,12 +1381,6 @@ public class Config extends ConfigBase { @ConfField public static boolean check_java_version = true; - /** - * it can't auto-resume routine load job as long as one of the backends is down - */ - @ConfField(mutable = true, masterOnly = true) - public static int max_tolerable_backend_down_num = 0; - /** * a period for auto resume routine load */ diff --git a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/ScheduleRule.java b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/ScheduleRule.java index 8454bba73031ca..aff189a7c37865 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/load/routineload/ScheduleRule.java +++ b/fe/fe-core/src/main/java/org/apache/doris/load/routineload/ScheduleRule.java @@ -59,16 +59,6 @@ public static boolean isNeedAutoSchedule(RoutineLoadJob 
jobRoutine) { && jobRoutine.pauseReason.getCode() != InternalErrorCode.MANUAL_PAUSE_ERR && jobRoutine.pauseReason.getCode() != InternalErrorCode.TOO_MANY_FAILURE_ROWS_ERR && jobRoutine.pauseReason.getCode() != InternalErrorCode.CANNOT_RESUME_ERR) { - int dead = deadBeCount(); - if (dead > Config.max_tolerable_backend_down_num) { - if (LOG.isDebugEnabled()) { - LOG.debug("dead backend num {} is larger than config {}, " - + "routine load job {} can not be auto rescheduled", - dead, Config.max_tolerable_backend_down_num, jobRoutine.id); - } - return false; - } - if (jobRoutine.latestResumeTimestamp == 0) { //the first resume jobRoutine.latestResumeTimestamp = System.currentTimeMillis(); jobRoutine.autoResumeCount = 1;