apache · caishunfeng · Nov 20, 2024 · Nov 19, 2024
@@ -294,7 +294,6 @@ Location: `master-server/conf/application.yaml`
 | master.server-load-protection.max-disk-usage-percentage-thresholds          | 0.7                          | Master max disk usage , when the master's disk usage is smaller then this value, master server can execute workflow.                                                                                                                                                                                                         |
 | master.failover-interval                                                    | 10                           | failover interval, the unit is minute                                                                                                                                                                                                                                                                                        |
 | master.kill-application-when-task-failover                                  | true                         | whether to kill yarn/k8s application when failover taskInstance                                                                                                                                                                                                                                                              |
-| master.registry-disconnect-strategy.strategy                                | stop                         | Used when the master disconnect from registry, default value: stop. Optional values include stop, waiting                                                                                                                                                                                                                    |
 | master.registry-disconnect-strategy.max-waiting-time                        | 100s                         | Used when the master disconnects from the registry and the disconnect strategy is waiting. The master will try to reconnect to the registry within the given time; if it still cannot connect after that time, it will stop itself. If the value is 0s, the master will wait indefinitely                                   |
 | master.worker-group-refresh-interval                                        | 10s                          | The interval to refresh worker group from db to memory                                                                                                                                                                                                                                                                       |
 | master.command-fetch-strategy.type                                          | ID_SLOT_BASED                | The command fetch strategy, only support `ID_SLOT_BASED`                                                                                                                                                                                                                                                                     |

@@ -33,4 +33,5 @@ This document records the incompatible updates between each version. You need to
 * Uniformly name `process` in code as `workflow` ([#16515](https://github.com/apache/dolphinscheduler/pull/16515))
 * Deprecated upgrade code of 1.x and 2.x ([#16543](https://github.com/apache/dolphinscheduler/pull/16543))
 * Remove the `Data Quality` module ([#16794](https://github.com/apache/dolphinscheduler/pull/16794))
+* Remove the `registry-disconnect-strategy` from `application.yaml` ([#16821](https://github.com/apache/dolphinscheduler/pull/16821))
 
@@ -276,30 +276,28 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
 
 位置：`master-server/conf/application.yaml`
 
-|                                     参数                                      |             默认值              |                                                                    描述                                                                    |
-|-----------------------------------------------------------------------------|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------|
-| master.listen-port                                                          | 5678                         | master监听端口                                                                                                                               |
-| master.pre-exec-threads                                                     | 10                           | master准备执行任务的数量，用于限制并行的command                                                                                                           |
-| master.exec-threads                                                         | 100                          | master工作线程数量,用于限制并行的流程实例数量                                                                                                               |
-| master.dispatch-task-number                                                 | 3                            | master每个批次的派发任务数量                                                                                                                        |
-| master.worker-load-balancer-configuration-properties.type                   | DYNAMIC_WEIGHTED_ROUND_ROBIN | Master 将会使用Worker的动态CPU/Memory/线程池使用率来计算Worker的负载，负载越低的worker将会有更高的机会被分发任务                                                               |
-| master.max-heartbeat-interval                                               | 10s                          | master最大心跳间隔                                                                                                                             |
-| master.task-commit-retry-times                                              | 5                            | 任务重试次数                                                                                                                                   |
-| master.task-commit-interval                                                 | 1000                         | 任务提交间隔,单位为毫秒                                                                                                                             |
-| master.state-wheel-interval                                                 | 5                            | 轮询检查状态时间                                                                                                                                 |
-| master.server-load-protection.enabled                                       | true                         | 是否开启系统保护策略                                                                                                                               |
-| master.server-load-protection.max-system-cpu-usage-percentage-thresholds    | 0.7                          | master最大系统cpu使用值,只有当前系统cpu使用值低于最大系统cpu使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统CPU                                                        |
-| master.server-load-protection.max-jvm-cpu-usage-percentage-thresholds       | 0.7                          | master最大JVM cpu使用值,只有当前JVM cpu使用值低于最大JVM cpu使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的JVM CPU                                                  |
-| master.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7                          | master最大系统 内存使用值,只有当前系统内存使用值低于最大系统内存使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统内存                                                           |
-| master.server-load-protection.max-disk-usage-percentage-thresholds          | 0.7                          | master最大系统磁盘使用值,只有当前系统磁盘使用值低于最大系统磁盘使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统磁盘空间                                                          |
-| master.failover-interval                                                    | 10                           | failover间隔，单位为分钟                                                                                                                         |
-| master.kill-application-when-task-failover                                  | true                         | 当任务实例failover时，是否kill掉yarn或k8s application                                                                                               |
-| master.registry-disconnect-strategy.strategy                                | stop                         | 当Master与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括： stop, waiting                                                                                  |
-| master.registry-disconnect-strategy.max-waiting-time                        | 100s                         | 当Master与注册中心失联之后重连时间, 之后当strategy为waiting时，该值生效。 该值表示当Master与注册中心失联时会在给定时间之内进行重连, 在给定时间之内重连失败将会停止自己，在重连时，Master会丢弃目前正在执行的工作流，值为0表示会无限期等待 |
-| master.master.worker-group-refresh-interval                                 | 10s                          | 定期将workerGroup从数据库中同步到内存的时间间隔                                                                                                            |
-| master.command-fetch-strategy.type                                          | ID_SLOT_BASED                | Command拉取策略, 目前仅支持 `ID_SLOT_BASED`                                                                                                       |
-| master.command-fetch-strategy.config.id-step                                | 1                            | 数据库中t_ds_command的id自增步长                                                                                                                  |
-| master.command-fetch-strategy.config.fetch-size                             | 10                           | master拉取command数量                                                                                                                        |
+|                                     参数                                      |             默认值              |                                           描述                                            |
+|-----------------------------------------------------------------------------|------------------------------|-----------------------------------------------------------------------------------------|
+| master.listen-port                                                          | 5678                         | master监听端口                                                                              |
+| master.pre-exec-threads                                                     | 10                           | master准备执行任务的数量，用于限制并行的command                                                          |
+| master.exec-threads                                                         | 100                          | master工作线程数量,用于限制并行的流程实例数量                                                              |
+| master.dispatch-task-number                                                 | 3                            | master每个批次的派发任务数量                                                                       |
+| master.worker-load-balancer-configuration-properties.type                   | DYNAMIC_WEIGHTED_ROUND_ROBIN | Master 将会使用Worker的动态CPU/Memory/线程池使用率来计算Worker的负载，负载越低的worker将会有更高的机会被分发任务              |
+| master.max-heartbeat-interval                                               | 10s                          | master最大心跳间隔                                                                            |
+| master.task-commit-retry-times                                              | 5                            | 任务重试次数                                                                                  |
+| master.task-commit-interval                                                 | 1000                         | 任务提交间隔,单位为毫秒                                                                            |
+| master.state-wheel-interval                                                 | 5                            | 轮询检查状态时间                                                                                |
+| master.server-load-protection.enabled                                       | true                         | 是否开启系统保护策略                                                                              |
+| master.server-load-protection.max-system-cpu-usage-percentage-thresholds    | 0.7                          | master最大系统cpu使用值,只有当前系统cpu使用值低于最大系统cpu使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统CPU       |
+| master.server-load-protection.max-jvm-cpu-usage-percentage-thresholds       | 0.7                          | master最大JVM cpu使用值,只有当前JVM cpu使用值低于最大JVM cpu使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的JVM CPU |
+| master.server-load-protection.max-system-memory-usage-percentage-thresholds | 0.7                          | master最大系统 内存使用值,只有当前系统内存使用值低于最大系统内存使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统内存          |
+| master.server-load-protection.max-disk-usage-percentage-thresholds          | 0.7                          | master最大系统磁盘使用值,只有当前系统磁盘使用值低于最大系统磁盘使用值,master服务才能调度任务. 默认值为0.7: 会使用70%的操作系统磁盘空间         |
+| master.failover-interval                                                    | 10                           | failover间隔，单位为分钟                                                                        |
+| master.kill-application-when-task-failover                                  | true                         | 当任务实例failover时，是否kill掉yarn或k8s application                                              |
+| master.worker-group-refresh-interval                                        | 10s                          | 定期将workerGroup从数据库中同步到内存的时间间隔                                                           |
+| master.command-fetch-strategy.type                                          | ID_SLOT_BASED                | Command拉取策略, 目前仅支持 `ID_SLOT_BASED`                                                      |
+| master.command-fetch-strategy.config.id-step                                | 1                            | 数据库中t_ds_command的id自增步长                                                                 |
+| master.command-fetch-strategy.config.fetch-size                             | 10                           | master拉取command数量                                                                       |
 
 ## Worker Server相关配置
 
@@ -319,7 +317,6 @@ common.properties配置文件目前主要是配置hadoop/s3/yarn/applicationId
 | worker.server-load-protection.max-disk-usage-percentage-thresholds          | 0.7       | worker最大系统磁盘使用值,只有当前系统磁盘使用值低于最大系统磁盘使用值,worker服务才能接收任务. 默认值为0.7: 会使用70%的操作系统磁盘空间                                                           |
 | worker.alert-listen-host                                                    | localhost | alert监听host                                                                                                                               |
 | worker.alert-listen-port                                                    | 50052     | alert监听端口                                                                                                                                 |
-| worker.registry-disconnect-strategy.strategy                                | stop      | 当Worker与注册中心失联之后采取的策略, 默认值是: stop. 可选值包括： stop, waiting                                                                                   |
 | worker.registry-disconnect-strategy.max-waiting-time                        | 100s      | 当Worker与注册中心失联之后重连时间, 之后当strategy为waiting时，该值生效。 该值表示当Worker与注册中心失联时会在给定时间之内进行重连, 在给定时间之内重连失败将会停止自己，在重连时，Worker会丢弃kill正在执行的任务。值为0表示会无限期等待 |
 | worker.task-execute-threads-full-policy                                     | REJECT    | 如果是 REJECT, 当Worker中等待队列中的任务数达到exec-threads时, Worker将会拒绝接下来新接收的任务，Master将会重新分发该任务; 如果是 CONTINUE, Worker将会接收任务，放入等待队列中等待空闲线程去执行该任务         |
 | worker.tenant-config.auto-create-tenant-enabled                             | true      | 租户对应于系统的用户,由worker提交作业.如果系统没有该用户,则在参数worker.tenant.auto.create为true后自动创建。                                                                 |

@@ -31,4 +31,5 @@
 * 统一代码中的 `process` 为 `workflow` ([#16515](https://github.com/apache/dolphinscheduler/pull/16515))
 * 废弃从 1.x 至 2.x 的升级代码  ([#16543](https://github.com/apache/dolphinscheduler/pull/16543))
 * 移除 `数据质量` 模块  ([#16794](https://github.com/apache/dolphinscheduler/pull/16794))
+* 在`application.yaml`中移除`registry-disconnect-strategy`配置 ([#16821](https://github.com/apache/dolphinscheduler/pull/16821))
 
@@ -20,12 +20,17 @@
 import org.apache.dolphinscheduler.common.enums.ServerStatus;
 
 import lombok.Data;
+import lombok.ToString;
 import lombok.experimental.SuperBuilder;
 
 @Data
+@ToString
 @SuperBuilder
 public abstract class BaseServerMetadata implements IClusters.IServerMetadata {
 
+    // The server startup time in milliseconds.
+    private final long serverStartupTime;
+
     private final String address;
 
     private final double cpuUsage;

@@ -46,12 +46,16 @@ public void start() {
         log.info("ClusterStateMonitors started...");
     }
 
-    void masterRemoved(MasterServerMetadata masterServer) {
-        systemEventBus.publish(MasterFailoverEvent.of(masterServer.getAddress(), new Date()));
+    void masterRemoved(final MasterServerMetadata masterServer) {
+        // Publish the master failover event with a 30-second delay (the 30_000 argument, presumably milliseconds).
+        // If the master reconnects to the registry within those 30 seconds, failover for it is skipped.
+        systemEventBus.publish(MasterFailoverEvent.of(masterServer, new Date(), 30_000));
+    }
 
-    void workerRemoved(WorkerServerMetadata workerServer) {
-        systemEventBus.publish(WorkerFailoverEvent.of(workerServer.getAddress(), new Date()));
+    void workerRemoved(final WorkerServerMetadata workerServer) {
+        // Publish the worker failover event with a 30-second delay (the 30_000 argument, presumably milliseconds).
+        // If the worker reconnects to the registry within those 30 seconds, failover for it is skipped.
+        systemEventBus.publish(WorkerFailoverEvent.of(workerServer, new Date(), 30_000));
+    }
 
 }