intelligent-machine-learning
diff --git a/dlrover/python/common/comm.py b/dlrover/python/common/comm.py
Lines changed: 1 addition & 1 deletion
diff --git a/dlrover/python/common/constants.py b/dlrover/python/common/constants.py
Lines changed: 3 additions & 0 deletions
diff --git a/dlrover/python/common/enums.py b/dlrover/python/common/enums.py
Lines changed: 8 additions & 1 deletion
diff --git a/dlrover/python/common/failover.py b/dlrover/python/common/failover.py
Lines changed: 56 additions & 0 deletions
diff --git a/dlrover/python/common/global_context.py b/dlrover/python/common/global_context.py
Lines changed: 2 additions & 0 deletions
diff --git a/dlrover/python/diagnosis/common/constants.py b/dlrover/python/diagnosis/common/constants.py
Lines changed: 5 additions & 0 deletions
diff --git a/dlrover/python/diagnosis/common/diagnosis_action.py b/dlrover/python/diagnosis/common/diagnosis_action.py
Lines changed: 31 additions & 2 deletions
diff --git a/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py b/dlrover/python/elastic_agent/diagnosis/diagnosis_agent.py
Lines changed: 81 additions & 1 deletion
diff --git a/dlrover/python/elastic_agent/master_client.py b/dlrover/python/elastic_agent/master_client.py
Lines changed: 7 additions & 0 deletions
diff --git a/dlrover/python/elastic_agent/torch/dynamic_failover.py b/dlrover/python/elastic_agent/torch/dynamic_failover.py
Lines changed: 53 additions & 0 deletions
@@ -1,4 +1,4 @@
-# Copyright 2020 The DLRover Authors. All rights reserved.
+# Copyright 2026 The DLRover Authors. All rights reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
 
@@ -352,6 +352,9 @@ class NodeEnv(object):
     # grpc env
     MASTER_CLIENT_TIMEOUT = "MASTER_CLIENT_TIMEOUT"
 
+    # extension env
+    DLROVER_EXTENSION_DYNAMIC_FAILOVER = "DLROVER_EXTENSION_DYNAMIC_FAILOVER"
+
 
 class DatasetType(object):
     TEXT = "text"
 
@@ -1,4 +1,4 @@
-# Copyright 2025 The EasyDL Authors. All rights reserved.
+# Copyright 2026 The DLRover Authors. All rights reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -17,3 +17,10 @@
 class ResourceType(Enum):
     CPU = "CPU"
     GPU = "GPU"
+
+
+class FailoverStrategy(Enum):
+    """Failover strategies that a dynamic failover extension can return."""
+
+    # Continue with DLRover's default failover logic.
+    NORMAL_FAILOVER = "NORMAL_FAILOVER"
+    # The diagnosis agent answers this with a RELAUNCH_WORKER node action.
+    NODE_FAILOVER = "NODE_FAILOVER"
+    # The diagnosis agent answers this with a JobRestartAction.
+    GLOBAL_FAILOVER = "GLOBAL_FAILOVER"
+    # The diagnosis agent answers this with a JobAbortionAction.
+    ABORTION_FAILOVER = "ABORTION_FAILOVER"
@@ -0,0 +1,56 @@
+#  Copyright 2026 The DLRover Authors. All rights reserved.
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+from abc import ABC, abstractmethod
+from dataclasses import dataclass, field
+from typing import Any
+
+from dlrover.python.common.enums import FailoverStrategy
+
+
+USER_FAILOVER_TRIGGER_JOB_ABORTION = "USER_FAILOVER_TRIGGER_JOB_ABORTION"
+USER_FAILOVER_TRIGGER_JOB_RESTART = "USER_FAILOVER_TRIGGER_JOB_RESTART"
+
+
+@dataclass
+class FailureInfo(object):
+    """
+    Basic context of a failure, handed to a failover extension.
+
+    Attributes:
+        timestamp: Creation time of this record, in whole seconds since the
+            epoch.
+        log_content: Failure details as text (the diagnosis agent stores the
+            serialized process failures here).
+        extra_info: Additional context; an empty dict by default.
+    """
+
+    # Use a factory so the clock is read when each instance is created. A
+    # plain `int(time.time())` default is evaluated only once, when the class
+    # is defined, and would give every instance the same import-time value.
+    timestamp: int = field(default_factory=lambda: int(time.time()))
+    log_content: str = ""
+    extra_info: dict = field(default_factory=dict)
+
+
+class DynamicFailoverExtension(ABC):
+    """
+    Dynamic extension for fault-tolerance execution.
+
+    Subclasses implement ``get_user_failover_strategy`` to tell DLRover
+    which failover strategy to apply for a given failure.
+    """
+
+    @abstractmethod
+    def get_user_failover_strategy(
+        self, failure_info: Any
+    ) -> FailoverStrategy:
+        """
+        The user-side implementation to specify a failover-strategy to
+        DLRover according to the failure info of a process.
+
+        This method is abstract, so a concrete subclass must override it.
+        The body below is only reachable through ``super()``; it returns
+        FailoverStrategy.NORMAL_FAILOVER, which employs DLRover's internal
+        logic.
+
+        This implementation can be based on simple rule definitions using
+        error codes or complex logic calls involving external services or
+        model inference.
+
+        Args:
+            failure_info (Any): The basic context when failure happens.
+
+        Returns:
+            FailoverStrategy: The failover strategy.
+        """
+
+        return FailoverStrategy.NORMAL_FAILOVER
@@ -148,6 +148,8 @@ def __init__(self):
         self.max_relaunch_count = DefaultValues.MAX_RELAUNCH_COUNT
         self.max_group_relaunch_count = DefaultValues.MAX_GROUP_RELAUNCH_COUNT
         self.training_elastic_mode = DefaultValues.TRAINING_ELASTIC_MODE
+        # extensions
+        self.dynamic_failover_extension = None
 
     def set_params_from_brain(self):
         self.train_speed_record_num = self.get_param_value_from_brain(
 
@@ -75,13 +75,18 @@ class DiagnosisActionType(object):
 
     # master operation
     JOB_ABORT = "job_abortion"
+    JOB_RESTART = "job_restart"
     MASTER_RELAUNCH_WORKER = "master_relaunch_worker"
     EVENT = "event"
 
     # node operation
     RESTART_WORKER = "restart_worker"
     RELAUNCH_WORKER = "relaunch_worker"
 
+    # job operation
+    RESTART_JOB = "restart_job"
+    ABORT_JOB = "abort_job"
+
 
 class DiagnosisResult(object):
     # diag invalid param
 
@@ -249,15 +249,16 @@ def __repr__(self):
         )
 
 
-class JobAbortionAction(DiagnosisAction):
+class JobAction(DiagnosisAction):
     def __init__(
         self,
+        action_type: str,
         reason: str = "",
         msg: str = "",
         **kwargs,
     ):
         super().__init__(
-            DiagnosisActionType.JOB_ABORT,
+            action_type,
             DiagnosisConstant.MASTER_INSTANCE,
             0,
             0,
@@ -284,6 +285,34 @@ def __repr__(self):
         )
 
 
+class JobAbortionAction(JobAction):
+    """Job-level action whose type is ``DiagnosisActionType.JOB_ABORT``."""
+
+    def __init__(
+        self,
+        reason: str = "",
+        msg: str = "",
+        **kwargs,
+    ):
+        """
+        Args:
+            reason (str): Reason for aborting the job.
+            msg (str): Additional message attached to the action.
+            **kwargs: Extra keyword arguments, forwarded to ``JobAction``.
+        """
+        # Forward **kwargs instead of silently dropping them: before the
+        # JobAction base class was split out, this class's own __init__
+        # received them directly.
+        super().__init__(
+            action_type=DiagnosisActionType.JOB_ABORT,
+            reason=reason,
+            msg=msg,
+            **kwargs,
+        )
+
+
+class JobRestartAction(JobAction):
+    """Job-level action whose type is ``DiagnosisActionType.JOB_RESTART``."""
+
+    def __init__(
+        self,
+        reason: str = "",
+        msg: str = "",
+        **kwargs,
+    ):
+        """
+        Args:
+            reason (str): Reason for restarting the job.
+            msg (str): Additional message attached to the action.
+            **kwargs: Extra keyword arguments, forwarded to ``JobAction``.
+        """
+        # Forward **kwargs so that options accepted by JobAction.__init__
+        # are not silently discarded by this subclass.
+        super().__init__(
+            action_type=DiagnosisActionType.JOB_RESTART,
+            reason=reason,
+            msg=msg,
+            **kwargs,
+        )
+
+
 def is_same_action(action1: DiagnosisAction, action2: DiagnosisAction) -> bool:
     if isinstance(action1, EventAction) and isinstance(action2, EventAction):
         action1.__class__ = EventAction
 
@@ -1,4 +1,4 @@
-# Copyright 2024 The DLRover Authors. All rights reserved.
+# Copyright 2026 The DLRover Authors. All rights reserved.
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at
@@ -18,7 +18,12 @@
 
 from dlrover.python.common import env_utils
 from dlrover.python.common.constants import TrainingExceptionLevel
+from dlrover.python.common.enums import FailoverStrategy
 from dlrover.python.common.error import ProcessError
+from dlrover.python.common.failover import (
+    USER_FAILOVER_TRIGGER_JOB_ABORTION,
+    USER_FAILOVER_TRIGGER_JOB_RESTART,
+)
 from dlrover.python.common.log import default_logger as logger
 from dlrover.python.common.singleton import Singleton
 from dlrover.python.diagnosis.common.constants import (
@@ -30,6 +35,8 @@
 from dlrover.python.diagnosis.common.diagnosis_action import (
     DiagnosisAction,
     NodeAction,
+    JobAbortionAction,
+    JobRestartAction,
 )
 from dlrover.python.diagnosis.common.diagnosis_data import WorkerTrainingMetric
 from dlrover.python.diagnosis.common.diagnosis_manager import DiagnosisManager
@@ -50,6 +57,10 @@
 )
 from dlrover.python.elastic_agent.context import get_agent_context
 from dlrover.python.elastic_agent.master_client import MasterClient
+from dlrover.python.elastic_agent.torch.dynamic_failover import (
+    DynamicAgentFailoverExtension,
+    AgentFailureInfo,
+)
 from dlrover.python.training_event.config import is_dlrover_event_enabled
 
 
@@ -60,12 +71,16 @@ def __init__(
         errors="",
         node_rank=-1,
         local_world_size=0,
+        dynamic_failover_extension=None,
     ):
         self._client = MasterClient.singleton_instance()
         self._training_log_file = training_log_file
         self._errors = errors
         self._stopped = False
         self._agent_context = get_agent_context()
+        self._extension: DynamicAgentFailoverExtension = (
+            dynamic_failover_extension
+        )
 
         DiagnosisManager.__init__(self, self._agent_context)
 
@@ -140,6 +155,71 @@ def diagnose_training_failure(self) -> DiagnosisAction:
             self._agent_context.run_result.failures,
             self._agent_context.restart_count,
         )
+
+        def serialize_failures(failures: dict):
+            """
+            Return `failures` as JSON text, or `str(failures)` when
+            `json.dumps` raises.
+            """
+            try:
+                str_result = json.dumps(failures)
+            except Exception:
+                # Best effort: any serialization error falls back to str().
+                str_result = str(failures)
+            return str_result
+
+        failure_info = AgentFailureInfo(
+            node_rank=self._node_rank,
+            log_content=serialize_failures(
+                self._agent_context.run_result.failures
+            ),
+        )
+
+        if self._extension is not None:
+            extension_cls_info = self._extension.__class__
+
+            try:
+                # user strategy
+                user_strategy = self._extension.get_user_failover_strategy(
+                    failure_info
+                )
+            except Exception as e:
+                logger.warning(
+                    f"Failed to get user_strategy from extension: {extension_cls_info} "
+                    f"by exception: {e}. Use default dlrover failover processing."
+                )
+                user_strategy = FailoverStrategy.NORMAL_FAILOVER
+
+            if user_strategy == FailoverStrategy.NODE_FAILOVER:
+                logger.info(
+                    f"[{self._agent_context.worker_spec.role}] Worker group "
+                    f"{self._agent_context.run_result.state.name}, "
+                    f"will relaunch node by user strategy: {extension_cls_info}."
+                )
+                return NodeAction(
+                    node_id=env_utils.get_node_id(),
+                    node_type=env_utils.get_node_type(),
+                    instance=DiagnosisConstant.LOCAL_INSTANCE,
+                    action_type=DiagnosisActionType.RELAUNCH_WORKER,
+                )
+            elif user_strategy == FailoverStrategy.ABORTION_FAILOVER:
+                logger.info(
+                    f"[{self._agent_context.worker_spec.role}] Worker group "
+                    f"{self._agent_context.run_result.state.name}, "
+                    f"will abort job by user strategy: {extension_cls_info}."
+                )
+                return JobAbortionAction(
+                    reason=USER_FAILOVER_TRIGGER_JOB_ABORTION
+                )
+            elif user_strategy == FailoverStrategy.GLOBAL_FAILOVER:
+                logger.info(
+                    f"[{self._agent_context.worker_spec.role}] Worker group "
+                    f"{self._agent_context.run_result.state.name}, "
+                    f"will relaunch job by user strategy: {extension_cls_info}."
+                )
+                return JobRestartAction(
+                    reason=USER_FAILOVER_TRIGGER_JOB_RESTART
+                )
+            else:
+                # FailoverStrategy.NORMAL_FAILOVER: continue with dlrover default logic
+                pass
+
+        # dlrover default logic
         ob = self.observe(
             DiagnosticianType.NODE_FAILURE,
             log_file=self._training_log_file,
 
@@ -519,6 +519,13 @@ def set_rdzv_blocked(self, blocked, reason=""):
         message = comm.RdzvBlocked(blocked=blocked, reason=reason)
         self._report(message)
 
+    def report_action(self, action: DiagnosisAction):
+        """
+        Report a diagnosis action.
+
+        Wraps the action's class name and its JSON form in a
+        ``comm.DiagnosisAction`` message and passes it to ``self._report``.
+
+        Args:
+            action (DiagnosisAction): The action to report.
+        """
+        action_cls_name = action.__class__.__name__
+        self._report(
+            comm.DiagnosisAction(
+                action_cls=action_cls_name,
+                action_content=action.to_json(),
+            )
+        )
+
     @classmethod
     def singleton_instance(cls, *args, **kwargs):
         if not cls._instance:
 
@@ -0,0 +1,53 @@
+#  Copyright 2026 The DLRover Authors. All rights reserved.
+#  Licensed under the Apache License, Version 2.0 (the "License");
+#  you may not use this file except in compliance with the License.
+#  You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from abc import abstractmethod
+from dataclasses import dataclass
+
+from dlrover.python.common.enums import FailoverStrategy
+from dlrover.python.common.failover import (
+    DynamicFailoverExtension,
+    FailureInfo,
+)
+
+
+@dataclass
+class AgentFailureInfo(FailureInfo):
+    """Failure context built by the elastic agent; extends FailureInfo."""
+
+    # Node rank of the agent that built this record (the diagnosis agent
+    # passes its own node rank); defaults to -1.
+    node_rank: int = -1
+
+
+class DynamicAgentFailoverExtension(DynamicFailoverExtension):
+    """
+    Dynamic extension for agent(elastic agent) fault-tolerance execution.
+    """
+
+    @abstractmethod
+    def get_user_failover_strategy(
+        self, failure_info: AgentFailureInfo
+    ) -> FailoverStrategy:
+        """
+        The user-side implementation to specify a failover-strategy to
+        DLRover according to the failure info of a process.
+
+        This method is abstract, so a concrete subclass must override it.
+        The body below is only reachable through ``super()``; it returns
+        FailoverStrategy.NORMAL_FAILOVER, which employs DLRover's internal
+        logic.
+
+        This implementation can be based on simple rule definitions using
+        error codes or complex logic calls involving external services or
+        model inference.
+
+        Args:
+            failure_info (AgentFailureInfo): The basic failure context of
+                agent when failure happens.
+
+        Returns:
+            FailoverStrategy: The failover strategy.
+        """
+
+        return FailoverStrategy.NORMAL_FAILOVER
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# Copyright 2020 The DLRover Authors. All rights reserved.`
	`1`	`+# Copyright 2026 The DLRover Authors. All rights reserved.`
`2`	`2`	`# Licensed under the Apache License, Version 2.0 (the "License");`
`3`	`3`	`# you may not use this file except in compliance with the License.`
`4`	`4`	`# You may obtain a copy of the License at`