feat: add by_dlrover_run_cmd() shortcut for Ray backend (#1687)

Torino233 · BalaBalaYi · web-flow · commit 6ee822ca0348 · 2026-01-23T13:56:14.000+08:00
* feat: add by_dlrover_run_cmd() shortcut for Ray backend

* feat: add by_dlrover_run_cmd() shortcut for Ray backend

* feat: add by_dlrover_run_cmd() shortcut for Ray backend

* feat: add by_dlrover_run_cmd() shortcut for Ray backend

* feat: add by_dlrover_run_cmd() shortcut for Ray backend

* feat: add by_dlrover_run_cmd() shortcut for Ray backend

* feat: add by_dlrover_run_cmd() shortcut for Ray backend

* feat: add by_dlrover_run_cmd() shortcut for Ray backend

---------

Co-authored-by: Tianyi Chen <chentianyi.cty@antfin.com>
diff --git a/dlrover/python/unified/api/builder/base.py b/dlrover/python/unified/api/builder/base.py
@@ -27,6 +27,10 @@
 
 from pydantic import Field, model_validator
 
+import shlex
+
+from torch.distributed.run import get_args_parser
+
 from dlrover.python.unified.common.config import (
     DLConfig,
     JobConfig,
@@ -329,6 +333,33 @@ def _build_role(self) -> Dict[str, WorkloadDesc]:
         }
 
 
+def parse_run_cmd_argument(launcher, args):
+    if launcher not in ["dlrover-run", "torchrun"]:
+        raise ValueError(
+            f"Only 'dlrover-run' and 'torchrun' command is supported, got '{launcher}'"
+        )
+
+    if launcher == "torchrun":
+        parser = get_args_parser()
+        args = parser.parse_args(args)
+    else:
+        parser = get_args_parser()
+
+        # deprecated arguments
+        parser.add_argument(
+            "--node_check",
+            "--node-check",
+            "--network-check",
+            "--network_check",
+            action="store_true",
+            help="Whether to check node before starting training process.",
+        )
+        parser.allow_abbrev = False
+        args = parser.parse_args(args)
+
+    return args
+
+
 class DLJobBuilder(object):
     def __init__(self):
         # Dummy object to hold parameters, use default if not assigned.
@@ -555,3 +586,46 @@ def with_collocation_all(self, *exclude_roles):
             roles.add(role)
         self._collocations.append(roles)
         return self
+
+    def by_dlrover_run_cmd(self, cmd: str):
+        """
+        Configure this builder from a dlrover-run or torchrun command.
+        Args:
+            cmd: The dlrover-run or torchrun command string to build the job from.
+                e.g.
+                "dlrover-run --nnodes=2 --nproc_per_node=2 ./dlrover/python/unified/tests/integration_test/dummy_run.py"
+
+            cmd contains the parameters:
+                --nnodes: number of nodes
+                --nproc_per_node: number of processes per node
+                --node_check: Whether to check node before starting training process.
+                entrypoint: the training script path with args
+        """
+        parts = shlex.split(cmd.strip())
+        launcher = parts[0]  # dlrover-run or torchrun
+        args = parts[1:]
+
+        args = parse_run_cmd_argument(launcher, args)
+
+        if launcher == "dlrover-run" and not args.node_check:
+            self = self.skip_node_check()
+
+        node_num = int(args.nnodes)
+        device_per_node = int(args.nproc_per_node)
+        nnodes = int(args.nnodes)
+        nproc_per_node = int(args.nproc_per_node)
+        training_script = args.training_script
+        for arg in args.training_script_args:
+            training_script += " " + arg
+
+        return (
+            self.node_num(node_num)
+            .device_per_node(device_per_node)
+            .device_type("CPU")
+            .config({"c1": "v1"})
+            .global_env({"eα": "ve", "DLROVER_LOG_LEVEL": "DEBUG"})
+            .train(training_script)
+            .nnodes(nnodes)
+            .nproc_per_node(nproc_per_node)
+            .end()
+        )
diff --git a/dlrover/python/unified/tests/api/test_builder.py b/dlrover/python/unified/tests/api/test_builder.py
@@ -29,6 +29,7 @@
     RLRoleType,
 )
 from dlrover.python.unified.tests.base import BaseTest
+import os
 
 
 class ApiTest(BaseTest):
@@ -118,6 +119,58 @@ def test_basic(self):
 
         self.assertEqual(len(rl_job.workloads), 6)
 
+    def test_by_dlrover_run_cmd(self):
+        root_dir = os.path.dirname(
+            os.path.dirname(
+                os.path.dirname(
+                    os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+                )
+            )
+        )
+        cmd = f"dlrover-run --nnodes=2 --nproc_per_node=2 --node_check {root_dir}/dlrover/python/unified/tests/integration_test/dummy_run.py --test 0"
+
+        dl_job = DLJobBuilder().by_dlrover_run_cmd(cmd).build()
+
+        for workload in dl_job.workloads.values():
+            if workload.backend == "elastic":
+                self.assertEqual(workload.comm_pre_check, True)
+
+        self.assertEqual(dl_job.node_num, 2)
+        self.assertEqual(dl_job.device_per_node, 2)
+        workload = dl_job.workloads["ELASTIC"]
+        self.assertEqual(
+            workload.entry_point,
+            f"{root_dir}/dlrover/python/unified/tests/integration_test/dummy_run.py --test 0",
+        )
+        self.assertEqual(workload.total, 4)  # nnodes * nproc_per_node
+
+        # test unsupported cases
+        with self.assertRaises(ValueError):
+            DLJobBuilder().by_dlrover_run_cmd(
+                "unsupported-run --nnodes=1 train.py"
+            )
+
+    def test_by_torchrun_cmd(self):
+        root_dir = os.path.dirname(
+            os.path.dirname(
+                os.path.dirname(
+                    os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+                )
+            )
+        )
+        cmd = f"torchrun --nnodes=2 --nproc_per_node=2  {root_dir}/dlrover/python/unified/tests/integration_test/dummy_run.py --test 0"
+
+        dl_job = DLJobBuilder().by_dlrover_run_cmd(cmd).build()
+
+        self.assertEqual(dl_job.node_num, 2)
+        self.assertEqual(dl_job.device_per_node, 2)
+        workload = dl_job.workloads["ELASTIC"]
+        self.assertEqual(
+            workload.entry_point,
+            f"{root_dir}/dlrover/python/unified/tests/integration_test/dummy_run.py --test 0",
+        )
+        self.assertEqual(workload.total, 4)  # nnodes * nproc_per_node
+
     def test_extra_flag(self):
         job = (
             DLJobBuilder()
diff --git a/dlrover/python/unified/tests/integration_test/elastic_training_test.py b/dlrover/python/unified/tests/integration_test/elastic_training_test.py
@@ -69,6 +69,40 @@ def test_api_full(tmp_ray):
     assert ret == 0, "Job should succeed"
 
 
+@pytest.mark.timeout(40, func_only=True)
+def test_api_full_by_dlrover_run_cmd(tmp_ray):
+    root_dir = os.path.dirname(
+        os.path.dirname(
+            os.path.dirname(
+                os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+            )
+        )
+    )
+    cmd = f"dlrover-run --nnodes=2 --nproc_per_node=2 {root_dir}/dlrover/python/unified/tests/integration_test/dummy_run.py --test 0"
+
+    dl_job = DLJobBuilder().by_dlrover_run_cmd(cmd).build()
+
+    ret = dl_job.submit("test_cmd_api", master_cpu=1, master_memory=128)
+    assert ret == 0, "Job submitted via by_dlrover_run_cmd should succeed"
+
+
+@pytest.mark.timeout(40, func_only=True)
+def test_api_full_by_torchrun_cmd(tmp_ray):
+    root_dir = os.path.dirname(
+        os.path.dirname(
+            os.path.dirname(
+                os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+            )
+        )
+    )
+    cmd = f"torchrun --nnodes=2 --nproc_per_node=2 {root_dir}/dlrover/python/unified/tests/integration_test/dummy_run.py --test 0"
+
+    dl_job = DLJobBuilder().by_dlrover_run_cmd(cmd).build()
+
+    ret = dl_job.submit("test_cmd_api", master_cpu=1, master_memory=128)
+    assert ret == 0, "Job submitted via by_dlrover_run_cmd should succeed"
+
+
 @pytest.mark.timeout(40, func_only=True)  # 25s in ci
 def test_api_full_with_cmd(tmp_ray):
     root_dir = os.path.dirname(
diff --git a/docs/tutorial/unified/02-unified-api-guide.md b/docs/tutorial/unified/02-unified-api-guide.md
@@ -1,4 +1,4 @@
-# 02. Unified API Guide [Experimental]
+# 02. Unified API Guide [Experimental]
 
 This section focuses on the DLJobBuilder and submission patterns: how to
 construct job configurations programmatically and submit them to the
@@ -36,7 +36,19 @@ job = (
 
 job.submit(job_name="nanogpt")
 ```
+- Single role (via CLI command): you can also initialize a single-role job by directly parsing a `dlrover-run` or
+`torchrun` command string. This automatically configures `nnodes`, `nproc_per_node`,
+and the training entrypoint.
+```python
+from dlrover.python.unified.api.builder import DLJobBuilder
+
+# Conveniently convert a CLI command into a Ray job
+cmd = f"dlrover-run --nnodes=1 --nproc_per_node=1 {Your_dlrover_root_dir}/dlrover/python/unified/tests/integration_test/dummy_run.py --test 0"
 
+job = DLJobBuilder().by_dlrover_run_cmd(cmd).build()
+
+job.submit("test_cmd_api", master_cpu=1, master_memory=128)
+```
 ### Advanced examples
 
 - Multiple roles(outline):
@@ -87,7 +99,8 @@ version.)
 - role(str): defines the role name for multi-role jobs.
 - run(entrypoint): define a non-training workload with entrypoint, and return a sub builder.
 - workload(role, entrypoint): single method combine role + run
-- train(entrypoint): define a training workload with entrypoint (module path + function or command with python file), and return a sub builder.
+- train(entrypoint): define a training workload with entrypoint (module path + function or command with python file), and return a sub builder.
+- by_dlrover_run_cmd(cmd): parse a dlrover-run or torchrun command string to set up a single-role training job.
 
 ### Workload / Role patterns