# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
| 3 | +""" |
| 4 | +experimental support for data-parallel inference with torchrun |
| 5 | +Note the data load balancing and distribution is done out of the vllm engine, |
| 6 | +no internal lb supported in external_launcher mode. |
| 7 | +""" |

from vllm import LLM, SamplingParams

# Create prompts, the same across all ranks
prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
] * 50

# Create sampling parameters, the same across all ranks
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

# Use `distributed_executor_backend="external_launcher"` so that
# this LLM engine/instance creates only one worker per torchrun process.
# It is important to set an explicit seed so that all ranks share the same
# random seed and sampling is deterministic across ranks.
llm = LLM(
    # Any MoE chat model works here; a local path may be used instead.
    model="Qwen/Qwen1.5-MoE-A2.7B-Chat",
    tensor_parallel_size=2,
    data_parallel_size=4,
    pipeline_parallel_size=1,
    enable_expert_parallel=True,
    distributed_executor_backend="external_launcher",
    max_model_len=32768,
    # FIXME: with torch.compile, the torchrun processes do not exit properly
    enforce_eager=True,
    seed=1,
)

dp_rank = llm.llm_engine.vllm_config.parallel_config.data_parallel_rank
dp_size = llm.llm_engine.vllm_config.parallel_config.data_parallel_size

# Each DP rank keeps only its own shard of the prompts (round-robin by index).
prompts = [
    f"{idx}.{prompt}" for idx, prompt in enumerate(prompts) if idx % dp_size == dp_rank
]

outputs = llm.generate(prompts, sampling_params)


# Each DP rank prints the outputs for its own shard of the prompts;
# ranks within the same tensor-parallel group produce identical outputs.
| 51 | +print("-" * 50) |
| 52 | +for output in outputs: |
| 53 | + prompt = output.prompt |
| 54 | + generated_text = output.outputs[0].text |
| 55 | + print(f"Prompt: {prompt!r}\nGenerated text: {generated_text!r}\n") |
| 56 | + print("-" * 50) |
| 57 | + |
| 58 | +""" |
| 59 | +Further tips: |
| 60 | +
|
1. To communicate control messages across all ranks, use the CPU group,
a PyTorch ProcessGroup with the GLOO backend.

```python
import torch.distributed as dist

from vllm.distributed.parallel_state import get_world_group

cpu_group = get_world_group().cpu_group
torch_rank = dist.get_rank(group=cpu_group)
if torch_rank == 0:
    # do something for rank 0, e.g. saving the results to disk.
    ...
```
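
For example, a minimal sketch (assuming the `outputs` variable produced by the
script above) that collects every rank's generated texts over the CPU group;
note that ranks within the same tensor-parallel group contribute duplicates:

```python
import torch.distributed as dist

from vllm.distributed.parallel_state import get_world_group

cpu_group = get_world_group().cpu_group
texts = [o.outputs[0].text for o in outputs]
gathered = [None] * dist.get_world_size(group=cpu_group)
# all_gather_object works on the GLOO-backed CPU group
dist.all_gather_object(gathered, texts, group=cpu_group)
if dist.get_rank(group=cpu_group) == 0:
    # rank 0 now sees every rank's texts and could, e.g., write them to disk
    all_texts = [t for rank_texts in gathered for t in rank_texts]
```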

2. To communicate data across all ranks, use the model's device group,
a PyTorch ProcessGroup with the NCCL backend.
```python
from vllm.distributed.parallel_state import get_world_group

device_group = get_world_group().device_group
```
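
For example, a minimal sketch (again assuming `outputs` from the script above)
that sums the number of generated tokens across all ranks with an NCCL
all-reduce; the tensor has to live on this rank's GPU:

```python
import torch
import torch.distributed as dist

from vllm.distributed.parallel_state import get_world_group

device_group = get_world_group().device_group
# count the tokens generated on this rank and place the counter on the GPU
num_tokens = torch.tensor(
    [sum(len(o.outputs[0].token_ids) for o in outputs)], device="cuda"
)
dist.all_reduce(num_tokens, op=dist.ReduceOp.SUM, group=device_group)
```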

3. To access the model directly on every rank, use the following code:
```python
llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
```
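
For instance, a small sketch that inspects the model object loaded on the
current rank (a regular torch.nn.Module):

```python
model = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
# print the model class plus the dtype and device of its first parameter
first_param = next(model.parameters())
print(type(model).__name__, first_param.dtype, first_param.device)
```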
| 83 | +""" |