Commit e3636c7
[0.9.1] Enable external distributed dp deployments in vllm-ascend (0.9.1 only) (#2109)
### What this PR does / why we need it?

vLLM's data-parallel (dp) deployment follows the classic master-slave pattern in both scale-out and scale-up scenarios. This design places more pressure on the master node than on the other nodes, which can lead to unbalanced host overhead across worker processes and may hurt performance. Beyond the master-slave structure, the dp load balancing (dplb) is implemented by many independent processes, and their isolated memory spaces make it hard to balance the overall engine workload. In this PR we break the master-slave chain and split each dp instance into its own vLLM engine instance, each with its own private IP and port. This evens out host pressure across worker processes and brings a non-trivial performance boost over the previous implementation. In addition, load balancing can be handled by a proxy instead of independent processes inside the engine, which gives users more flexibility.

### Does this PR introduce _any_ user-facing change?

Yes. This implementation uses a launch script distinct from vLLM's original one; the usage tutorial and example scripts are placed in the `external_online_dp` folder.

### How was this patch tested?

---------

Signed-off-by: ganyi <[email protected]>
1 parent a704967 commit e3636c7

File tree

9 files changed, +332 -2 lines changed

examples/external_online_dp/README.md

Whitespace-only changes.
Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
import multiprocessing
import os
import sys

dp_size = 32
dp_size_local = 16
dp_rank_start = 0
dp_ip = "your_dp_ip_here"
dp_port = "your_dp_port_here"
engine_port = 9000
template_path = "./run_dp_template.sh"
if not os.path.exists(template_path):
    print(f"Template file {template_path} does not exist.")
    sys.exit(1)


def run_command(dp_rank_local, dp_rank, engine_port_):
    command = f"bash ./run_dp_template.sh {dp_size} {dp_ip} {dp_port} {dp_rank_local} {dp_rank} {engine_port_} {dp_size_local}"
    os.system(command)


processes = []
for i in range(dp_size_local):
    dp_rank = dp_rank_start + i
    dp_rank_local = i
    engine_port_ = engine_port + i
    process = multiprocessing.Process(target=run_command,
                                      args=(dp_rank_local, dp_rank,
                                            engine_port_))
    processes.append(process)
    process.start()

for process in processes:
    process.join()
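The example values above (`dp_size = 32`, `dp_size_local = 16`, `engine_port = 9000`) correspond to a two-node deployment in which this launcher is run once per node: each node spawns 16 independent `vllm serve` engines on ports 9000-9015, and only `dp_rank_start` changes from node to node. The sketch below is an illustration of that layout, not part of the PR; the assumption that both nodes share the same DP master IP/port is inferred from the template script.

```python
# Illustration only: per-node launcher settings for the example 32-rank, two-node layout.
# Each node runs 16 engines; dp_rank_start offsets the global dp rank of the first engine.
NODE_SETTINGS = {
    "node0": {"dp_rank_start": 0,  "dp_size_local": 16, "engine_port": 9000},
    "node1": {"dp_rank_start": 16, "dp_size_local": 16, "engine_port": 9000},
}

for node, cfg in NODE_SETTINGS.items():
    first, count = cfg["dp_rank_start"], cfg["dp_size_local"]
    ports = [cfg["engine_port"] + i for i in range(count)]
    print(f"{node}: global dp ranks {first}-{first + count - 1}, ports {ports[0]}-{ports[-1]}")
```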
Lines changed: 51 additions & 0 deletions
@@ -0,0 +1,51 @@
export HCCL_IF_IP=your_ip_here
export GLOO_SOCKET_IFNAME="enp48s3u1u1"
export TP_SOCKET_IFNAME="enp48s3u1u1"
export HCCL_SOCKET_IFNAME="enp48s3u1u1"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=your_rank_table_path_here
export VLLM_LOGGING_LEVEL="info"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export PYTORCH_NPU_ALLOC_CONF=expandable_segments:True
# Positional arguments (as passed by the launcher script above):
# $1=dp_size, $2=dp_master_ip, $3=dp_master_port, $4=dp_rank_local, $5=dp_rank, $6=engine_port, $7=dp_size_local
export VLLM_DP_SIZE=$1
export VLLM_DP_MASTER_IP=$2
export VLLM_DP_MASTER_PORT=$3
export VLLM_DP_RANK_LOCAL=$4
export VLLM_DP_RANK=$5
export VLLM_DP_SIZE_LOCAL=$7
export HCCL_DETERMINISTIC=True
export HCCL_BUFFER_SIZE=1024
export TASK_QUEUE_ENABLE=1
# Spawning worker processes inside vLLM may cause a circular import issue, so fork is required here.
export VLLM_WORKER_MULTIPROC_METHOD="fork"

export VLLM_USE_V1=1

export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15

vllm serve model_path \
    --host 0.0.0.0 \
    --port $6 \
    --tensor-parallel-size 2 \
    --enable-expert-parallel \
    --seed 1024 \
    --served-model-name dsv3 \
    --max-model-len 5200 \
    --max-num-batched-tokens 256 \
    --max-num-seqs 28 \
    --trust-remote-code \
    --gpu-memory-utilization 0.9 \
    --quantization ascend \
    --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}' \
    --kv-transfer-config \
    '{"kv_connector": "LLMDataDistCMgrConnector",
    "kv_buffer_device": "npu",
    "kv_role": "kv_consumer",
    "kv_parallel_size": "1",
    "kv_port": "20001",
    "engine_id": "0",
    "kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
    }' \
    --additional-config \
    '{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "enable_multistream_moe":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true}'

vllm_ascend/distributed/llmdatadist_c_mgr_connector.py

Lines changed: 2 additions & 2 deletions
@@ -308,9 +308,9 @@ def __init__(self, vllm_config: VllmConfig):
  logger.info("Initialize the LLMDataDistCMgrConnectorWorker")
  # we assume the local node only contains dp and tp, and tp will not communicate inter-node.
  # for any scenario beyond this scope, the functionality of this connector is not guaranteed.
+ dp_size_local = vllm_config.parallel_config.data_parallel_size_local if not envs.VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED else envs.VLLM_DP_SIZE_LOCAL
  self.local_rank_on_node = get_world_group().rank % (
-     vllm_config.parallel_config.data_parallel_size_local *
-     vllm_config.parallel_config.tensor_parallel_size)
+     dp_size_local * vllm_config.parallel_config.tensor_parallel_size)
  self.local_rank = get_world_group().local_rank
  self.local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local
  self.tp_size = vllm_config.parallel_config.tensor_parallel_size

vllm_ascend/envs.py

Lines changed: 6 additions & 0 deletions
@@ -163,6 +163,12 @@
  "VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE":
  lambda: int(
      os.getenv("VLLM_ASCEND_KV_CACHE_MEGABYTES_FLOATING_TOLERANCE", '0')),
+ # VLLM_DP_SIZE_LOCAL: used by external data parallelism in vllm-ascend to specify the local data-parallel size of the current node, 0.9.1 specific.
+ "VLLM_DP_SIZE_LOCAL":
+ lambda: int(os.getenv("VLLM_DP_SIZE_LOCAL", '0')),
+ # VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED: enables external distributed data parallelism in vllm-ascend, 0.9.1 specific.
+ "VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED":
+ lambda: bool(int(os.getenv("VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED", '0'))),
  }

  # end-env-vars-definition
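For context, a minimal sketch of how these two toggles are consumed, assuming a vllm-ascend 0.9.1 install with this change and that the flags are exported in the launch environment (the patches below only take effect when `VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED` is set to a non-zero value):

```python
import os

# Hypothetical launch environment for one node of the example deployment.
os.environ["VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED"] = "1"  # turn on the external-DP patches
os.environ["VLLM_DP_SIZE_LOCAL"] = "16"                 # dp ranks hosted on this node

import vllm_ascend.envs as ascend_envs

# Both values are parsed from the environment by the lambdas added above.
assert ascend_envs.VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED
assert ascend_envs.VLLM_DP_SIZE_LOCAL == 16
```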

vllm_ascend/patch/platform/patch_0_9_1/__init__.py

Lines changed: 4 additions & 0 deletions
@@ -19,3 +19,7 @@
  # patch files.
  import vllm_ascend.patch.worker.patch_common.patch_utils  # noqa isort:skip
  import vllm_ascend.patch.platform.patch_0_9_1.patch_cache_manager  # noqa
+ import vllm_ascend.patch.platform.patch_0_9_1.patch_configs  # noqa
+ import vllm_ascend.patch.platform.patch_0_9_1.patch_core  # noqa
+ import vllm_ascend.patch.platform.patch_0_9_1.patch_core_client  # noqa
+ import vllm_ascend.patch.platform.patch_0_9_1.patch_decorator  # noqa
Lines changed: 77 additions & 0 deletions
@@ -0,0 +1,77 @@
import vllm.envs as envs
from vllm.config import DistributedExecutorBackend, ParallelConfig
from vllm.logger import init_logger

import vllm_ascend.envs as vllm_ascend_envs

logger = init_logger(__name__)


def __post_init__(self: ParallelConfig) -> None:
    self.world_size = self.pipeline_parallel_size * \
        self.tensor_parallel_size

    if self.data_parallel_size_local > self.data_parallel_size:
        raise ValueError(
            f"data_parallel_size_local ({self.data_parallel_size_local}) "
            f"must be <= data_parallel_size ({self.data_parallel_size})")

    self.data_parallel_size = envs.VLLM_DP_SIZE
    self.data_parallel_rank = envs.VLLM_DP_RANK
    self.data_parallel_rank_local = envs.VLLM_DP_RANK_LOCAL
    self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP
    self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT

    if self.distributed_executor_backend == "external_launcher":
        import os
        os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
        logger.info("Disabling V1 multiprocessing for external launcher.")

    ray_only_devices: list[str] = []
    from vllm.platforms import current_platform
    if (current_platform.device_type in ray_only_devices
            and self.world_size > 1):
        if self.distributed_executor_backend is None:
            self.distributed_executor_backend = "ray"
        if self.distributed_executor_backend != "ray":
            raise ValueError(
                f"{current_platform.device_type.upper()} backend only "
                "supports Ray for distributed inference.")

    if self.distributed_executor_backend is None and self.world_size > 1:
        # We use multiprocessing by default if world_size fits on the
        # current node and we aren't in a ray placement group.

        from vllm.executor import ray_utils
        backend: DistributedExecutorBackend = "mp"
        ray_found = ray_utils.ray_is_available()
        if current_platform.is_neuron():
            # neuron uses single process to control multiple devices
            backend = "uni"
        elif current_platform.is_tpu() and envs.VLLM_XLA_USE_SPMD:
            backend = "uni"
        elif self.data_parallel_backend == "ray":
            logger.info("Using ray distributed inference because "
                        "data_parallel_backend is ray")
            backend = "ray"
        elif ray_found:
            if self.placement_group:
                backend = "ray"
            else:
                from ray import is_initialized as ray_is_initialized
                if ray_is_initialized():
                    from ray.util import get_current_placement_group
                    if get_current_placement_group():
                        backend = "ray"
        self.distributed_executor_backend = backend
        logger.info("Defaulting to use %s for distributed inference", backend)

    if self.distributed_executor_backend is None and self.world_size == 1:
        self.distributed_executor_backend = "uni"

    self._verify_args()


# apply this patch only if the external data parallelism is enabled
if vllm_ascend_envs.VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED:
    ParallelConfig.__post_init__ = __post_init__  # type: ignore[attr-defined]
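The key behavioral change in this patched `__post_init__` is that every engine process derives its entire DP identity (size, rank, local rank, master address) from its own environment variables rather than having it assigned by a coordinating master process. The following is a self-contained toy sketch of that idea, using hypothetical names rather than vLLM's actual classes:

```python
import os
from dataclasses import dataclass, field


@dataclass
class ToyParallelConfig:
    """Toy stand-in for the patched ParallelConfig: DP identity comes from env vars."""
    tensor_parallel_size: int = 1
    data_parallel_size: int = field(init=False)
    data_parallel_rank: int = field(init=False)
    data_parallel_rank_local: int = field(init=False)
    data_parallel_master_ip: str = field(init=False)
    data_parallel_master_port: int = field(init=False)

    def __post_init__(self) -> None:
        # Mirrors the patch above: each engine reads the VLLM_DP_* variables
        # exported for it by the launch template.
        self.data_parallel_size = int(os.environ["VLLM_DP_SIZE"])
        self.data_parallel_rank = int(os.environ["VLLM_DP_RANK"])
        self.data_parallel_rank_local = int(os.environ["VLLM_DP_RANK_LOCAL"])
        self.data_parallel_master_ip = os.environ["VLLM_DP_MASTER_IP"]
        self.data_parallel_master_port = int(os.environ["VLLM_DP_MASTER_PORT"])


if __name__ == "__main__":
    # Placeholder values the template would export for global dp rank 5 on node 0.
    os.environ.update({
        "VLLM_DP_SIZE": "32",
        "VLLM_DP_RANK": "5",
        "VLLM_DP_RANK_LOCAL": "5",
        "VLLM_DP_MASTER_IP": "10.0.0.1",
        "VLLM_DP_MASTER_PORT": "12345",
    })
    cfg = ToyParallelConfig(tensor_parallel_size=2)
    print(cfg.data_parallel_rank, cfg.data_parallel_master_ip)
```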
Lines changed: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
import os
import signal
from typing import Optional

from vllm.config import ParallelConfig, VllmConfig
from vllm.logger import init_logger
from vllm.transformers_utils.config import \
    maybe_register_config_serialize_by_value
from vllm.v1.engine.core import DPEngineCoreProc, EngineCoreProc

import vllm_ascend.envs as vllm_ascend_envs

logger = init_logger(__name__)


class ExternealDPEngineCoreProc(DPEngineCoreProc):

    def __init__(self, *args, **kwargs):
        # Use the external data parallelism master port from envs
        super().__init__(*args, **kwargs)
        self.engines_running = True

    def _has_global_unfinished_reqs(self, local_unfinished):
        return True

    def _init_data_parallel(self, vllm_config: VllmConfig):

        # Configure GPUs and stateless process group for data parallel.
        dp_rank = vllm_config.parallel_config.data_parallel_rank
        dp_size = vllm_config.parallel_config.data_parallel_size
        local_dp_rank = vllm_config.parallel_config.data_parallel_rank_local

        assert dp_size > 1
        assert 0 <= local_dp_rank <= dp_rank < dp_size

        if vllm_config.kv_transfer_config is not None:
            # modify the engine_id and append the local_dp_rank to it to ensure
            # that the kv_transfer_config is unique for each DP rank.
            vllm_config.kv_transfer_config.engine_id = (
                f"{vllm_config.kv_transfer_config.engine_id}_dp{local_dp_rank}"
            )
            logger.debug("Setting kv_transfer_config.engine_id to %s",
                         vllm_config.kv_transfer_config.engine_id)

        from vllm.platforms import current_platform
        device_control_env_var = current_platform.device_control_env_var
        world_size = vllm_config.parallel_config.world_size
        os.environ[device_control_env_var] = ",".join(
            str(current_platform.device_id_to_physical_device_id(i))
            for i in range(local_dp_rank * world_size, (local_dp_rank + 1) *
                           world_size))

        self.dp_rank = dp_rank

    def run_busy_loop(self):
        """Core busy loop of the EngineCore for data parallel case."""
        # Note: in this customized DPEngineCoreProc no idle time exists; we
        # assume the other dp groups are always running.

        # Loop until process is sent a SIGINT or SIGTERM
        while True:
            # 1) Poll the input queue until there is work to do.
            self._process_input_queue()

            # 2) Step the engine core.
            executed = self._process_engine_step()
            self._maybe_publish_request_counts()

            local_unfinished_reqs = self.scheduler.has_unfinished_requests()
            if not executed:
                if not local_unfinished_reqs and not self.engines_running:
                    # All engines are idle.
                    continue

                # We are in a running state and so must execute a dummy pass
                # if the model didn't execute any ready requests.
                self.execute_dummy_batch()


def run_engine_core(*args, dp_rank: int = 0, local_dp_rank: int = 0, **kwargs):
    """Launch EngineCore busy loop in background process."""

    # Signal handler used for graceful termination.
    # SystemExit exception is only raised once to allow this and worker
    # processes to terminate without error
    shutdown_requested = False

    # Ensure we can serialize transformer config after spawning
    maybe_register_config_serialize_by_value()

    def signal_handler(signum, frame):
        nonlocal shutdown_requested
        if not shutdown_requested:
            shutdown_requested = True
            raise SystemExit()

    # Either SIGTERM or SIGINT will terminate the engine_core
    signal.signal(signal.SIGTERM, signal_handler)
    signal.signal(signal.SIGINT, signal_handler)

    engine_core: Optional[EngineCoreProc] = None
    try:
        parallel_config: ParallelConfig = kwargs["vllm_config"].parallel_config
        if parallel_config.data_parallel_size > 1 or dp_rank > 0:
            # Set data parallel rank for this engine process.
            parallel_config.data_parallel_rank = dp_rank
            parallel_config.data_parallel_rank_local = local_dp_rank
            engine_core = ExternealDPEngineCoreProc(*args, **kwargs)
        else:
            engine_core = EngineCoreProc(*args, **kwargs)

        engine_core.run_busy_loop()

    except SystemExit:
        logger.debug("EngineCore exiting.")
        raise
    except Exception as e:
        if engine_core is None:
            logger.exception("EngineCore failed to start.")
        else:
            logger.exception("EngineCore encountered a fatal error.")
            engine_core._send_engine_dead()
        raise e
    finally:
        if engine_core is not None:
            engine_core.shutdown()


# Apply this patch only if the external data parallelism is enabled
if vllm_ascend_envs.VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED:
    # Patch EngineCoreProc to use the custom run_engine_core
    EngineCoreProc.run_engine_core = run_engine_core  # type: ignore[attr-defined]
Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
from typing import Optional

from vllm.config import VllmConfig
from vllm.v1.engine.core_client import (AsyncMPClient, EngineCoreClient,
                                        MPClient)
from vllm.v1.executor.abstract import Executor

import vllm_ascend.envs as vllm_ascend_envs


def make_async_mp_client(
    vllm_config: VllmConfig,
    executor_class: type[Executor],
    log_stats: bool,
    client_addresses: Optional[dict[str, str]] = None,
    client_index: int = 0,
) -> "MPClient":
    # Use only AsyncMPClient here for the dp scenario and rely on nginx for dp request routing
    return AsyncMPClient(vllm_config, executor_class, log_stats,
                         client_addresses, client_index)


# Apply this patch only if the external data parallelism is enabled
if vllm_ascend_envs.VLLM_ASCEND_EXTERNAL_DP_LB_ENABLED:
    # Patch the EngineCoreClient to use the custom make_async_mp_client
    EngineCoreClient.make_async_mp_client = make_async_mp_client  # type: ignore[attr-defined]
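Because each dp rank is now a fully independent `vllm serve` endpoint, request routing and load balancing happen outside the engine; the comment above points to nginx for that role. The snippet below is only an illustration of the routing idea (the proxy configuration itself is not part of this diff, and the endpoint addresses and ports are placeholders taken from the example launcher):

```python
# Minimal stand-in for the external proxy: round-robin over the 16 local engine ports.
import itertools

ENGINE_ENDPOINTS = [f"http://127.0.0.1:{9000 + i}/v1/completions" for i in range(16)]
_round_robin = itertools.cycle(ENGINE_ENDPOINTS)


def pick_endpoint() -> str:
    """Return the next engine endpoint in round-robin order."""
    return next(_round_robin)


if __name__ == "__main__":
    # A real proxy would forward each incoming request to pick_endpoint();
    # here we only show the rotation.
    for _ in range(4):
        print(pick_endpoint())
```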
