Merged
2 changes: 1 addition & 1 deletion benchmarks/kernels/benchmark_w8a8_block_fp8.py
Collaborator

is this still needed?

@@ -11,13 +11,13 @@
from typing import Any

import torch
import triton
from tqdm import tqdm

from vllm.model_executor.layers.quantization.utils.fp8_utils import (
_w8a8_block_fp8_matmul,
)
from vllm.platforms import current_platform
from vllm.triton_utils import triton
from vllm.utils import FlexibleArgumentParser

mp.set_start_method("spawn", force=True)
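Note on the change above: switching from a bare `import triton` to `from vllm.triton_utils import triton` routes the import through vLLM's wrapper module, which (as an assumption about its purpose) lets the benchmark be imported on platforms where Triton is not installed. A minimal sketch of that guarded-import pattern, with illustrative names rather than vLLM's actual implementation:

from importlib.util import find_spec

# Illustrative stand-in for a triton_utils-style wrapper (names are hypothetical).
HAS_TRITON = find_spec("triton") is not None

if HAS_TRITON:
    import triton
else:
    triton = None  # callers should check HAS_TRITON before touching triton APIs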
9 changes: 1 addition & 8 deletions examples/others/tensorize_vllm_model.py
Collaborator

is this still needed?

Contributor Author

Oh, it seems like this merged PR https://github.com/vllm-project/vllm/pull/23717/files got included in mine somehow. Let me try to fix it.

@@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import argparse
import dataclasses
import json
import logging
import os
@@ -327,12 +325,7 @@ def main():


if args.command == "serialize":
eng_args_dict = {f.name: getattr(args, f.name) for f in
dataclasses.fields(EngineArgs)}

engine_args = EngineArgs.from_cli_args(
argparse.Namespace(**eng_args_dict)
)
engine_args = EngineArgs.from_cli_args(args)

input_dir = tensorizer_dir.rstrip('/')
suffix = args.suffix if args.suffix else uuid.uuid4().hex
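The simplification above is safe assuming `EngineArgs.from_cli_args` reads only the attributes named after `EngineArgs` fields, so passing the full parsed `args` namespace is equivalent to first filtering it down to those fields (extra attributes such as `command` or `suffix` are ignored). A small self-contained sketch of that behaviour, using a hypothetical `EngineArgsLike` stand-in rather than the real class:

import dataclasses
from argparse import Namespace

@dataclasses.dataclass
class EngineArgsLike:
    # hypothetical stand-in for vllm.EngineArgs, not the real class
    model: str = "facebook/opt-125m"
    max_model_len: int = 2048

    @classmethod
    def from_cli_args(cls, args: Namespace) -> "EngineArgsLike":
        field_names = [f.name for f in dataclasses.fields(cls)]
        # attributes that are not EngineArgs fields (e.g. `command`) are ignored
        return cls(**{n: getattr(args, n) for n in field_names if hasattr(args, n)})

args = Namespace(model="facebook/opt-125m", max_model_len=2048, command="serialize")
assert EngineArgsLike.from_cli_args(args).max_model_len == 2048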
11 changes: 9 additions & 2 deletions tests/compile/piecewise/test_full_cudagraph.py
@@ -46,7 +46,10 @@ class BackendConfig:
# FA3 on Hopper
"FA3":
BackendConfig(name="FA3",
env_vars={"VLLM_FLASH_ATTN_VERSION": "3"},
env_vars={
"VLLM_FLASH_ATTN_VERSION": "3",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": 16,
},
comp_config={
"cudagraph_mode": "FULL",
},
@@ -66,6 +69,7 @@ class BackendConfig:
BackendConfig(name="FlashAttentionMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": 16,
},
comp_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
@@ -89,7 +93,10 @@ class BackendConfig:
# FA2
"FA2":
BackendConfig(name="FA2",
env_vars={"VLLM_FLASH_ATTN_VERSION": "2"},
env_vars={
"VLLM_FLASH_ATTN_VERSION": "2",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": 16,
},
comp_config={
"cudagraph_mode": "FULL",
}),
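These backend configs now pin `VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH` to 16 alongside the FlashAttention version. For reproducing a full-cudagraph run outside the test harness, a hedged sketch of setting the same environment before constructing the engine (values mirror the test; the variable's semantics are assumed from its name, i.e. an upper bound on FlashAttention splits under CUDA graph capture):

import os

# Mirror the FA3 BackendConfig from the test above (assumed reproduction setup).
os.environ["VLLM_FLASH_ATTN_VERSION"] = "3"
os.environ["VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH"] = "16"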
2 changes: 1 addition & 1 deletion tests/entrypoints/test_api_server_process_manager.py
@@ -60,7 +60,7 @@ def test_api_server_process_manager_init(api_server_args, with_stats_update):
global WORKER_RUNTIME_SECONDS
WORKER_RUNTIME_SECONDS = 0.5

# Copy the args to avoid mutating the
# Copy the args to avoid mutating them
args = api_server_args.copy()

if not with_stats_update:
11 changes: 9 additions & 2 deletions tests/v1/cudagraph/test_cudagraph_mode.py
@@ -47,7 +47,10 @@ class BackendConfig:
# FA3 on Hopper
"FA3":
BackendConfig(name="FA3",
env_vars={"VLLM_FLASH_ATTN_VERSION": "3"},
env_vars={
"VLLM_FLASH_ATTN_VERSION": "3",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": 16,
},
comp_config={
"cudagraph_mode": "FULL",
},
@@ -67,6 +70,7 @@ class BackendConfig:
BackendConfig(name="FlashAttentionMLA",
env_vars={
"VLLM_ATTENTION_BACKEND": "FLASH_ATTN_MLA",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": 16,
},
comp_config={
"cudagraph_mode": "FULL_DECODE_ONLY",
@@ -75,7 +79,10 @@ class BackendConfig:
# FA2
"FA2":
BackendConfig(name="FA2",
env_vars={"VLLM_FLASH_ATTN_VERSION": "2"},
env_vars={
"VLLM_FLASH_ATTN_VERSION": "2",
"VLLM_FLASH_ATTN_MAX_NUM_SPLITS_FOR_CUDA_GRAPH": 16,
},
comp_config={
"cudagraph_mode": "FULL_AND_PIECEWISE",
}),
52 changes: 48 additions & 4 deletions tests/v1/test_external_lb_dp.py
@@ -9,6 +9,7 @@
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import requests

from tests.utils import RemoteOpenAIServer
from vllm.platforms import current_platform
@@ -70,6 +71,8 @@ def start_server(r: int, sargs: list[str]):
sargs,
auto_port=False,
env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var:
",".join(
str(
@@ -127,11 +130,19 @@ def default_server_args():


@pytest.fixture(scope="module", params=[1, 4])
def servers(request, default_server_args):
def server_manager(request, default_server_args):
api_server_count = request.param
with ExternalLBServerManager(MODEL_NAME, DP_SIZE, api_server_count,
default_server_args) as server_list:
yield server_list
server_manager = ExternalLBServerManager(MODEL_NAME, DP_SIZE,
api_server_count,
default_server_args)

with server_manager:
yield server_manager


@pytest.fixture
def servers(server_manager):
return server_manager.servers


@pytest_asyncio.fixture
@@ -144,6 +155,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
]


def _get_parallel_config(server: RemoteOpenAIServer):
response = requests.get(server.url_for("server_info?config_format=json"))
response.raise_for_status()

vllm_config = response.json()["vllm_config"]
return vllm_config["parallel_config"]


def test_external_lb_server_info(server_manager):
servers = server_manager.servers
api_server_count = server_manager.api_server_count

for i, (server, _) in enumerate(servers):
print(f"Testing {i=}")

# Each request will hit one of the API servers
# `n_reqs` is set so that there is a good chance each server
# receives at least one request
n_reqs = 2 * api_server_count * api_server_count
parallel_configs = [
_get_parallel_config(server) for _ in range(n_reqs)
]
api_process_counts = [
c["_api_process_count"] for c in parallel_configs
]
api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]

assert all(c == api_server_count
for c in api_process_counts), api_process_counts
assert all(0 <= r < api_server_count
for r in api_process_ranks), api_process_ranks


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
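The new `test_external_lb_server_info` exercises the dev-mode `/server_info` endpoint (enabled by the `VLLM_SERVER_DEV_MODE=1` env var added to the server fixture) and checks that each API server reports the expected `_api_process_count` and a rank within range. The same check can be run by hand; this sketch assumes a server already listening on localhost:8000 with dev mode enabled:

import requests

resp = requests.get("http://localhost:8000/server_info?config_format=json")
resp.raise_for_status()

parallel_config = resp.json()["vllm_config"]["parallel_config"]
print(parallel_config["_api_process_count"], parallel_config["_api_process_rank"])

The `n_reqs = 2 * api_server_count * api_server_count` heuristic keeps the test cheap while making it very likely every API server is sampled: if routing were uniformly random across 4 servers, 32 requests would leave some server unsampled with probability at most 4 * (3/4)**32, roughly 0.04%.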
54 changes: 49 additions & 5 deletions tests/v1/test_hybrid_lb_dp.py
@@ -9,6 +9,7 @@
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import requests

from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing
@@ -92,6 +93,8 @@ def start_server(node: int, sargs: list[str]):
sargs,
auto_port=False,
env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var:
",".join(
str(
@@ -150,12 +153,20 @@ def default_server_args():


@pytest.fixture(scope="module", params=[1, 4])
def servers(request, default_server_args):
def server_manager(request, default_server_args):
api_server_count = request.param
with HybridLBServerManager(MODEL_NAME, DP_SIZE, api_server_count,
default_server_args, DP_SIZE_LOCAL,
TP_SIZE) as server_list:
yield server_list
server_manager = HybridLBServerManager(MODEL_NAME, DP_SIZE,
api_server_count,
default_server_args, DP_SIZE_LOCAL,
TP_SIZE)

with server_manager:
yield server_manager


@pytest.fixture
def servers(server_manager):
return server_manager.servers


@pytest_asyncio.fixture
@@ -168,6 +179,39 @@ async def clients(servers: list[tuple[RemoteOpenAIServer, list[str]]]):
]


def _get_parallel_config(server: RemoteOpenAIServer):
response = requests.get(server.url_for("server_info?config_format=json"))
response.raise_for_status()

vllm_config = response.json()["vllm_config"]
return vllm_config["parallel_config"]


def test_hybrid_dp_server_info(server_manager):
servers = server_manager.servers
api_server_count = server_manager.api_server_count

for i, (server, _) in enumerate(servers):
print(f"Testing {i=}")

# Each request will hit one of the API servers
# `n_reqs` is set so that there is a good chance each server
# receives at least one request
n_reqs = 2 * api_server_count * api_server_count
parallel_configs = [
_get_parallel_config(server) for _ in range(n_reqs)
]
api_process_counts = [
c["_api_process_count"] for c in parallel_configs
]
api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]

assert all(c == api_server_count
for c in api_process_counts), api_process_counts
assert all(0 <= r < api_server_count
for r in api_process_ranks), api_process_ranks


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",
57 changes: 49 additions & 8 deletions tests/v1/test_internal_lb_dp.py
@@ -10,6 +10,7 @@
import openai # use the official client for correctness check
import pytest
import pytest_asyncio
import requests

from tests.utils import RemoteOpenAIServer
from tests.v1.test_utils import check_request_balancing
@@ -101,6 +102,8 @@ def start_server(sidx: int, r: int, sargs: list[str]):
sargs,
auto_port=False,
env_dict={
"VLLM_SERVER_DEV_MODE":
"1",
current_platform.device_control_env_var:
",".join(
str(
@@ -214,7 +217,10 @@ def start_api_server():
self.model_name,
api_server_args,
auto_port=False,
env_dict={}) # No GPUs needed for API-only server
env_dict={
"VLLM_SERVER_DEV_MODE": "1",
# No GPUs needed for API-only server
})
server.__enter__()
print(f"API-only server started successfully with "
f"{self.api_server_count} API servers")
@@ -293,14 +299,21 @@ def default_server_args():


@pytest.fixture(scope="module", params=[1, 4])
def servers(request, default_server_args):
def server_manager(request, default_server_args):
api_server_count = request.param
with MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
api_server_count,
default_server_args,
DP_SIZE // NUM_NODES,
TP_SIZE) as server_list:
yield server_list
server_manager = MultinodeInternalLBServerManager(MODEL_NAME, DP_SIZE,
api_server_count,
default_server_args,
DP_SIZE // NUM_NODES,
TP_SIZE)

with server_manager:
yield server_manager


@pytest.fixture
def servers(server_manager):
return server_manager.servers


@pytest.fixture(scope="module", params=[1, 4])
Expand Down Expand Up @@ -331,6 +344,34 @@ async def api_only_client(api_only_servers: list[tuple[RemoteOpenAIServer,
yield client


def _get_parallel_config(server: RemoteOpenAIServer):
response = requests.get(server.url_for("server_info?config_format=json"))
response.raise_for_status()

vllm_config = response.json()["vllm_config"]
return vllm_config["parallel_config"]


def test_multinode_dp_server_info(server_manager):
head_server = server_manager.servers[0][0]
api_server_count = server_manager.api_server_count

# Each request will hit one of the API servers
# `n_reqs` is set so that there is a good chance each server
# receives at least one request
n_reqs = 2 * api_server_count * api_server_count
parallel_configs = [
_get_parallel_config(head_server) for _ in range(n_reqs)
]
api_process_counts = [c["_api_process_count"] for c in parallel_configs]
api_process_ranks = [c["_api_process_rank"] for c in parallel_configs]

assert all(c == api_server_count
for c in api_process_counts), api_process_counts
assert all(0 <= r < api_server_count
for r in api_process_ranks), api_process_ranks


@pytest.mark.asyncio
@pytest.mark.parametrize(
"model_name",