diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml index 0eb64e566e71..d278dc983805 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -859,7 +859,7 @@ steps: - label: Language Models Tests (Extra Standard) %N timeout_in_minutes: 45 mirror_hardwares: [amdexperimental] - agent_pool: mi325_8 + agent_pool: mi325_2 # grade: Blocking torch_nightly: true source_file_dependencies: @@ -871,6 +871,7 @@ steps: # Shard slow subset of standard language models tests. Only run when model # source is modified, or when specified test files are modified - pip freeze | grep -E 'torch' + - export TORCH_NCCL_BLOCKING_WAIT=1 - pytest -v -s models/language -m 'core_model and slow_test' \ --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT \ --shard-id=$$BUILDKITE_PARALLEL_JOB @@ -888,7 +889,7 @@ steps: commands: # Install fast path packages for testing against transformers # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' # Shard hybrid language model tests - pytest -v -s models/language/generation \ @@ -909,7 +910,7 @@ steps: commands: # Install fast path packages for testing against transformers # Note: also needed to run plamo2 model in vLLM - - uv pip install --system --no-build-isolation 'git+https://github.com/state-spaces/mamba@v2.2.5' + - uv pip install --system --no-build-isolation 'git+https://github.com/AndreasKaratzas/mamba@fix-rocm-7.0-warp-size-constexpr' - uv pip install --system --no-build-isolation 'git+https://github.com/Dao-AILab/causal-conv1d@v1.5.2' - pytest -v -s models/language/generation -m '(not core_model) and (not hybrid_model)' diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 955c6d1c07bf..9a26dc611515 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -943,7 +943,6 @@ steps: timeout_in_minutes: 30 working_dir: "/vllm-workspace/" gpu: b200 - # optional: true source_file_dependencies: - csrc/quantization/fp4/ - csrc/attention/mla/ @@ -1348,6 +1347,14 @@ steps: - CUDA_VISIBLE_DEVICES=1,2 VLLM_USE_DEEP_GEMM=1 VLLM_LOGGING_LEVEL=DEBUG python3 examples/offline_inference/data_parallel.py --model=Qwen/Qwen1.5-MoE-A2.7B -tp=1 -dp=2 --max-model-len=2048 --all2all-backend=deepep_high_throughput - pytest -v -s tests/v1/distributed/test_dbo.py +- label: LM Eval Large Models (H200) # optional + timeout_in_minutes: 60 + gpu: h200 + optional: true + num_gpus: 8 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=configs/models-h200.txt + ##### B200 test ##### - label: Distributed Tests (B200) # optional gpu: b200 @@ -1399,3 +1406,19 @@ steps: working_dir: "/vllm-workspace" commands: - bash .buildkite/scripts/scheduled_integration_test/qwen30b_a3b_fp8_block_ep_eplb.sh 0.8 200 8020 2 1 + +##### MoE Refactor (Temporary) Tests ##### + +- label: MoE Refactor Integration Test (H100 - TEMPORARY) # optional + gpu: h100 + optional: true + num_gpus: 2 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py --config-list-file=evals/gsm8k/configs/moe-refactor/config-h100.txt + +- label: MoE Refactor Integration Test (B200 - TEMPORARY) # optional + gpu: b200 + optional: true + num_gpus: 2 + commands: + - pytest -s -v evals/gsm8k/test_gsm8k_correctness.py 
--config-list-file=evals/gsm8k/configs/moe-refactor/config-b200.txt diff --git a/benchmarks/kernels/benchmark_activation.py b/benchmarks/kernels/benchmark_activation.py index d31e67057d8f..fbe5f744148e 100644 --- a/benchmarks/kernels/benchmark_activation.py +++ b/benchmarks/kernels/benchmark_activation.py @@ -8,10 +8,9 @@ import vllm.model_executor.layers.activation # noqa F401 from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed batch_size_range = [1, 16, 128] seq_len_range = [1, 16, 64, 1024, 4096] @@ -30,7 +29,7 @@ def benchmark_activation( device = "cuda" num_tokens = batch_size * seq_len dim = intermediate_size - current_platform.seed_everything(42) + set_random_seed(42) torch.set_default_device(device) if func_name == "gelu_and_mul": diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index e07d6c776bc0..626b3b160044 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -15,6 +15,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.worker.workspace import init_workspace_manager # Weight shapes for different models: [num_experts, topk, hidden_size, # intermediate_size] @@ -297,6 +298,10 @@ def bench_cuda_graph(graph, num_warmup=5, num_iters=100): def main(args): + # Initialize workspace manager (required for CUTLASS MoE kernels) + device = torch.device("cuda:0") + init_workspace_manager(device) + print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") diff --git a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py similarity index 98% rename from benchmarks/kernels/benchmark_cutlass_fp4_moe.py rename to benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py index 7982cbb1422c..d6b5820a5b41 100644 --- a/benchmarks/kernels/benchmark_cutlass_fp4_moe.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.scalar_type import scalar_types from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.worker.workspace import init_workspace_manager WEIGHT_SHAPES_MOE = { "nvidia/DeepSeek-R1-FP4": [ @@ -441,6 +442,10 @@ def replay_graph(graph, num_repeats): def main(args): + # Initialize workspace manager (required for CUTLASS MoE kernels) + device = torch.device("cuda:0") + init_workspace_manager(device) + print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index 9b426d8d5f77..4390be8770c1 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -14,6 +14,7 @@ fused_topk, ) from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.v1.worker.workspace import init_workspace_manager DEFAULT_MODELS = [ "mistralai/Mixtral-8x7B-Instruct-v0.1", @@ -364,6 +365,10 @@ def replay_graph(graph, num_repeats): 
def main(args): + # Initialize workspace manager (required for CUTLASS MoE kernels) + device = torch.device("cuda:0") + init_workspace_manager(device) + print("Benchmarking models:") for i, model in enumerate(args.models): print(f"[{i}] {model}") diff --git a/benchmarks/kernels/benchmark_layernorm.py b/benchmarks/kernels/benchmark_layernorm.py index 6fa5c248670e..2292d2f87288 100644 --- a/benchmarks/kernels/benchmark_layernorm.py +++ b/benchmarks/kernels/benchmark_layernorm.py @@ -6,9 +6,8 @@ import torch from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed @torch.inference_mode() @@ -22,7 +21,7 @@ def main( num_warmup_iters: int = 5, num_iters: int = 100, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device("cuda") layer = RMSNorm(hidden_size).to(dtype=dtype) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index a1af0b8aec3d..26a281f4e4fb 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import argparse +import gc import json import os import time @@ -23,9 +24,50 @@ from vllm.transformers_utils.config import get_config from vllm.triton_utils import triton from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import set_random_seed FP8_DTYPE = current_platform.fp8_dtype() +# Default interval for clearing Triton JIT cache during tuning +# Set to 0 to disable automatic cache clearing +_CACHE_CLEAR_INTERVAL_ENV = "VLLM_MOE_TUNE_CACHE_CLEAR_INTERVAL" +TRITON_CACHE_CLEAR_INTERVAL = int(os.environ.get(_CACHE_CLEAR_INTERVAL_ENV, "50")) + + +def clear_triton_cache(): + """Clear Triton JIT compilation cache and Python/CUDA memory. + + This helps prevent OOM during tuning with large models (many experts). + """ + # Force Python garbage collection + gc.collect() + + # Clear CUDA memory cache + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # Try to clear Triton's runtime cache + try: + import triton + + if ( + hasattr(triton, "runtime") + and hasattr(triton.runtime, "cache") + and hasattr(triton.runtime.cache, "clear") + ): + triton.runtime.cache.clear() + except ImportError: + # Triton not installed, skip cache clearing + pass + except AttributeError: + # Triton version doesn't have expected cache API + pass + except Exception as e: + print(f"Warning: Failed to clear Triton cache: {e}") + + # Additional garbage collection after clearing caches + gc.collect() + def ensure_divisibility(numerator, denominator, text): """Ensure that numerator is divisible by the denominator.""" @@ -390,7 +432,7 @@ def merge_unique_dicts(list1, list2): class BenchmarkWorker: def __init__(self, seed: int) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(seed) + set_random_seed(seed) self.seed = seed # Get the device ID to allocate tensors and kernels # on the respective GPU. 
This is required for Ray to work @@ -410,7 +452,7 @@ def benchmark( block_quant_shape: list[int] = None, use_deep_gemm: bool = False, ) -> tuple[dict[str, int], float]: - current_platform.seed_everything(self.seed) + set_random_seed(self.seed) dtype_str = _get_config_dtype_str( dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8 ) @@ -483,7 +525,7 @@ def tune( need_device_guard = True with torch.cuda.device(self.device_id) if need_device_guard else nullcontext(): - for config in tqdm(search_space): + for idx, config in enumerate(tqdm(search_space)): try: kernel_time = benchmark_config( config, @@ -506,6 +548,19 @@ def tune( if kernel_time < best_time: best_time = kernel_time best_config = config + + # Periodically clear Triton JIT cache to prevent OOM + # This is especially important for large models with many experts + if ( + TRITON_CACHE_CLEAR_INTERVAL > 0 + and idx > 0 + and idx % TRITON_CACHE_CLEAR_INTERVAL == 0 + ): + clear_triton_cache() + + # Final cleanup after tuning completes + clear_triton_cache() + now = datetime.now() print(f"{now.ctime()}] Completed tuning for batch_size={num_tokens}") assert best_config is not None diff --git a/benchmarks/kernels/benchmark_moe_permute_unpermute.py b/benchmarks/kernels/benchmark_moe_permute_unpermute.py index b8913a217c60..77b77a15b53a 100644 --- a/benchmarks/kernels/benchmark_moe_permute_unpermute.py +++ b/benchmarks/kernels/benchmark_moe_permute_unpermute.py @@ -18,6 +18,7 @@ from vllm.model_executor.layers.fused_moe.utils import _fp8_quantize from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import set_random_seed FP8_DTYPE = current_platform.fp8_dtype() @@ -261,7 +262,7 @@ def run(input: tuple): class BenchmarkWorker: def __init__(self, seed: int) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(seed) + set_random_seed(seed) self.seed = seed # Get the device ID to allocate tensors and kernels # on the respective GPU. 
This is required for Ray to work @@ -279,7 +280,7 @@ def benchmark( use_int8_w8a16: bool, use_customized_permute: bool = False, ) -> tuple[dict[str, int], float]: - current_platform.seed_everything(self.seed) + set_random_seed(self.seed) permute_time = benchmark_permute( num_tokens, diff --git a/benchmarks/kernels/benchmark_mrope.py b/benchmarks/kernels/benchmark_mrope.py index 09de5fa822f8..3e0365135778 100644 --- a/benchmarks/kernels/benchmark_mrope.py +++ b/benchmarks/kernels/benchmark_mrope.py @@ -37,9 +37,9 @@ import torch from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import set_random_seed device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -94,7 +94,7 @@ def benchmark_mrope( benchmark_iter: int = 100, csv_writer=None, ): - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) # the parameters to compute the q k v size based on tp_size mrope_helper_class = get_rope( diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py index 46ab2a5fe5e9..be871d3d1aa0 100644 --- a/benchmarks/kernels/benchmark_paged_attention.py +++ b/benchmarks/kernels/benchmark_paged_attention.py @@ -13,6 +13,7 @@ from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random, + set_random_seed, ) logger = init_logger(__name__) @@ -38,7 +39,7 @@ def main( device: str = "cuda", kv_cache_dtype: str | None = None, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) scale = float(1.0 / (head_size**0.5)) query = torch.empty( diff --git a/benchmarks/kernels/benchmark_quant.py b/benchmarks/kernels/benchmark_quant.py index 3c2ac9128947..9a21cfe94e5b 100644 --- a/benchmarks/kernels/benchmark_quant.py +++ b/benchmarks/kernels/benchmark_quant.py @@ -6,9 +6,8 @@ import torch from vllm import _custom_ops as ops -from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed @torch.inference_mode() @@ -23,7 +22,7 @@ def main( num_warmup_iters: int = 5, num_iters: int = 100, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device("cuda") x = torch.randn(num_tokens, hidden_size, dtype=dtype) diff --git a/benchmarks/kernels/benchmark_reshape_and_cache.py b/benchmarks/kernels/benchmark_reshape_and_cache.py index 0d3aef0c630b..99067d8ac371 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache.py @@ -8,11 +8,11 @@ from vllm import _custom_ops as ops from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random, + set_random_seed, ) logger = init_logger(__name__) @@ -36,7 +36,7 @@ def run_benchmark( if kv_cache_dtype == "fp8" and head_size % 16: raise ValueError("fp8 kv-cache requires head_size to be a multiple of 16.") - current_platform.seed_everything(42) + set_random_seed(42) torch.set_default_device(device) # create random key / value tensors [T, H, D]. 
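The hunks above and below apply the same seeding migration across the kernel benchmarks; a minimal sketch of the resulting call pattern, assuming only that `vllm.utils.torch_utils.set_random_seed` accepts an integer seed as these hunks show (the toy benchmark body is hypothetical, not code from this patch):

```python
# Illustrative sketch of the new seeding helper used across these benchmarks.
# Assumes a vLLM build that provides vllm.utils.torch_utils.set_random_seed;
# run_toy_benchmark is a hypothetical stand-in for the real benchmark bodies.
import torch

from vllm.utils.torch_utils import set_random_seed


def run_toy_benchmark(seed: int = 42) -> torch.Tensor:
    # Previously: current_platform.seed_everything(seed)
    set_random_seed(seed)
    torch.set_default_device("cuda" if torch.cuda.is_available() else "cpu")
    # Deterministic for a fixed seed, so repeated runs are comparable.
    return torch.randn(16, 4096)


if __name__ == "__main__":
    print(run_toy_benchmark().float().mean().item())
```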
diff --git a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py index 12f17ea575d9..bca66f301127 100644 --- a/benchmarks/kernels/benchmark_reshape_and_cache_flash.py +++ b/benchmarks/kernels/benchmark_reshape_and_cache_flash.py @@ -11,11 +11,11 @@ triton_reshape_and_cache_flash, ) from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.utils.argparse_utils import FlexibleArgumentParser from vllm.utils.torch_utils import ( STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random_flash, + set_random_seed, ) logger = init_logger(__name__) @@ -49,7 +49,7 @@ def run_benchmark( if implementation == "triton" and kv_cache_layout == "HND": return float("nan") # Triton does not support HND layout yet. - current_platform.seed_everything(42) + set_random_seed(42) torch.set_default_device(device) # create random key / value tensors [T, H, D]. diff --git a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py index de01ff197eab..da32bc30cb2a 100644 --- a/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py +++ b/benchmarks/kernels/benchmark_silu_mul_fp8_quant.py @@ -23,9 +23,9 @@ from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import ( persistent_masked_m_silu_mul_quant, ) -from vllm.platforms import current_platform from vllm.triton_utils import tl, triton from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used +from vllm.utils.torch_utils import set_random_seed @triton.jit @@ -207,7 +207,7 @@ def benchmark( ): def generate_data(seed_offset=0): """Generate input data with given seed offset""" - current_platform.seed_everything(42 + seed_offset) + set_random_seed(42 + seed_offset) y = torch.rand((E, T, 2 * H), dtype=torch.bfloat16, device="cuda").contiguous() if gen_strategy == "random_imbalanced": diff --git a/benchmarks/kernels/cpu/benchmark_cpu_attn.py b/benchmarks/kernels/cpu/benchmark_cpu_attn.py new file mode 100644 index 000000000000..30b860395373 --- /dev/null +++ b/benchmarks/kernels/cpu/benchmark_cpu_attn.py @@ -0,0 +1,272 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +import functools +import time + +import numpy as np +import torch + +from vllm._custom_ops import ( + cpu_attention_with_kv_cache, + cpu_attn_get_scheduler_metadata, + cpu_attn_reshape_and_cache, +) +from vllm.platforms import CpuArchEnum, current_platform +from vllm.utils.argparse_utils import FlexibleArgumentParser +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.v1.attention.backends.cpu_attn import CPUAttentionBackend, _get_attn_isa + + +def get_attn_isa( + block_size: int | None = None, + dtype: torch.dtype | None = None, +): + if block_size and dtype: + return _get_attn_isa(dtype, block_size) + else: + if current_platform.get_cpu_architecture() == CpuArchEnum.ARM: + return "neon" + elif torch._C._cpu._is_amx_tile_supported(): + return "amx" + else: + return "vec" + + +# rand number generation takes too much time, cache rand tensors +@functools.lru_cache(maxsize=128, typed=False) +def tensor_cache( + elem_num: int, + dtype: torch.dtype, +) -> torch.Tensor: + tensor = torch.randn(elem_num, dtype=dtype) + return tensor + + +@torch.inference_mode() +def main( + seq_lens: list[tuple[int, int]], + num_heads: tuple[int, int], + head_size: int, + sliding_window: int = None, + dtype: torch.dtype = torch.bfloat16, + block_size: int = 128, + num_blocks: int = 4096, + use_sink: bool = 
False, + enable_kv_split: bool = False, + isa: str | None = None, + seed: int = 0, + iters: int = 20, +) -> None: + current_platform.seed_everything(seed) + num_seqs = len(seq_lens) + query_lens = [x[0] for x in seq_lens] + kv_lens = [x[1] for x in seq_lens] + num_query_heads = num_heads[0] + num_kv_heads = num_heads[1] + assert num_query_heads % num_kv_heads == 0 + max_kv_len = max(kv_lens) + window_size = (sliding_window - 1, 0) if sliding_window is not None else (-1, -1) + scale = head_size**-0.5 + token_num = sum(query_lens) + + if isa is None: + isa = get_attn_isa(block_size, dtype) + + s_aux = ( + 15 * torch.rand((num_query_heads,), dtype=torch.bfloat16) if use_sink else None + ) + + query = tensor_cache( + elem_num=token_num * num_query_heads * head_size, + dtype=dtype, + ) + query = query.view( + token_num, + num_query_heads, + head_size, + ) + + key_value = tensor_cache( + elem_num=2 * num_blocks * num_kv_heads * block_size * head_size, + dtype=dtype, + ) + key_value = key_value.view( + 2, + num_blocks, + block_size, + num_kv_heads, + head_size, + ) + key_cache, value_cache = key_value.unbind(0) + + # KV cache for CPU attention + packed_key_cache = torch.empty( + num_blocks, num_kv_heads, block_size, head_size, dtype=dtype + ) + packed_value_cache = torch.empty_like(packed_key_cache) + + cu_query_lens = torch.tensor([0] + query_lens, dtype=torch.int32).cumsum( + dim=0, dtype=torch.int32 + ) + kv_lens_tensor = torch.tensor(kv_lens, dtype=torch.int32) + max_num_blocks_per_seq = (max_kv_len + block_size - 1) // block_size + block_tables = torch.randint( + 0, num_blocks, (num_seqs, max_num_blocks_per_seq), dtype=torch.int32 + ) + + # use reshape_and_cache to pack key_cache and value_cache + slot_mapping = torch.arange(0, num_blocks * block_size, dtype=torch.int64) + cpu_attn_reshape_and_cache( + key=key_cache.view(-1, num_kv_heads, head_size), + value=value_cache.view(-1, num_kv_heads, head_size), + key_cache=packed_key_cache, + value_cache=packed_value_cache, + slot_mapping=slot_mapping, + isa=isa, + ) + + metadata = cpu_attn_get_scheduler_metadata( + num_reqs=num_seqs, + num_heads=num_query_heads, + num_kv_heads=num_kv_heads, + head_dim=head_size, + seq_lens=kv_lens_tensor, + dtype=dtype, + query_start_loc=cu_query_lens, + causal=True, + sliding_window_size=sliding_window if sliding_window is not None else -1, + isa=isa, + enable_kv_split=enable_kv_split, + ) + + out_with_split = torch.empty_like(query) + + def run_benchmark(iters: int) -> list[float]: + times = [] + for _ in range(iters): + start_time = time.perf_counter_ns() + cpu_attention_with_kv_cache( + query=query, + key_cache=packed_key_cache, + value_cache=packed_value_cache, + output=out_with_split, + query_start_loc=cu_query_lens, + seq_lens=kv_lens_tensor, + scale=scale, + causal=True, + alibi_slopes=None, + sliding_window=window_size, + block_table=block_tables, + softcap=0, + scheduler_metadata=metadata, + s_aux=s_aux, + ) + end_time = time.perf_counter_ns() + times.append((end_time - start_time) / 1e6) + return times + + # warmup + run_benchmark(5) + # benchmark + times = run_benchmark(iters) + + time_min = min(times) + time_max = max(times) + time_mean = np.mean(times) + time_std = np.std(times) + + print("\tmin (ms) = ", time_min) + print("\tmax (ms) = ", time_max) + print("\tmean (ms) = ", time_mean) + print("\tstd = ", time_std) + print("\tmedian (ms) = ", np.median(times)) + + +def generate_seq_lens( + batch_size: int, + q_len_min: int, + q_len_max: int, + kv_len_min: int, + kv_len_max: int, + seed: int = 0, +) 
-> list[tuple[int, int]]: + assert 1 <= q_len_min <= q_len_max + assert 1 <= kv_len_min <= kv_len_max + assert kv_len_max >= q_len_min + + g = torch.Generator(device="cpu").manual_seed(seed) + + def rint(lo: int, hi: int) -> int: + return torch.randint(lo, hi + 1, (1,), generator=g).item() + + seq_lens: list[tuple[int, int]] = [] + for _ in range(batch_size): + # ensure q <= kv + kv = rint(max(kv_len_min, q_len_min), kv_len_max) + q = rint(q_len_min, min(q_len_max, kv)) + seq_lens.append((q, kv)) + + return seq_lens + + +if __name__ == "__main__": + parser = FlexibleArgumentParser(description="Benchmark the paged attention kernel.") + parser.add_argument("--batch-size", type=int, default=64) + parser.add_argument("--q-len-min", type=int, default=512) + parser.add_argument("--q-len-max", type=int, default=512) + parser.add_argument("--kv-len-min", type=int, default=512) + parser.add_argument("--kv-len-max", type=int, default=512) + parser.add_argument("--num-blocks", type=int, default=4096) + + parser.add_argument("--sliding-window", type=int, default=None) + parser.add_argument("--num-query-heads", type=int, default=32) + parser.add_argument("--num-kv-heads", type=int, default=8) + parser.add_argument( + "--head-size", + type=int, + choices=CPUAttentionBackend.get_supported_head_sizes(), + default=128, + ) + parser.add_argument("--enable-kv-split", action="store_true") + parser.add_argument("--block-size", type=int, choices=[32, 64, 128], default=128) + parser.add_argument( + "--dtype", type=str, choices=["half", "bfloat16", "float"], default="bfloat16" + ) + parser.add_argument("--use-sink", action="store_true") + parser.add_argument( + "--isa", type=str, choices=["vec", "neon", "amx", "vec16"], default=None + ) + parser.add_argument("--seed", type=int, default=0) + parser.add_argument("--iters", type=int, default=20) + + args = parser.parse_args() + print(args) + + seq_lens = generate_seq_lens( + args.batch_size, + args.q_len_min, + args.q_len_max, + args.kv_len_min, + args.kv_len_max, + args.seed, + ) + + print("batch (query len, kv len) = ", seq_lens) + + main( + seq_lens=seq_lens, + num_heads=(args.num_query_heads, args.num_kv_heads), + head_size=args.head_size, + sliding_window=args.sliding_window, + dtype=STR_DTYPE_TO_TORCH_DTYPE[args.dtype], + block_size=args.block_size, + num_blocks=args.num_blocks, + use_sink=args.use_sink, + enable_kv_split=args.enable_kv_split, + isa=args.isa + if args.isa is not None + else get_attn_isa(args.block_size, STR_DTYPE_TO_TORCH_DTYPE[args.dtype]), + seed=args.seed, + iters=args.iters, + ) diff --git a/cmake/external_projects/qutlass.cmake b/cmake/external_projects/qutlass.cmake index 5a59a409999a..84bb1b00c1bb 100644 --- a/cmake/external_projects/qutlass.cmake +++ b/cmake/external_projects/qutlass.cmake @@ -31,10 +31,15 @@ if(NOT qutlass_SOURCE_DIR) endif() message(STATUS "[QUTLASS] QuTLASS is available at ${qutlass_SOURCE_DIR}") -cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a" "${CUDA_ARCHS}") -if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.8 AND QUTLASS_ARCHS) +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 13.0) + cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0f" "${CUDA_ARCHS}") +else() + cuda_archs_loose_intersection(QUTLASS_ARCHS "12.0a;10.0a;10.3a" "${CUDA_ARCHS}") +endif() + +if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER_EQUAL 12.8 AND QUTLASS_ARCHS) - if(QUTLASS_ARCHS MATCHES "10\\.0a") + if(QUTLASS_ARCHS MATCHES "10\\.(0a|3a|0f)") set(QUTLASS_TARGET_CC 100) elseif(QUTLASS_ARCHS MATCHES "12\\.0a") 
set(QUTLASS_TARGET_CC 120) diff --git a/docker/Dockerfile b/docker/Dockerfile index 679ffc4a7df5..0daff39956b4 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -273,6 +273,7 @@ RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh ARG PPLX_COMMIT_HASH ARG DEEPEP_COMMIT_HASH +ARG NVSHMEM_VER RUN --mount=type=cache,target=/root/.cache/uv \ mkdir -p /tmp/ep_kernels_workspace/dist && \ export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \ @@ -280,7 +281,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ --workspace /tmp/ep_kernels_workspace \ --mode wheel \ ${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \ - ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \ + ${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} \ + ${NVSHMEM_VER:+--nvshmem-ver "$NVSHMEM_VER"} && \ find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete #################### EXTENSIONS BUILD IMAGE #################### @@ -615,6 +617,7 @@ RUN mv vllm src/vllm FROM vllm-base AS vllm-openai-base ARG TARGETPLATFORM ARG INSTALL_KV_CONNECTORS=false +ARG CUDA_VERSION ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL @@ -626,8 +629,26 @@ ENV UV_HTTP_TIMEOUT=500 # install kv_connectors if requested RUN --mount=type=cache,target=/root/.cache/uv \ --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \ + CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ + CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \ + CUDA_HOME=/usr/local/cuda; \ + # lmcache requires explicit specifying CUDA_HOME + BUILD_PKGS="libcusparse-dev-${CUDA_VERSION_DASH} \ + libcublas-dev-${CUDA_VERSION_DASH} \ + libcusolver-dev-${CUDA_VERSION_DASH}"; \ if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ - uv pip install --system -r /tmp/kv_connectors.txt || true; \ + if [ "$CUDA_MAJOR" -ge 13 ]; then \ + uv pip install --system nixl-cu13; \ + fi; \ + uv pip install --system -r /tmp/kv_connectors.txt --no-build || ( \ + # if the above fails, install from source + apt-get update -y && \ + apt-get install -y --no-install-recommends ${BUILD_PKGS} && \ + uv pip install --system -r /tmp/kv_connectors.txt --no-build-isolation && \ + apt-get purge -y ${BUILD_PKGS} && \ + # clean up -dev packages, keep runtime libraries + rm -rf /var/lib/apt/lists/* \ + ); \ fi ENV VLLM_USAGE_SOURCE production-docker-image diff --git a/docs/cli/bench/mm_processor.md b/docs/cli/bench/mm_processor.md new file mode 100644 index 000000000000..af2c3a8cfd36 --- /dev/null +++ b/docs/cli/bench/mm_processor.md @@ -0,0 +1,9 @@ +# vllm bench mm-processor + +## JSON CLI Arguments + +--8<-- "docs/cli/json_tip.inc.md" + +## Arguments + +--8<-- "docs/generated/argparse/bench_mm_processor.inc.md" diff --git a/docs/design/plugin_system.md b/docs/design/plugin_system.md index 0fd448c2153c..6a4b5fd6b882 100644 --- a/docs/design/plugin_system.md +++ b/docs/design/plugin_system.md @@ -154,3 +154,4 @@ The interface for the model/module may change during vLLM's development. If you !!! warning "Deprecations" - `use_v1` parameter in `Platform.get_attn_backend_cls` is deprecated. It has been removed in v0.13.0. - `_Backend` in `vllm.attention` is deprecated. It has been removed in v0.13.0. Please use `vllm.attention.backends.registry.register_backend` to add new attention backend to `AttentionBackendEnum` instead. + - `seed_everything` platform interface is deprecated. It will be removed in v0.15.0 or later. 
Please use `vllm.utils.torch_utils.set_random_seed` instead. diff --git a/docs/features/lora.md b/docs/features/lora.md index eb9f44638543..09ab13dcc638 100644 --- a/docs/features/lora.md +++ b/docs/features/lora.md @@ -277,7 +277,7 @@ The new format of `--lora-modules` is mainly to support the display of parent mo ## LoRA Support for Tower and Connector of Multi-Modal Model -Currently, vLLM experimentally supports LoRA for the Tower and Connector components of multi-modal models. To enable this feature, you need to implement the corresponding token helper functions for the tower and connector. For more details on the rationale behind this approach, please refer to [PR 26674](https://github.com/vllm-project/vllm/pull/26674). We welcome contributions to extend LoRA support to additional models' tower and connector. +Currently, vLLM experimentally supports LoRA for the Tower and Connector components of multi-modal models. To enable this feature, you need to implement the corresponding token helper functions for the tower and connector. For more details on the rationale behind this approach, please refer to [PR 26674](https://github.com/vllm-project/vllm/pull/26674). We welcome contributions to extend LoRA support to additional models' tower and connector. Please refer to [Issue 31479](https://github.com/vllm-project/vllm/issues/31479) to check the current model support status. ## Default LoRA Models For Multimodal Models diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index c3fd726e9938..4e54056d1b07 100644 --- a/docs/features/multimodal_inputs.md +++ b/docs/features/multimodal_inputs.md @@ -166,49 +166,51 @@ Full example: [examples/offline_inference/vision_language_multi_image.py](../../ If using the [LLM.chat](../models/generative_models.md#llmchat) method, you can pass images directly in the message content using various formats: image URLs, PIL Image objects, or pre-computed embeddings: -```python -from vllm import LLM -from vllm.assets.image import ImageAsset - -llm = LLM(model="llava-hf/llava-1.5-7b-hf") -image_url = "https://picsum.photos/id/32/512/512" -image_pil = ImageAsset('cherry_blossom').pil_image -image_embeds = torch.load(...) - -conversation = [ - {"role": "system", "content": "You are a helpful assistant"}, - {"role": "user", "content": "Hello"}, - {"role": "assistant", "content": "Hello! How can I assist you today?"}, - { - "role": "user", - "content": [ - { - "type": "image_url", - "image_url": {"url": image_url}, - }, - { - "type": "image_pil", - "image_pil": image_pil, - }, - { - "type": "image_embeds", - "image_embeds": image_embeds, - }, - { - "type": "text", - "text": "What's in these images?", - }, - ], - }, -] +??? code -# Perform inference and log output. -outputs = llm.chat(conversation) + ```python + from vllm import LLM + from vllm.assets.image import ImageAsset -for o in outputs: - generated_text = o.outputs[0].text - print(generated_text) -``` + llm = LLM(model="llava-hf/llava-1.5-7b-hf") + image_url = "https://picsum.photos/id/32/512/512" + image_pil = ImageAsset('cherry_blossom').pil_image + image_embeds = torch.load(...) + + conversation = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hello! 
How can I assist you today?"}, + { + "role": "user", + "content": [ + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + { + "type": "image_pil", + "image_pil": image_pil, + }, + { + "type": "image_embeds", + "image_embeds": image_embeds, + }, + { + "type": "text", + "text": "What's in these images?", + }, + ], + }, + ] + + # Perform inference and log output. + outputs = llm.chat(conversation) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + ``` Multi-image input can be extended to perform video captioning. We show this with [Qwen2-VL](https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct) as it supports videos: @@ -506,6 +508,7 @@ Then, you can use the OpenAI client as follows: ??? code ```python + import os from openai import OpenAI openai_api_key = "EMPTY" @@ -517,8 +520,11 @@ Then, you can use the OpenAI client as follows: ) # Single-image input inference + + # Public image URL for testing remote image processing image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + # Create chat completion with remote image chat_response = client.chat.completions.create( model="microsoft/Phi-3.5-vision-instruct", messages=[ @@ -542,6 +548,35 @@ Then, you can use the OpenAI client as follows: ) print("Chat completion output:", chat_response.choices[0].message.content) + # Local image file path (update this to point to your actual image file) + image_file = "/path/to/image.jpg" + + # Create chat completion with local image file + # Launch the API server/engine with the --allowed-local-media-path argument. + if os.path.exists(image_file): + chat_completion_from_local_image_url = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[ + { + "role": "user", + "content": [ + { + "type": "text", + "text": "What’s in this image?", + }, + { + "type": "image_url", + "image_url": {"url": f"file://{image_file}"}, + }, + ], + } + ], + ) + result = chat_completion_from_local_image_url.choices[0].message.content + print("Chat completion output from local image file:\n", result) + else: + print(f"Local image file not found at {image_file}, skipping local file test.") + # Multi-image input inference image_url_duck = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/duck.jpg" image_url_lion = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/lion.jpg" @@ -860,6 +895,8 @@ The following example demonstrates how to pass image embeddings to the OpenAI se For Online Serving, you can also skip sending media if you expect cache hits with provided UUIDs. You can do so by sending media like this: +??? 
code + ```python # Image/video/audio URL: { diff --git a/docs/mkdocs/hooks/generate_argparse.py b/docs/mkdocs/hooks/generate_argparse.py index 3baa4d817dee..53bfce93ce21 100644 --- a/docs/mkdocs/hooks/generate_argparse.py +++ b/docs/mkdocs/hooks/generate_argparse.py @@ -92,6 +92,7 @@ def auto_mock(module_name: str, attr: str, max_mocks: int = 100): bench_latency = auto_mock("vllm.benchmarks", "latency") +bench_mm_processor = auto_mock("vllm.benchmarks", "mm_processor") bench_serve = auto_mock("vllm.benchmarks", "serve") bench_sweep_plot = auto_mock("vllm.benchmarks.sweep.plot", "SweepPlotArgs") bench_sweep_plot_pareto = auto_mock( @@ -222,6 +223,7 @@ def on_startup(command: Literal["build", "gh-deploy", "serve"], dirty: bool): "run-batch": create_parser(openai_run_batch.make_arg_parser), # Benchmark CLI "bench_latency": create_parser(bench_latency.add_cli_args), + "bench_mm_processor": create_parser(bench_mm_processor.add_cli_args), "bench_serve": create_parser(bench_serve.add_cli_args), "bench_sweep_plot": create_parser(bench_sweep_plot.add_cli_args), "bench_sweep_plot_pareto": create_parser(bench_sweep_plot_pareto.add_cli_args), diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 0328fe52223a..39e965c1d099 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -540,21 +540,28 @@ If your model is not in the above list, we will try to automatically convert the Cross-encoder and reranker models are a subset of classification models that accept two prompts as input. These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) API. -| Architecture | Models | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | -|--------------|--------|-------------------|----------------------|---------------------------| -| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | | | -| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma` (see note), etc. | ✅︎ | ✅︎ | -| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | | | -| `LlamaBidirectionalForSequenceClassification`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2` (see note), etc. | ✅︎ | ✅︎ | -| `Qwen2ForSequenceClassification`C | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2` (see note), etc. | ✅︎ | ✅︎ | -| `Qwen3ForSequenceClassification`C | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B` (see note), etc. | ✅︎ | ✅︎ | -| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | | | -| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | | | -| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | \* | \* | +| Architecture | Models | Example HF Models | Score template (see note) | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | +|--------------|--------|-------------------|---------------------------|-----------------------------|-----------------------------------------| +| `BertForSequenceClassification` | BERT-based | `cross-encoder/ms-marco-MiniLM-L-6-v2`, etc. | N/A | | | +| `GemmaForSequenceClassification` | Gemma-based | `BAAI/bge-reranker-v2-gemma`(see note), etc. 
| [bge-reranker-v2-gemma.jinja](../../examples/pooling/score/template/bge-reranker-v2-gemma.jinja) | ✅︎ | ✅︎ | +| `GteNewForSequenceClassification` | mGTE-TRM (see note) | `Alibaba-NLP/gte-multilingual-reranker-base`, etc. | N/A | | | +| `LlamaBidirectionalForSequenceClassification`C | Llama-based with bidirectional attention | `nvidia/llama-nemotron-rerank-1b-v2`, etc. | [nemotron-rerank.jinja](../../examples/pooling/score/template/nemotron-rerank.jinja) | ✅︎ | ✅︎ | +| `Qwen2ForSequenceClassification`C | Qwen2-based | `mixedbread-ai/mxbai-rerank-base-v2`(see note), etc. | [mxbai_rerank_v2.jinja](../../examples/pooling/score/template/mxbai_rerank_v2.jinja) | ✅︎ | ✅︎ | +| `Qwen3ForSequenceClassification`C | Qwen3-based | `tomaarsen/Qwen3-Reranker-0.6B-seq-cls`, `Qwen/Qwen3-Reranker-0.6B`(see note), etc. | [qwen3_reranker.jinja](../../examples/pooling/score/template/qwen3_reranker.jinja) | ✅︎ | ✅︎ | +| `RobertaForSequenceClassification` | RoBERTa-based | `cross-encoder/quora-roberta-base`, etc. | N/A | | | +| `XLMRobertaForSequenceClassification` | XLM-RoBERTa-based | `BAAI/bge-reranker-v2-m3`, etc. | N/A | | | +| `*Model`C, `*ForCausalLM`C, etc. | Generative models | N/A | N/A | \* | \* | C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. +!!! note + Some models require a specific prompt format to work correctly. + + You can find Example HF Models's corresponding score template in [examples/pooling/score/template/](../../examples/pooling/score/template) + + Examples : [examples/pooling/score/using_template_offline.py](../../examples/pooling/score/using_template_offline.py) [examples/pooling/score/using_template_online.py](../../examples/pooling/score/using_template_online.py) + !!! note Load the official original `BAAI/bge-reranker-v2-gemma` by using the following command. @@ -565,11 +572,6 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A !!! note The second-generation GTE model (mGTE-TRM) is named `NewForSequenceClassification`. The name `NewForSequenceClassification` is too generic, you should set `--hf-overrides '{"architectures": ["GteNewForSequenceClassification"]}'` to specify the use of the `GteNewForSequenceClassification` architecture. -!!! note - `nvidia/llama-nemotron-rerank-1b-v2` require a specific prompt format to work correctly. - - Examples : [offline_using_template.py](../../examples/pooling/score/offline_using_template.py) [online_using_template.py](../../examples/pooling/score/online_using_template.py) - !!! note Load the official original `mxbai-rerank-v2` by using the following command. @@ -578,7 +580,7 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A ``` !!! note - Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/offline_reranker.py](../../examples/pooling/score/offline_reranker.py). + Load the official original `Qwen3 Reranker` by using the following command. More information can be found at: [examples/pooling/score/qwen3_reranker_offline.py](../../examples/pooling/score/qwen3_reranker_offline.py) [examples/pooling/score/qwen3_reranker_online.py](../../examples/pooling/score/qwen3_reranker_online.py). 
```bash vllm serve Qwen/Qwen3-Reranker-0.6B --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' @@ -673,11 +675,11 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `AyaVisionForConditionalGeneration` | Aya Vision | T + I+ | `CohereLabs/aya-vision-8b`, `CohereLabs/aya-vision-32b`, etc. | | ✅︎ | | `BagelForConditionalGeneration` | BAGEL | T + I+ | `ByteDance-Seed/BAGEL-7B-MoT` | ✅︎ | ✅︎ | | `BeeForConditionalGeneration` | Bee-8B | T + IE+ | `Open-Bee/Bee-8B-RL`, `Open-Bee/Bee-8B-SFT` | | ✅︎ | -| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | | ✅︎ | +| `Blip2ForConditionalGeneration` | BLIP-2 | T + IE | `Salesforce/blip2-opt-2.7b`, `Salesforce/blip2-opt-6.7b`, etc. | ✅︎ | ✅︎ | | `ChameleonForConditionalGeneration` | Chameleon | T + I | `facebook/chameleon-7b`, etc. | | ✅︎ | | `Cohere2VisionForConditionalGeneration` | Command A Vision | T + I+ | `CohereLabs/command-a-vision-07-2025`, etc. | | ✅︎ | | `DeepseekVLV2ForCausalLM`^ | DeepSeek-VL2 | T + I+ | `deepseek-ai/deepseek-vl2-tiny`, `deepseek-ai/deepseek-vl2-small`, `deepseek-ai/deepseek-vl2`, etc. | | ✅︎ | -| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I+ | `deepseek-ai/DeepSeek-OCR`, etc. | | ✅︎ | +| `DeepseekOCRForCausalLM` | DeepSeek-OCR | T + I+ | `deepseek-ai/DeepSeek-OCR`, etc. | ✅︎ | ✅︎ | | `Ernie4_5_VLMoeForConditionalGeneration` | Ernie4.5-VL | T + I+/ V+ | `baidu/ERNIE-4.5-VL-28B-A3B-PT`, `baidu/ERNIE-4.5-VL-424B-A47B-PT` | | ✅︎ | | `FuyuForCausalLM` | Fuyu | T + I | `adept/fuyu-8b`, etc. | | ✅︎ | | `Gemma3ForConditionalGeneration` | Gemma 3 | T + IE+ | `google/gemma-3-4b-it`, `google/gemma-3-27b-it`, etc. | ✅︎ | ✅︎ | @@ -699,7 +701,7 @@ These models primarily accept the [`LLM.generate`](./generative_models.md#llmgen | `LightOnOCRForConditionalGeneration` | LightOnOCR-1B | T + I+ | `lightonai/LightOnOCR-1B`, etc | ✅︎ | ✅︎ | | `Llama4ForConditionalGeneration` | Llama 4 | T + I+ | `meta-llama/Llama-4-Scout-17B-16E-Instruct`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8`, `meta-llama/Llama-4-Maverick-17B-128E-Instruct`, etc. | ✅︎ | ✅︎ | | `Llama_Nemotron_Nano_VL` | Llama Nemotron Nano VL | T + IE+ | `nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1` | ✅︎ | ✅︎ | -| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | | ✅︎ | +| `LlavaForConditionalGeneration` | LLaVA-1.5, Pixtral (HF Transformers) | T + IE+ | `llava-hf/llava-1.5-7b-hf`, `TIGER-Lab/Mantis-8B-siglip-llama3` (see note), `mistral-community/pixtral-12b`, etc. | ✅︎ | ✅︎ | | `LlavaNextForConditionalGeneration` | LLaVA-NeXT | T + IE+ | `llava-hf/llava-v1.6-mistral-7b-hf`, `llava-hf/llava-v1.6-vicuna-7b-hf`, etc. | | ✅︎ | | `LlavaNextVideoForConditionalGeneration` | LLaVA-NeXT-Video | T + V | `llava-hf/LLaVA-NeXT-Video-7B-hf`, etc. | | ✅︎ | | `LlavaOnevisionForConditionalGeneration` | LLaVA-Onevision | T + I+ + V+ | `llava-hf/llava-onevision-qwen2-7b-ov-hf`, `llava-hf/llava-onevision-qwen2-0.5b-ov-hf`, etc. | | ✅︎ | @@ -765,9 +767,6 @@ Some models are supported only via the [Transformers modeling backend](#transfor The official `openbmb/MiniCPM-V-2` doesn't work yet, so we need to use a fork (`HwwwH/MiniCPM-V-2`) for now. For more details, please see: -!!! 
note - For Qwen2.5-Omni and Qwen3-Omni, reading audio from video pre-processing (`--mm-processor-kwargs '{"use_audio_in_video": true}'`) is currently work in progress and not yet supported. - #### Transcription Speech2Text models trained specifically for Automatic Speech Recognition. diff --git a/examples/offline_inference/qwen2_5_omni/README.md b/examples/offline_inference/qwen2_5_omni/README.md index d8fb50d7fe55..409ac0223b55 100644 --- a/examples/offline_inference/qwen2_5_omni/README.md +++ b/examples/offline_inference/qwen2_5_omni/README.md @@ -10,7 +10,6 @@ python examples/offline_inference/qwen2_5_omni/only_thinker.py \ -q mixed_modalities # Read vision and audio inputs from a single video file -# NOTE: V1 engine does not support interleaved modalities yet. python examples/offline_inference/qwen2_5_omni/only_thinker.py \ -q use_audio_in_video diff --git a/examples/online_serving/openai_chat_completion_client_for_multimodal.py b/examples/online_serving/openai_chat_completion_client_for_multimodal.py index 3d1259276998..198863ae4a8b 100644 --- a/examples/online_serving/openai_chat_completion_client_for_multimodal.py +++ b/examples/online_serving/openai_chat_completion_client_for_multimodal.py @@ -21,6 +21,7 @@ """ import base64 +import os import requests from openai import OpenAI @@ -51,6 +52,16 @@ def encode_base64_content_from_url(content_url: str) -> str: return result +def encode_base64_content_from_file(file_path: str) -> str: + """Encode a local file content to base64 format.""" + + with open(file_path, "rb") as file: + file_content = file.read() + result = base64.b64encode(file_content).decode("utf-8") + + return result + + # Text-only inference def run_text_only(model: str, max_completion_tokens: int) -> None: chat_completion = client.chat.completions.create( @@ -67,6 +78,7 @@ def run_text_only(model: str, max_completion_tokens: int) -> None: def run_single_image(model: str, max_completion_tokens: int) -> None: ## Use image url in the payload image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + image_file = "/path/to/image.jpg" # local file chat_completion_from_url = client.chat.completions.create( messages=[ { @@ -87,6 +99,30 @@ def run_single_image(model: str, max_completion_tokens: int) -> None: result = chat_completion_from_url.choices[0].message.content print("Chat completion output from image url:\n", result) + ## Use local image url in the payload + # Launch the API server/engine with the --allowed-local-media-path argument. 
+ if os.path.exists(image_file): + chat_completion_from_local_image_url = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": {"url": f"file://{image_file}"}, + }, + ], + } + ], + model=model, + max_completion_tokens=max_completion_tokens, + ) + result = chat_completion_from_local_image_url.choices[0].message.content + print("Chat completion output from local image file:\n", result) + else: + print(f"Local image file not found at {image_file}, skipping local file test.") + ## Use base64 encoded image in the payload image_base64 = encode_base64_content_from_url(image_url) chat_completion_from_base64 = client.chat.completions.create( @@ -109,6 +145,33 @@ def run_single_image(model: str, max_completion_tokens: int) -> None: result = chat_completion_from_base64.choices[0].message.content print("Chat completion output from base64 encoded image:", result) + ## Use base64 encoded local image in the payload + if os.path.exists(image_file): + local_image_base64 = encode_base64_content_from_file(image_file) + chat_completion_from_local_image_base64 = client.chat.completions.create( + messages=[ + { + "role": "user", + "content": [ + {"type": "text", "text": "What's in this image?"}, + { + "type": "image_url", + "image_url": { + "url": f"data:image/jpeg;base64,{local_image_base64}" + }, + }, + ], + } + ], + model=model, + max_completion_tokens=max_completion_tokens, + ) + + result = chat_completion_from_local_image_base64.choices[0].message.content + print("Chat completion output from base64 encoded local image:", result) + else: + print(f"Local image file not found at {image_file}, skipping local file test.") + # Multi-image input inference def run_multi_image(model: str, max_completion_tokens: int) -> None: diff --git a/examples/pooling/score/convert_model_to_seq_cls.py b/examples/pooling/score/convert_model_to_seq_cls.py index 72356020330f..a3d31ceb12a7 100644 --- a/examples/pooling/score/convert_model_to_seq_cls.py +++ b/examples/pooling/score/convert_model_to_seq_cls.py @@ -2,35 +2,70 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # ruff: noqa: E501 +""" +Script to convert Large Language Models (LLMs) to Sequence Classification models. +This is particularly useful for converting reranker models that use next-token +prediction to a sequence classification format for compatibility with standard +classification and rerank pipelines. + +Usage examples: +- For BAAI/bge-reranker-v2-gemma: + python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma \ + --classifier_from_tokens '["Yes"]' --method no_post_processing \ + --path ./bge-reranker-v2-gemma-seq-cls + +- For mxbai-rerank-v2: + python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 \ + --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax \ + --path ./mxbai-rerank-base-v2-seq-cls + +- For Qwen3-Reranker: + python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B \ + --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax \ + --path ./Qwen3-Reranker-0.6B-seq-cls + +Note: For BAAI/bge-reranker-v2-gemma, "Yes" and "yes" are different tokens. 
+""" + import argparse import json import torch import transformers -# Usage: -# for BAAI/bge-reranker-v2-gemma -# Caution: "Yes" and "yes" are two different tokens -# python convert_model_to_seq_cls.py --model_name BAAI/bge-reranker-v2-gemma --classifier_from_tokens '["Yes"]' --method no_post_processing --path ./bge-reranker-v2-gemma-seq-cls -# for mxbai-rerank-v2 -# python convert_model_to_seq_cls.py --model_name mixedbread-ai/mxbai-rerank-base-v2 --classifier_from_tokens '["0", "1"]' --method from_2_way_softmax --path ./mxbai-rerank-base-v2-seq-cls -# for Qwen3-Reranker -# python convert_model_to_seq_cls.py --model_name Qwen/Qwen3-Reranker-0.6B --classifier_from_tokens '["no", "yes"]' --method from_2_way_softmax --path ./Qwen3-Reranker-0.6B-seq-cls - def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device): - # refer to https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 - assert len(tokens) == 2 + """ + This method extracts the difference between weights for 'true' and 'false' tokens + from the language model head to create a single classification weight vector. + + Args: + causal_lm: The original causal language model + seq_cls_model: The target sequence classification model + tokenizer: Model tokenizer + tokens: List of two tokens representing [false_token, true_token] + device: Target device (cpu/cuda) + + Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 + """ + assert len(tokens) == 2, ( + "Method requires exactly two tokens for binary classification" + ) + # Get the language model head weights (vocabulary_size x hidden_size) lm_head_weights = causal_lm.lm_head.weight + # Convert token strings to their corresponding token IDs false_id = tokenizer.convert_tokens_to_ids(tokens[0]) true_id = tokenizer.convert_tokens_to_ids(tokens[1]) + # Compute the classification weight as the difference between true and false token weights + # This follows the approach in: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 score_weight = lm_head_weights[true_id].to(device).to( torch.float32 ) - lm_head_weights[false_id].to(device).to(torch.float32) + # Copy the computed weights to the sequence classification model with torch.no_grad(): seq_cls_model.score.weight.copy_(score_weight.unsqueeze(0)) if seq_cls_model.score.bias is not None: @@ -38,12 +73,29 @@ def from_2_way_softmax(causal_lm, seq_cls_model, tokenizer, tokens, device): def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device): + """ + Directly use token weights from the language model head for classification. + + This method maps each classification label directly to a corresponding token + in the vocabulary without additional transformation. 
+
+    Args:
+        causal_lm: The original causal language model
+        seq_cls_model: The target sequence classification model
+        tokenizer: Model tokenizer
+        tokens: List of tokens representing class labels
+        device: Target device (cpu/cuda)
+    """
+    # Get the language model head weights (vocabulary_size x hidden_size)
     lm_head_weights = causal_lm.lm_head.weight
 
+    # Convert all tokens to their corresponding token IDs
     token_ids = [tokenizer.convert_tokens_to_ids(t) for t in tokens]
 
+    # Extract weights for the specific tokens (num_tokens x hidden_size)
     score_weight = lm_head_weights[token_ids].to(device)
 
+    # Copy the weights to the sequence classification model
     with torch.no_grad():
         seq_cls_model.score.weight.copy_(score_weight)
         if seq_cls_model.score.bias is not None:
@@ -56,21 +108,35 @@ def no_post_processing(causal_lm, seq_cls_model, tokenizer, tokens, device):
 
 
 def converting(
     model_name, classifier_from_tokens, path, method, use_pad_token=False, device="cpu"
 ):
-    assert method in method_map
-
+    """
+    Main conversion function to transform a CausalLM model to SequenceClassification.
+
+    Args:
+        model_name: Name or path of the pretrained model
+        classifier_from_tokens: List of tokens used for classification
+        path: Output path to save the converted model
+        method: Conversion method ('from_2_way_softmax' or 'no_post_processing')
+        use_pad_token: Whether to use the pad_token in the sequence classification model
+        device: Device to load the model on ('cpu' or 'cuda')
+    """
+    assert method in method_map, f"Unknown method: {method}"
+
+    # Determine number of labels based on conversion method
     if method == "from_2_way_softmax":
         assert len(classifier_from_tokens) == 2
         num_labels = 1
     else:
         num_labels = len(classifier_from_tokens)
 
+    # Load tokenizer and original causal language model
     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)
     causal_lm = transformers.AutoModelForCausalLM.from_pretrained(
         model_name, device_map=device
     )
 
+    # Load an empty sequence classification model with the same architecture
     seq_cls_model = transformers.AutoModelForSequenceClassification.from_pretrained(
         model_name,
         num_labels=num_labels,
@@ -78,14 +144,17 @@ def converting(
         device_map=device,
     )
 
+    # Apply the selected conversion method to transfer weights
     method_map[method](
         causal_lm, seq_cls_model, tokenizer, classifier_from_tokens, device
     )
 
-    # `llm as reranker` defaults to not using pad_token
-    seq_cls_model.config.use_pad_token = use_pad_token
-    seq_cls_model.config.pad_token_id = tokenizer.pad_token_id
+    # Configure pad token settings
+    # Note: `llm as reranker` defaults to not using the pad_token.
+ seq_cls_model.config.use_sep_token = use_sep_token + seq_cls_model.config.sep_token_id = tokenizer.sep_token_id + + # Save the converted model and tokenizer seq_cls_model.save_pretrained(path) tokenizer.save_pretrained(path) @@ -99,25 +168,30 @@ def parse_args(): "--model_name", type=str, default="BAAI/bge-reranker-v2-gemma", - help="Model name", + help="HuggingFace model name or local path", ) parser.add_argument( "--classifier_from_tokens", type=str, default='["Yes"]', - help="classifier from tokens", + help="JSON string of tokens used for classification labels", ) parser.add_argument( - "--method", type=str, default="no_post_processing", help="Converting converting" + "--method", + type=str, + default="no_post_processing", + help="Conversion method to use", ) parser.add_argument( - "--use-pad-token", action="store_true", help="Whether to use pad_token" + "--use-sep-token", + action="store_true", + help="Enable separating token in the sequence classification model", ) parser.add_argument( "--path", type=str, default="./bge-reranker-v2-gemma-seq-cls", - help="Path to save converted model", + help="Output directory to save the converted model", ) return parser.parse_args() @@ -129,6 +203,6 @@ def parse_args(): model_name=args.model_name, classifier_from_tokens=json.loads(args.classifier_from_tokens), method=args.method, - use_pad_token=args.use_pad_token, + use_sep_token=args.use_sep_token, path=args.path, ) diff --git a/examples/pooling/score/offline_reranker.py b/examples/pooling/score/offline_reranker.py deleted file mode 100644 index 7bc48277f551..000000000000 --- a/examples/pooling/score/offline_reranker.py +++ /dev/null @@ -1,89 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: E501 - -from vllm import LLM - -model_name = "Qwen/Qwen3-Reranker-0.6B" - -# What is the difference between the official original version and one -# that has been converted into a sequence classification model? -# Qwen3-Reranker is a language model that doing reranker by using the -# logits of "no" and "yes" tokens. -# It needs to computing 151669 tokens logits, making this method extremely -# inefficient, not to mention incompatible with the vllm score API. -# A method for converting the original model into a sequence classification -# model was proposed. See:https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 -# Models converted offline using this method can not only be more efficient -# and support the vllm score API, but also make the init parameters more -# concise, for example. -# llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling") - -# If you want to load the official original version, the init parameters are -# as follows. - - -def get_llm() -> LLM: - """Initializes and returns the LLM model for Qwen3-Reranker.""" - return LLM( - model=model_name, - runner="pooling", - hf_overrides={ - "architectures": ["Qwen3ForSequenceClassification"], - "classifier_from_token": ["no", "yes"], - "is_original_qwen3_reranker": True, - }, - ) - - -# Why do we need hf_overrides for the official original version: -# vllm converts it to Qwen3ForSequenceClassification when loaded for -# better performance. -# - Firstly, we need using `"architectures": ["Qwen3ForSequenceClassification"],` -# to manually route to Qwen3ForSequenceClassification. -# - Then, we will extract the vector corresponding to classifier_from_token -# from lm_head using `"classifier_from_token": ["no", "yes"]`.
-# - Third, we will convert these two vectors into one vector. The use of -# conversion logic is controlled by `using "is_original_qwen3_reranker": True`. - -# Please use the query_template and document_template to format the query and -# document for better reranker results. - -prefix = '<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>\n<|im_start|>user\n' -suffix = "<|im_end|>\n<|im_start|>assistant\n\n\n\n\n" - -query_template = "{prefix}: {instruction}\n: {query}\n" -document_template = ": {doc}{suffix}" - - -def main() -> None: - instruction = ( - "Given a web search query, retrieve relevant passages that answer the query" - ) - - queries = [ - "What is the capital of China?", - "Explain gravity", - ] - - documents = [ - "The capital of China is Beijing.", - "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.", - ] - - queries = [ - query_template.format(prefix=prefix, instruction=instruction, query=query) - for query in queries - ] - documents = [document_template.format(doc=doc, suffix=suffix) for doc in documents] - - llm = get_llm() - outputs = llm.score(queries, documents) - - print("-" * 30) - print([output.outputs.score for output in outputs]) - print("-" * 30) - - -if __name__ == "__main__": - main() diff --git a/examples/pooling/score/offline_using_template.py b/examples/pooling/score/offline_using_template.py deleted file mode 100644 index 427cbaab6fbc..000000000000 --- a/examples/pooling/score/offline_using_template.py +++ /dev/null @@ -1,27 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: E501 -from pathlib import Path - -from vllm import LLM - -model_name = "nvidia/llama-nemotron-rerank-1b-v2" - -# Path to template file -template_path = Path(__file__).parent / "template" / "nemotron-rerank.jinja" -chat_template = template_path.read_text() - -llm = LLM(model=model_name, runner="pooling", trust_remote_code=True) - -query = "how much protein should a female eat?" -documents = [ - "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", - "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.", - "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.", -] - -outputs = llm.score(query, documents, chat_template=chat_template) - -print("-" * 30) -print([output.outputs.score for output in outputs]) -print("-" * 30) diff --git a/examples/pooling/score/online_using_template.py b/examples/pooling/score/online_using_template.py deleted file mode 100644 index 66b22e0a9563..000000000000 --- a/examples/pooling/score/online_using_template.py +++ /dev/null @@ -1,46 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -# ruff: noqa: E501 -""" -Example of using the rerank API with template. 
- -run: - vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja -""" - -import json - -import requests - -url = "http://127.0.0.1:8000/rerank" - -headers = {"accept": "application/json", "Content-Type": "application/json"} - -query = "how much protein should a female eat?" -documents = [ - "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", - "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.", - "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.", -] - -data = { - "model": "nvidia/llama-nemotron-rerank-1b-v2", - "query": query, - "documents": documents, -} - - -def main(): - response = requests.post(url, headers=headers, json=data) - - # Check the response - if response.status_code == 200: - print("Request successful!") - print(json.dumps(response.json(), indent=2)) - else: - print(f"Request failed with status code: {response.status_code}") - print(response.text) - - -if __name__ == "__main__": - main() diff --git a/examples/pooling/score/qwen3_reranker_offline.py b/examples/pooling/score/qwen3_reranker_offline.py new file mode 100644 index 000000000000..c79ebf97fe64 --- /dev/null +++ b/examples/pooling/score/qwen3_reranker_offline.py @@ -0,0 +1,104 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 + +""" +What is the difference between the official original version and one +that has been converted into a sequence classification model? + +Qwen3-Reranker is a language model that doing reranker by using the +logits of "no" and "yes" tokens. +This requires computing logits for all 151,669 tokens in the vocabulary, +making it inefficient and incompatible with vLLM's score() API. + +A conversion method has been proposed to transform the original model into a +sequence classification model. This converted model: +1. Is significantly more efficient +2. Fully supports vLLM's score() API +3. Simplifies initialization parameters +Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 +Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py + +For the converted model, initialization would simply be: +llm = LLM(model="tomaarsen/Qwen3-Reranker-0.6B-seq-cls", runner="pooling") + +This example demonstrates loading the ORIGINAL model with special overrides +to make it compatible with vLLM's score API. +""" + +from pathlib import Path + +from vllm import LLM + +model_name = "Qwen/Qwen3-Reranker-0.6B" + + +def get_llm() -> LLM: + """ + Initializes and returns the LLM model for Qwen3-Reranker. + + Returns: + LLM: Configured vLLM instance for reranking tasks. + + Note: + This function loads the ORIGINAL Qwen3-Reranker model with specific + overrides to make it compatible with vLLM's score API. 
+ """ + return LLM( + # Specify the original model from HuggingFace + model=model_name, + # Use pooling runner for score task + runner="pooling", + # HuggingFace model configuration overrides required for compatibility + hf_overrides={ + # Manually route to sequence classification architecture + # This tells vLLM to use Qwen3ForSequenceClassification instead of + # the default Qwen3ForCausalLM + "architectures": ["Qwen3ForSequenceClassification"], + # Specify which token logits to extract from the language model head + # The original reranker uses "no" and "yes" token logits for scoring + "classifier_from_token": ["no", "yes"], + # Enable special handling for original Qwen3-Reranker models + # This flag triggers conversion logic that transforms the two token + # vectors into a single classification vector + "is_original_qwen3_reranker": True, + }, + ) + + +def main() -> None: + # Load the Jinja template for formatting query-document pairs + # The template ensures proper formatting for the reranker model + template_home = Path(__file__).parent / "template" + template_path = "qwen3_reranker.jinja" + chat_template = (template_home / template_path).read_text() + + # Sample queries for testing the reranker + queries = [ + "What is the capital of China?", + "Explain gravity", + ] + + # Corresponding documents to be scored against each query + documents = [ + "The capital of China is Beijing.", + "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.", + ] + + # Initialize the LLM model with the original Qwen3-Reranker configuration + llm = get_llm() + + # Compute relevance scores for each query-document pair + # The score() method returns a relevance score for each pair + # Higher scores indicate better relevance + outputs = llm.score(queries, documents, chat_template=chat_template) + + # Extract and print the relevance scores from the outputs + # Each output contains a score representing query-document relevance + print("-" * 30) + print("Relevance scores:", [output.outputs.score for output in outputs]) + print("-" * 30) + + +if __name__ == "__main__": + main() diff --git a/examples/pooling/score/qwen3_reranker_online.py b/examples/pooling/score/qwen3_reranker_online.py new file mode 100644 index 000000000000..441c1709dc52 --- /dev/null +++ b/examples/pooling/score/qwen3_reranker_online.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +""" +What is the difference between the official original version and one +that has been converted into a sequence classification model? + +Qwen3-Reranker is a language model that doing reranker by using the +logits of "no" and "yes" tokens. +This requires computing logits for all 151,669 tokens in the vocabulary, +making it inefficient and incompatible with vLLM's score() API. + +A conversion method has been proposed to transform the original model into a +sequence classification model. This converted model: +1. Is significantly more efficient +2. Fully supports vLLM's score() API +3. 
Simplifies initialization parameters +Reference: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B/discussions/3 +Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py + +For the converted model, initialization would simply be: + vllm serve tomaarsen/Qwen3-Reranker-0.6B-seq-cls --runner pooling --chat-template examples/pooling/score/template/qwen3_reranker.jinja + +This example demonstrates loading the ORIGINAL model with special overrides +to make it compatible with vLLM's score API. + vllm serve Qwen/Qwen3-Reranker-0.6B --runner pooling --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' --chat-template examples/pooling/score/template/qwen3_reranker.jinja +""" + +import json + +import requests + +# URL of the vLLM server's score endpoint +# Default vLLM server runs on localhost port 8000 +url = "http://127.0.0.1:8000/score" + +# HTTP headers for the request +headers = {"accept": "application/json", "Content-Type": "application/json"} + +# Example queries & documents +queries = [ + "What is the capital of China?", + "Explain gravity", +] +documents = [ + "The capital of China is Beijing.", + "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun.", +] + +# Request payload for the score API +data = { + "model": "Qwen/Qwen3-Reranker-0.6B", + "text_1": queries, + "text_2": documents, +} + + +def main(): + """Main function to send a score request to the vLLM server. + + This function sends a POST request to the /score endpoint with + the query and documents, then prints the relevance scores. + """ + # Send POST request to the vLLM server's score endpoint + response = requests.post(url, headers=headers, json=data) + + # Check if the request was successful + if response.status_code == 200: + print("Request successful!") + # Pretty print the JSON response containing relevance scores + # The response includes scores for each document's relevance to the query + print(json.dumps(response.json(), indent=2)) + else: + # Handle request failure + print(f"Request failed with status code: {response.status_code}") + print(response.text) + + +if __name__ == "__main__": + main() diff --git a/examples/pooling/score/template/bge-reranker-v2-gemma.jinja b/examples/pooling/score/template/bge-reranker-v2-gemma.jinja new file mode 100644 index 000000000000..cdc83aeab6cb --- /dev/null +++ b/examples/pooling/score/template/bge-reranker-v2-gemma.jinja @@ -0,0 +1,3 @@ +A: {{ (messages | selectattr("role", "eq", "query") | first).content }} +B: {{ (messages | selectattr("role", "eq", "document") | first).content }} +Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'. \ No newline at end of file diff --git a/examples/pooling/score/template/mxbai_rerank_v2.jinja b/examples/pooling/score/template/mxbai_rerank_v2.jinja new file mode 100644 index 000000000000..32488c48b3af --- /dev/null +++ b/examples/pooling/score/template/mxbai_rerank_v2.jinja @@ -0,0 +1,8 @@ +<|im_start|>system +You are Qwen, created by Alibaba Cloud. 
You are a helpful assistant.<|im_end|> +<|im_start|>user +query: {{ (messages | selectattr("role", "eq", "query") | first).content }} +document: {{ (messages | selectattr("role", "eq", "document") | first).content }} +You are a search relevance expert who evaluates how well documents match search queries. For each query-document pair, carefully analyze the semantic relationship between them, then provide your binary relevance judgment (0 for not relevant, 1 for relevant). +Relevance:<|im_end|> +<|im_start|>assistant diff --git a/examples/pooling/score/template/qwen3_reranker.jinja b/examples/pooling/score/template/qwen3_reranker.jinja new file mode 100644 index 000000000000..f33f526dc054 --- /dev/null +++ b/examples/pooling/score/template/qwen3_reranker.jinja @@ -0,0 +1,11 @@ +<|im_start|>system +Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|> +<|im_start|>user +<Instruct>: {{ messages | selectattr("role", "eq", "system") | map(attribute="content") | first | default("Given a web search query, retrieve relevant passages that answer the query") }} +<Query>: {{ messages | selectattr("role", "eq", "query") | map(attribute="content") | first }} +<Document>: {{ messages | selectattr("role", "eq", "document") | map(attribute="content") | first }}<|im_end|> +<|im_start|>assistant +<think> + +</think> + diff --git a/examples/pooling/score/using_template_offline.py b/examples/pooling/score/using_template_offline.py new file mode 100644 index 000000000000..f434e699ff0e --- /dev/null +++ b/examples/pooling/score/using_template_offline.py @@ -0,0 +1,159 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +from argparse import Namespace +from pathlib import Path +from typing import Any + +from vllm import LLM, EngineArgs +from vllm.utils.argparse_utils import FlexibleArgumentParser + + +def parse_args(): + """Parse command line arguments for the reranking example. + + This function sets up the argument parser with default values + specific to reranking models, including the model name and + runner type. + """ + parser = FlexibleArgumentParser() + # Add all EngineArgs command line arguments to the parser + parser = EngineArgs.add_cli_args(parser) + + # Set default values specific to this reranking example + # These defaults ensure the script works out-of-the-box for reranking tasks + parser.set_defaults( + model="nvidia/llama-nemotron-rerank-1b-v2", # Default reranking model + runner="pooling", # Required for cross-encoder/reranking models + trust_remote_code=True, # Allow loading models with custom code + ) + return parser.parse_args() + + +def get_chat_template(model: str) -> str: + """Load the appropriate chat template for the specified model. + + Reranking models require specific prompt templates to format + query-document pairs correctly. This function maps model names + to their corresponding template files.
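The templates above locate the two sides of each pair through the pseudo-roles "query" and "document" (plus an optional "system" instruction). A standalone sketch of how such a template is rendered with plain jinja2; the hand-built message list mirrors what the templates select on and is an assumption for illustration, not code taken from vLLM internals:

import jinja2

# Abbreviated template in the same style as mxbai_rerank_v2.jinja above
template_text = (
    "query: {{ (messages | selectattr('role', 'eq', 'query') | first).content }}\n"
    "document: {{ (messages | selectattr('role', 'eq', 'document') | first).content }}\n"
)

messages = [
    {"role": "query", "content": "What is the capital of China?"},
    {"role": "document", "content": "The capital of China is Beijing."},
]

print(jinja2.Template(template_text).render(messages=messages))
# query: What is the capital of China?
# document: The capital of China is Beijing.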
+ """ + # Directory containing all chat template files + template_home = Path(__file__).parent / "template" + + # Mapping from model names to their corresponding template files + # Each reranking model has its own specific prompt format + model_name_to_template_path_map = { + "BAAI/bge-reranker-v2-gemma": "bge-reranker-v2-gemma.jinja", + "Qwen/Qwen3-Reranker-0.6B": "qwen3_reranker.jinja", + "Qwen/Qwen3-Reranker-4B": "qwen3_reranker.jinja", + "Qwen/Qwen3-Reranker-8B": "qwen3_reranker.jinja", + "tomaarsen/Qwen3-Reranker-0.6B-seq-cls": "qwen3_reranker.jinja", + "tomaarsen/Qwen3-Reranker-4B-seq-cls": "qwen3_reranker.jinja", + "tomaarsen/Qwen3-Reranker-8B-seq-cls": "qwen3_reranker.jinja", + "mixedbread-ai/mxbai-rerank-base-v2": "mxbai_rerank_v2.jinja", + "mixedbread-ai/mxbai-rerank-large-v2": "mxbai_rerank_v2.jinja", + "nvidia/llama-nemotron-rerank-1b-v2": "nemotron-rerank.jinja", + } + + # Get the template filename for the specified model + template_path = model_name_to_template_path_map.get(model) + + if template_path is None: + raise ValueError(f"This demo does not support model name: {model}.") + + # Read and return the template content + return (template_home / template_path).read_text() + + +def get_hf_overrides(model: str) -> dict[str, Any]: + """Convert Large Language Models (LLMs) to Sequence Classification models. + + note: + Some reranking models require special configuration overrides to work + correctly with vLLM's score API. + Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/qwen3_reranker_offline.py + Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py + """ + + model_name_to_hf_overrides_map = { + "BAAI/bge-reranker-v2-gemma": { + "architectures": ["GemmaForSequenceClassification"], + "classifier_from_token": ["Yes"], + "method": "no_post_processing", + }, + "Qwen/Qwen3-Reranker-0.6B": { + "architectures": ["Qwen3ForSequenceClassification"], + "classifier_from_token": ["no", "yes"], + "is_original_qwen3_reranker": True, + }, + "Qwen/Qwen3-Reranker-4B": { + "architectures": ["Qwen3ForSequenceClassification"], + "classifier_from_token": ["no", "yes"], + "is_original_qwen3_reranker": True, + }, + "Qwen/Qwen3-Reranker-8B": { + "architectures": ["Qwen3ForSequenceClassification"], + "classifier_from_token": ["no", "yes"], + "is_original_qwen3_reranker": True, + }, + "tomaarsen/Qwen3-Reranker-0.6B-seq-cls": {}, + "tomaarsen/Qwen3-Reranker-4B-seq-cls": {}, + "tomaarsen/Qwen3-Reranker-8B-seq-cls": {}, + "mixedbread-ai/mxbai-rerank-base-v2": { + "architectures": ["Qwen2ForSequenceClassification"], + "classifier_from_token": ["0", "1"], + "method": "from_2_way_softmax", + }, + "mixedbread-ai/mxbai-rerank-large-v2": { + "architectures": ["Qwen2ForSequenceClassification"], + "classifier_from_token": ["0", "1"], + "method": "from_2_way_softmax", + }, + "nvidia/llama-nemotron-rerank-1b-v2": {}, + } + + hf_overrides = model_name_to_hf_overrides_map.get(model) + + if hf_overrides is None: + raise ValueError(f"This demo does not support model name: {model}.") + + return hf_overrides + + +def main(args: Namespace): + """Main execution function for the reranking example.""" + + # Get the overrides for the specified model + args.hf_overrides = get_hf_overrides(args.model) + + # Initialize the LLM with all provided arguments + llm = LLM(**vars(args)) + + # Example query for demonstration + query = "how much protein should a female eat?" 
+ + # Example documents to be reranked based on relevance to the query + documents = [ + "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", + "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.", + "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.", + ] + + # Load the appropriate chat template for the selected model + # The template formats query-document pairs for the reranking model + chat_template = get_chat_template(args.model) + + # Score documents based on relevance to the query + # The score method returns relevance scores for each document + outputs = llm.score(query, documents, chat_template=chat_template) + + # Display the relevance scores + # Higher scores indicate more relevant documents + print("-" * 30) + print([output.outputs.score for output in outputs]) + print("-" * 30) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/pooling/score/using_template_online.py b/examples/pooling/score/using_template_online.py new file mode 100644 index 000000000000..f0bfa7d15769 --- /dev/null +++ b/examples/pooling/score/using_template_online.py @@ -0,0 +1,75 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +""" +Example of using the rerank API with template. + +This script demonstrates how to interact with a vLLM server running +a reranking model via the REST API. +Before running this script, start the vLLM server with one of the +supported reranking models using the commands below. + +note: + Some reranking models require special configuration overrides to work correctly + with vLLM's score API. 
+ Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/qwen3_reranker_online.py + Reference: https://github.com/vllm-project/vllm/blob/main/examples/pooling/score/convert_model_to_seq_cls.py + +run: + vllm serve BAAI/bge-reranker-v2-gemma --hf_overrides '{"architectures": ["GemmaForSequenceClassification"],"classifier_from_token": ["Yes"],"method": "no_post_processing"}' --chat-template examples/pooling/score/template/bge-reranker-v2-gemma.jinja + vllm serve tomaarsen/Qwen3-Reranker-0.6B-seq-cls --chat-template examples/pooling/score/template/qwen3_reranker.jinja + vllm serve mixedbread-ai/mxbai-rerank-base-v2 --hf_overrides '{"architectures": ["Qwen2ForSequenceClassification"],"classifier_from_token": ["0", "1"], "method": "from_2_way_softmax"}' --chat-template examples/pooling/score/template/mxbai_rerank_v2.jinja + vllm serve nvidia/llama-nemotron-rerank-1b-v2 --runner pooling --trust-remote-code --chat-template examples/pooling/score/template/nemotron-rerank.jinja + vllm serve Qwen/Qwen3-Reranker-0.6B --runner pooling --hf_overrides '{"architectures": ["Qwen3ForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' --chat-template examples/pooling/score/template/qwen3_reranker.jinja +""" + +import json + +import requests + +# URL of the vLLM server's rerank endpoint +# Default vLLM server runs on localhost port 8000 +url = "http://127.0.0.1:8000/rerank" + +# HTTP headers for the request +headers = {"accept": "application/json", "Content-Type": "application/json"} + +# Example query & documents +query = "how much protein should a female eat?" +documents = [ + "As a general guideline, the CDC's average requirement of protein for women ages 19 to 70 is 46 grams per day. But, as you can see from this chart, you'll need to increase that if you're expecting or training for a marathon. Check out the chart below to see how much protein you should be eating each day.", + "Definition of summit for English Language Learners. : 1 the highest point of a mountain : the top of a mountain. : 2 the highest level. : 3 a meeting or series of meetings between the leaders of two or more governments.", + "Calorie intake should not fall below 1,200 a day in women or 1,500 a day in men, except under the supervision of a health professional.", +] + +# Request payload for the rerank API +data = { + "model": "nvidia/llama-nemotron-rerank-1b-v2", # Model to use for reranking + "query": query, # The query to score documents against + "documents": documents, # List of documents to be scored +} + + +def main(): + """Main function to send a rerank request to the vLLM server. + + This function sends a POST request to the /rerank endpoint with + the query and documents, then prints the relevance scores. 
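Once the server responds, the scores can be pulled out of the JSON body. A hedged sketch of that post-processing; the "results", "index" and "relevance_score" field names are assumed to follow the Jina-style rerank schema the endpoint mirrors, so check the actual response if they differ:

def top_documents(
    response_json: dict, docs: list[str], k: int = 2
) -> list[tuple[float, str]]:
    # Assumed schema: results[*].index and results[*].relevance_score
    results = response_json.get("results", [])
    ranked = sorted(results, key=lambda r: r["relevance_score"], reverse=True)
    return [(r["relevance_score"], docs[r["index"]]) for r in ranked[:k]]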
+ """ + # Send POST request to the vLLM server's rerank endpoint + response = requests.post(url, headers=headers, json=data) + + # Check if the request was successful + if response.status_code == 200: + print("Request successful!") + # Pretty print the JSON response containing relevance scores + # The response includes scores for each document's relevance to the query + print(json.dumps(response.json(), indent=2)) + else: + # Handle request failure + print(f"Request failed with status code: {response.status_code}") + print(response.text) + + +if __name__ == "__main__": + main() diff --git a/examples/tool_chat_template_glm4.jinja b/examples/tool_chat_template_glm4.jinja new file mode 100644 index 000000000000..11f76b4d4af4 --- /dev/null +++ b/examples/tool_chat_template_glm4.jinja @@ -0,0 +1,54 @@ +{%- set counter = namespace(index=0) -%} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{%- if messages and messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful assistant." %} +{%- endif %} + +{%- if tools is not none %} + {%- set tool_instruction %} +You have access to the following tools. When you need to call a tool, you MUST use the following format: + +function_name +parameter_name +parameter_value + + +Important rules: +- Always wrap tool calls with ... tags +- Put the function name on the first line after +- Use and tags for each parameter +- If a parameter value is a string, keep it as-is. If it's a number or boolean, convert it appropriately +- You can make multiple tool calls if needed +- If no tool is suitable, respond with regular text + +Available tools: +{% endset %} + {{- tool_instruction + "\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} + +{%- for message in messages -%} + {%- if message['role'] == 'user' -%} + {{- '[Round ' + counter.index|string + ']\n问:' + message['content'] -}} + {%- set counter.index = counter.index + 1 -%} + {%- endif -%} + {%- if message['role'] == 'assistant' -%} + {{- '\n答:' + message['content'] -}} + {%- if (loop.last and add_generation_prompt) or not loop.last -%} + {{- '\n' -}} + {%- endif -%} + {%- endif -%} +{%- endfor -%} + +{%- if add_generation_prompt and messages[-1]['role'] != 'assistant' -%} + {{- '\n答:' -}} +{%- endif -%} diff --git a/requirements/nightly_torch_test.txt b/requirements/nightly_torch_test.txt index a8c22f8e6946..72fa1369249e 100644 --- a/requirements/nightly_torch_test.txt +++ b/requirements/nightly_torch_test.txt @@ -17,7 +17,7 @@ vocos # required for minicpmo_26 test peft pqdm ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests -sentence-transformers # required for embedding tests +sentence-transformers>=5.2.0 # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests timm # required for internvl test diff --git a/requirements/test.in b/requirements/test.in index 2aa36a5d32c0..5fc405a63672 100644 --- a/requirements/test.in +++ b/requirements/test.in @@ -9,6 +9,7 @@ pytest-timeout pytest-cov # testing utils +albumentations # required for Nemotron Parse in test_common.py backoff # required for phi4mm test blobfile # required for kimi-vl test einops # required for MPT, qwen-vl @@ -19,11 +20,11 @@ vocos # required for minicpmo_26 test peft>=0.15.0 # required for phi-4-mm test pqdm ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, 
required by pipeline parallelism tests -sentence-transformers # required for embedding tests +sentence-transformers>=5.2.0 # required for embedding tests soundfile # required for audio tests jiwer # required for audio tests tblib # for pickling test exceptions -timm >=1.0.17 # required for internvl and gemma3n-mm test +timm==1.0.17 # required for internvl and gemma3n-mm test torch==2.9.1 torchaudio==2.9.1 torchvision==0.24.1 @@ -31,7 +32,7 @@ transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test mistral_common[image,audio] >= 1.8.8 # required for voxtral test num2words # required for smolvlm test -open_clip_torch==2.32.0 # Required for nemotron_vl test +open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py opencv-python-headless >= 4.11.0 # required for video test datamodel_code_generator # required for minicpm3 test lm-eval[api]>=0.4.9.2 # required for model evaluation test diff --git a/requirements/test.txt b/requirements/test.txt index e734df5e1bff..41882da9d31f 100644 --- a/requirements/test.txt +++ b/requirements/test.txt @@ -27,7 +27,9 @@ aiosignal==1.4.0 albucore==0.0.16 # via terratorch albumentations==1.4.6 - # via terratorch + # via + # -r requirements/test.in + # terratorch alembic==1.16.4 # via mlflow annotated-types==0.7.0 @@ -723,7 +725,6 @@ pillow==10.4.0 # perceptron # scikit-image # segmentation-models-pytorch - # sentence-transformers # torchgeo # torchvision platformdirs==4.3.6 @@ -1023,7 +1024,7 @@ segmentation-models-pytorch==0.4.0 # via # terratorch # torchgeo -sentence-transformers==3.2.1 +sentence-transformers==5.2.0 # via # -r requirements/test.in # mteb @@ -1263,6 +1264,7 @@ typing-extensions==4.15.0 # pydantic-core # pydantic-extra-types # pytorch-lightning + # sentence-transformers # sqlalchemy # torch # torchgeo diff --git a/tests/compile/distributed/test_async_tp.py b/tests/compile/distributed/test_async_tp.py index 2eb18e25c98b..e02f038b4edf 100644 --- a/tests/compile/distributed/test_async_tp.py +++ b/tests/compile/distributed/test_async_tp.py @@ -26,6 +26,7 @@ ) from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed from ...models.registry import HF_EXAMPLE_MODELS from ...utils import ( @@ -301,7 +302,7 @@ def async_tp_pass_on_test_model( dtype: torch.dtype, dynamic: bool, ): - current_platform.seed_everything(0) + set_random_seed(0) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/compile/distributed/test_fusion_all_reduce.py b/tests/compile/distributed/test_fusion_all_reduce.py index fc8d1f98ebf8..d0a194c2b044 100644 --- a/tests/compile/distributed/test_fusion_all_reduce.py +++ b/tests/compile/distributed/test_fusion_all_reduce.py @@ -32,6 +32,7 @@ ) from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed from ...utils import has_module_attribute, multi_gpu_test from ..backend import TestBackend @@ -263,7 +264,7 @@ def all_reduce_fusion_pass_on_test_model( enable_rms_norm_custom_op, enable_quant_fp8_custom_op, ): - current_platform.seed_everything(0) + set_random_seed(0) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/compile/distributed/test_sequence_parallelism.py b/tests/compile/distributed/test_sequence_parallelism.py index d9fdc3acc3d6..35916ba99652 100644 --- 
a/tests/compile/distributed/test_sequence_parallelism.py +++ b/tests/compile/distributed/test_sequence_parallelism.py @@ -31,6 +31,7 @@ from vllm.model_executor.layers.quantization.utils.w8a8_utils import Fp8LinearOp from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed from ...utils import multi_gpu_test from ..backend import TestBackend @@ -232,7 +233,7 @@ def sequence_parallelism_pass_on_test_model( fuse_norm_quant: bool, dynamic: bool, ): - current_platform.seed_everything(0) + set_random_seed(0) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/config/base_model_arch_groundtruth.json b/tests/config/base_model_arch_groundtruth.json new file mode 100644 index 000000000000..3401198ad7d5 --- /dev/null +++ b/tests/config/base_model_arch_groundtruth.json @@ -0,0 +1,359 @@ +{ + "state-spaces/mamba-130m-hf": { + "architectures": [ + "MambaForCausalLM" + ], + "model_type": "mamba", + "text_model_type": "mamba", + "hidden_size": 768, + "total_num_hidden_layers": 24, + "total_num_attention_heads": 0, + "head_size": 0, + "vocab_size": 50280, + "total_num_kv_heads": 0, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.float32" + }, + "mistralai/Mamba-Codestral-7B-v0.1": { + "architectures": [ + "Mamba2ForCausalLM" + ], + "model_type": "mamba", + "text_model_type": "mamba", + "hidden_size": 4096, + "total_num_hidden_layers": 64, + "total_num_attention_heads": 0, + "head_size": 0, + "vocab_size": 32768, + "total_num_kv_heads": 0, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11": { + "architectures": [ + "Terratorch" + ], + "model_type": "timm_wrapper", + "text_model_type": "timm_wrapper", + "hidden_size": 0, + "total_num_hidden_layers": 0, + "total_num_attention_heads": 0, + "head_size": 0, + "vocab_size": 0, + "total_num_kv_heads": 0, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": true, + "dtype": "torch.float32" + }, + "tiiuae/falcon-mamba-7b-instruct": { + "architectures": [ + "FalconMambaForCausalLM" + ], + "model_type": "falcon_mamba", + "text_model_type": "falcon_mamba", + "hidden_size": 4096, + "total_num_hidden_layers": 64, + "total_num_attention_heads": 0, + "head_size": 0, + "vocab_size": 65024, + "total_num_kv_heads": 0, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "Zyphra/Zamba2-7B-instruct": { + "architectures": [ + "Zamba2ForCausalLM" + ], + "model_type": "zamba2", + "text_model_type": "zamba2", + "hidden_size": 3584, + "total_num_hidden_layers": 81, + "total_num_attention_heads": 32, + "head_size": 224, + "vocab_size": 32000, + "total_num_kv_heads": 32, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "mosaicml/mpt-7b": { + "architectures": [ + "MPTForCausalLM" + ], + "model_type": "mpt", + "text_model_type": "mpt", + "hidden_size": 4096, + "total_num_hidden_layers": 32, + "total_num_attention_heads": 32, + "head_size": 128, + "vocab_size": 50432, + "total_num_kv_heads": 32, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "databricks/dbrx-instruct": { + "architectures": [ + "DbrxForCausalLM" + ], + "model_type": "dbrx", + "text_model_type": 
"dbrx", + "hidden_size": 6144, + "total_num_hidden_layers": 40, + "total_num_attention_heads": 48, + "head_size": 128, + "vocab_size": 100352, + "total_num_kv_heads": 8, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "tiiuae/falcon-7b": { + "architectures": [ + "FalconForCausalLM" + ], + "model_type": "falcon", + "text_model_type": "falcon", + "hidden_size": 4544, + "total_num_hidden_layers": 32, + "total_num_attention_heads": 71, + "head_size": 64, + "vocab_size": 65024, + "total_num_kv_heads": 1, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "tiiuae/falcon-40b": { + "architectures": [ + "FalconForCausalLM" + ], + "model_type": "falcon", + "text_model_type": "falcon", + "hidden_size": 8192, + "total_num_hidden_layers": 60, + "total_num_attention_heads": 128, + "head_size": 64, + "vocab_size": 65024, + "total_num_kv_heads": 8, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "luccafong/deepseek_mtp_main_random": { + "architectures": [ + "DeepseekV3ForCausalLM" + ], + "model_type": "deepseek_v3", + "text_model_type": "deepseek_v3", + "hidden_size": 2560, + "total_num_hidden_layers": 5, + "total_num_attention_heads": 32, + "head_size": 576, + "vocab_size": 129280, + "total_num_kv_heads": 32, + "num_experts": 72, + "is_deepseek_mla": true, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "luccafong/deepseek_mtp_draft_random": { + "architectures": [ + "DeepseekV3ForCausalLM" + ], + "model_type": "deepseek_v3", + "text_model_type": "deepseek_v3", + "hidden_size": 2560, + "total_num_hidden_layers": 10, + "total_num_attention_heads": 32, + "head_size": 576, + "vocab_size": 129280, + "total_num_kv_heads": 32, + "num_experts": 72, + "is_deepseek_mla": true, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "Qwen/Qwen3-Next-80B-A3B-Instruct": { + "architectures": [ + "Qwen3NextForCausalLM" + ], + "model_type": "qwen3_next", + "text_model_type": "qwen3_next", + "hidden_size": 2048, + "total_num_hidden_layers": 48, + "total_num_attention_heads": 16, + "head_size": 256, + "vocab_size": 151936, + "total_num_kv_heads": 2, + "num_experts": 512, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "tiny-random/qwen3-next-moe": { + "architectures": [ + "Qwen3NextForCausalLM" + ], + "model_type": "qwen3_next", + "text_model_type": "qwen3_next", + "hidden_size": 8, + "total_num_hidden_layers": 4, + "total_num_attention_heads": 16, + "head_size": 32, + "vocab_size": 151936, + "total_num_kv_heads": 8, + "num_experts": 32, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "zai-org/GLM-4.5": { + "architectures": [ + "Glm4MoeForCausalLM" + ], + "model_type": "glm4_moe", + "text_model_type": "glm4_moe", + "hidden_size": 5120, + "total_num_hidden_layers": 92, + "total_num_attention_heads": 96, + "head_size": 128, + "vocab_size": 151552, + "total_num_kv_heads": 8, + "num_experts": 160, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "baidu/ERNIE-4.5-21B-A3B-PT": { + "architectures": [ + "Ernie4_5_MoeForCausalLM" + ], + "model_type": "ernie4_5_moe", + "text_model_type": "ernie4_5_moe", + "hidden_size": 2560, + "total_num_hidden_layers": 28, + "total_num_attention_heads": 20, + "head_size": 128, + "vocab_size": 103424, + "total_num_kv_heads": 4, + 
"num_experts": 64, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "lmsys/gpt-oss-20b-bf16": { + "architectures": [ + "GptOssForCausalLM" + ], + "model_type": "gpt_oss", + "text_model_type": "gpt_oss", + "hidden_size": 2880, + "total_num_hidden_layers": 24, + "total_num_attention_heads": 64, + "head_size": 64, + "vocab_size": 201088, + "total_num_kv_heads": 8, + "num_experts": 32, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "deepseek-ai/DeepSeek-V3.2-Exp": { + "architectures": [ + "DeepseekV32ForCausalLM" + ], + "model_type": "deepseek_v32", + "text_model_type": "deepseek_v32", + "hidden_size": 7168, + "total_num_hidden_layers": 61, + "total_num_attention_heads": 128, + "head_size": 576, + "vocab_size": 129280, + "total_num_kv_heads": 128, + "num_experts": 256, + "is_deepseek_mla": true, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "meta-llama/Llama-4-Scout-17B-16E-Instruct": { + "architectures": [ + "Llama4ForConditionalGeneration" + ], + "model_type": "llama4", + "text_model_type": "llama4_text", + "hidden_size": 5120, + "total_num_hidden_layers": 48, + "total_num_attention_heads": 40, + "head_size": 128, + "vocab_size": 202048, + "total_num_kv_heads": 8, + "num_experts": 16, + "is_deepseek_mla": false, + "is_multimodal_model": true, + "dtype": "torch.bfloat16" + }, + "nvidia/Llama-3_3-Nemotron-Super-49B-v1": { + "architectures": [ + "DeciLMForCausalLM" + ], + "model_type": "nemotron-nas", + "text_model_type": "nemotron-nas", + "hidden_size": 8192, + "total_num_hidden_layers": 80, + "total_num_attention_heads": 64, + "head_size": 128, + "vocab_size": 128256, + "total_num_kv_heads": 8, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "XiaomiMiMo/MiMo-7B-RL": { + "architectures": [ + "MiMoForCausalLM" + ], + "model_type": "mimo", + "text_model_type": "mimo", + "hidden_size": 4096, + "total_num_hidden_layers": 36, + "total_num_attention_heads": 32, + "head_size": 128, + "vocab_size": 151680, + "total_num_kv_heads": 8, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "meituan-longcat/LongCat-Flash-Chat": { + "architectures": [ + "LongcatFlashForCausalLM" + ], + "model_type": "longcat_flash", + "text_model_type": "longcat_flash", + "hidden_size": 6144, + "total_num_hidden_layers": 28, + "total_num_attention_heads": 64, + "head_size": 576, + "vocab_size": 131072, + "total_num_kv_heads": 64, + "num_experts": 512, + "is_deepseek_mla": true, + "is_multimodal_model": false, + "dtype": "torch.float32" + } +} diff --git a/tests/config/draft_model_arch_groundtruth.json b/tests/config/draft_model_arch_groundtruth.json new file mode 100644 index 000000000000..dfe6f3d39e93 --- /dev/null +++ b/tests/config/draft_model_arch_groundtruth.json @@ -0,0 +1,87 @@ +{ + "abhigoyal/vllm-medusa-llama-68m-random": { + "architectures": [ + "MedusaModel" + ], + "model_type": "medusa", + "text_model_type": "medusa", + "hidden_size": 768, + "total_num_hidden_layers": 1, + "total_num_attention_heads": 0, + "head_size": "Error: integer division or modulo by zero", + "vocab_size": 32000, + "total_num_kv_heads": 0, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "torch.float32" + }, + "luccafong/deepseek_mtp_draft_random": { + "architectures": [ + "DeepSeekMTPModel" + ], + "model_type": "deepseek_mtp", + "text_model_type": 
"deepseek_mtp", + "hidden_size": 2560, + "total_num_hidden_layers": 1, + "total_num_attention_heads": 32, + "head_size": 576, + "vocab_size": 129280, + "total_num_kv_heads": 32, + "num_experts": 72, + "is_deepseek_mla": true, + "is_multimodal_model": false, + "dtype": "torch.bfloat16" + }, + "eagle618/eagle-deepseek-v3-random": { + "architectures": [ + "EagleDeepSeekMTPModel" + ], + "model_type": "eagle", + "text_model_type": "deepseek_mtp", + "hidden_size": 2560, + "total_num_hidden_layers": 1, + "total_num_attention_heads": 32, + "head_size": 576, + "vocab_size": 129280, + "total_num_kv_heads": 32, + "num_experts": 72, + "is_deepseek_mla": true, + "is_multimodal_model": false, + "dtype": "bfloat16" + }, + "yuhuili/EAGLE-LLaMA3-Instruct-8B": { + "architectures": [ + "EagleLlamaForCausalLM" + ], + "model_type": "eagle", + "text_model_type": "llama", + "hidden_size": 4096, + "total_num_hidden_layers": 1, + "total_num_attention_heads": 32, + "head_size": 128, + "vocab_size": 128256, + "total_num_kv_heads": 8, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "float16" + }, + "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B": { + "architectures": [ + "Eagle3LlamaForCausalLM" + ], + "model_type": "eagle", + "text_model_type": "llama", + "hidden_size": 4096, + "total_num_hidden_layers": 1, + "total_num_attention_heads": 32, + "head_size": 128, + "vocab_size": 128256, + "total_num_kv_heads": 8, + "num_experts": 0, + "is_deepseek_mla": false, + "is_multimodal_model": false, + "dtype": "float16" + } +} diff --git a/tests/config/test_model_arch_config.py b/tests/config/test_model_arch_config.py new file mode 100644 index 000000000000..06d4c6e7a865 --- /dev/null +++ b/tests/config/test_model_arch_config.py @@ -0,0 +1,152 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for ModelArchitectureConfig and its integration with ModelConfig.""" + +import json +from pathlib import Path + +import pytest + +from vllm.config import ModelConfig, ParallelConfig, SpeculativeConfig +from vllm.transformers_utils.model_arch_config_convertor import ( + ModelArchConfigConvertorBase, +) + +BASE_TRUST_REMOTE_CODE_MODELS = { + "nvidia/Llama-3_3-Nemotron-Super-49B-v1", + "XiaomiMiMo/MiMo-7B-RL", + # Excluded: Not available online right now + # "FreedomIntelligence/openPangu-Ultra-MoE-718B-V1.1", + "meituan-longcat/LongCat-Flash-Chat", +} + +BASE_MODELS_TO_TEST = [ + "state-spaces/mamba-130m-hf", + "mistralai/Mamba-Codestral-7B-v0.1", + # Excluded: terratorch/torchgeo version mismatch in CPU CI environment + # (NonGeoDataset import error). Tested in model initialization tests. 
+ # "ibm-nasa-geospatial/Prithvi-EO-2.0-300M-TL-Sen1Floods11", + "Zyphra/Zamba2-7B-instruct", + # FIXME: mosaicml/mpt-7b has been deleted + # "mosaicml/mpt-7b", + # FIXME: databricks/dbrx-instruct has been deleted + # "databricks/dbrx-instruct", + "tiiuae/falcon-7b", + "tiiuae/falcon-40b", + "luccafong/deepseek_mtp_main_random", + "Qwen/Qwen3-Next-80B-A3B-Instruct", + "tiny-random/qwen3-next-moe", + "zai-org/GLM-4.5", + "baidu/ERNIE-4.5-21B-A3B-PT", + # Models using base convertor + "lmsys/gpt-oss-20b-bf16", + "deepseek-ai/DeepSeek-V3.2-Exp", + "meta-llama/Llama-4-Scout-17B-16E-Instruct", +] + list(BASE_TRUST_REMOTE_CODE_MODELS) + +# (target_model, draft_model, trust_remote_code) +SPECULATIVE_MODELS = [ + ("JackFram/llama-68m", "abhigoyal/vllm-medusa-llama-68m-random", False), + ("luccafong/deepseek_mtp_main_random", "luccafong/deepseek_mtp_draft_random", True), + ("eagle618/deepseek-v3-random", "eagle618/eagle-deepseek-v3-random", True), + ("meta-llama/Meta-Llama-3-8B-Instruct", "yuhuili/EAGLE-LLaMA3-Instruct-8B", True), + ("meta-llama/Llama-3.1-8B-Instruct", "yuhuili/EAGLE3-LLaMA3.1-Instruct-8B", True), +] + + +def _load_groundtruth(filename: str) -> dict: + """Load groundtruth JSON from the test directory.""" + groundtruth_path = Path(__file__).parent / filename + with open(groundtruth_path) as f: + return json.load(f) + + +def _assert_model_arch_config( + model_config, expected: dict, check_head_size: bool = True +): + """Assert model_arch_config matches expected values.""" + model_arch_config = model_config.model_arch_config + assert model_arch_config.architectures == expected["architectures"] + assert model_arch_config.model_type == expected["model_type"] + assert model_arch_config.text_model_type == expected["text_model_type"] + assert model_arch_config.hidden_size == expected["hidden_size"] + assert ( + model_arch_config.total_num_hidden_layers == expected["total_num_hidden_layers"] + ) + assert ( + model_arch_config.total_num_attention_heads + == expected["total_num_attention_heads"] + ) + assert model_arch_config.vocab_size == expected["vocab_size"] + assert model_arch_config.total_num_kv_heads == expected["total_num_kv_heads"] + assert model_arch_config.num_experts == expected["num_experts"] + assert model_arch_config.is_deepseek_mla == expected["is_deepseek_mla"] + + torch_dtype = ModelArchConfigConvertorBase.get_torch_dtype( + model_config.hf_config, model_config.model, revision=model_config.revision + ) + assert str(torch_dtype) == expected["dtype"] + + if check_head_size: + assert model_arch_config.head_size == expected["head_size"] + + +def _assert_model_config_methods( + model_config, expected: dict, check_head_size: bool = True +): + """Assert model_config methods return expected values.""" + assert model_config.architectures == expected["architectures"] + assert model_config.get_vocab_size() == expected["vocab_size"] + assert model_config.get_hidden_size() == expected["hidden_size"] + assert model_config.get_total_num_kv_heads() == expected["total_num_kv_heads"] + assert model_config.get_num_experts() == expected["num_experts"] + assert ( + model_config.get_total_num_hidden_layers() + == expected["total_num_hidden_layers"] + ) + + if check_head_size: + assert model_config.get_head_size() == expected["head_size"] + + +@pytest.mark.parametrize("model", BASE_MODELS_TO_TEST) +def test_base_model_arch_config(model: str): + """Test model architecture config for base models.""" + groundtruth = _load_groundtruth("base_model_arch_groundtruth.json") + expected = groundtruth[model] 
+ + model_config = ModelConfig( + model, trust_remote_code=model in BASE_TRUST_REMOTE_CODE_MODELS + ) + + _assert_model_arch_config(model_config, expected) + _assert_model_config_methods(model_config, expected) + + +@pytest.mark.parametrize( + "target_model,draft_model,trust_remote_code", SPECULATIVE_MODELS +) +def test_draft_model_arch_config( + target_model: str, draft_model: str, trust_remote_code: bool +): + """Test model architecture config for draft/speculative models.""" + groundtruth = _load_groundtruth("draft_model_arch_groundtruth.json") + expected = groundtruth[draft_model] + + target_model_config = ModelConfig(target_model, trust_remote_code=trust_remote_code) + speculative_config = SpeculativeConfig( + model=draft_model, + num_speculative_tokens=1, + target_model_config=target_model_config, + target_parallel_config=ParallelConfig(), + ) + model_config = speculative_config.draft_model_config + + # For medusa models, head_size may cause division by zero before + # model_arch_config was introduced, so we conditionally check it + check_head_size = isinstance(expected["head_size"], int) + + _assert_model_arch_config(model_config, expected, check_head_size=check_head_size) + _assert_model_config_methods( + model_config, expected, check_head_size=check_head_size + ) diff --git a/tests/conftest.py b/tests/conftest.py index 30e25294925c..d346335f7b26 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -45,7 +45,11 @@ ) from transformers.models.auto.auto_factory import _BaseAutoModelClass -from tests.models.utils import TokensTextLogprobs, TokensTextLogprobsPromptLogprobs +from tests.models.utils import ( + TokensTextLogprobs, + TokensTextLogprobsPromptLogprobs, + softmax, +) from vllm import LLM, SamplingParams, envs from vllm.assets.audio import AudioAsset from vllm.assets.image import ImageAsset @@ -513,7 +517,7 @@ def classify(self, prompts: list[str]) -> list[list[float]]: elif problem_type == "multi_label_classification": logits = output.logits.sigmoid()[0].tolist() else: - logits = output.logits.softmax(dim=-1)[0].tolist() + logits = softmax(output.logits)[0].tolist() outputs.append(logits) return outputs @@ -681,6 +685,7 @@ def generate_greedy_logprobs_limit( images: PromptImageInput | None = None, audios: PromptAudioInput | None = None, videos: PromptVideoInput | None = None, + use_cache: bool = True, **kwargs: Any, ) -> list[TokensTextLogprobs]: all_inputs = self.get_inputs( @@ -694,7 +699,7 @@ def generate_greedy_logprobs_limit( for inputs in all_inputs: output: "GenerateOutput" = self.model.generate( **self.wrap_device(inputs), - use_cache=True, + use_cache=use_cache, do_sample=False, max_new_tokens=max_tokens, output_hidden_states=True, diff --git a/tests/entrypoints/openai/test_chunked_prompt.py b/tests/entrypoints/openai/test_chunked_prompt.py index cbd351beb4e9..f9037ac3f8bf 100644 --- a/tests/entrypoints/openai/test_chunked_prompt.py +++ b/tests/entrypoints/openai/test_chunked_prompt.py @@ -66,8 +66,11 @@ async def test_completion_stream_options_and_logprobs_with_long_prompts( chunk.usage.prompt_tokens + chunk.usage.completion_tokens ) if not finished: - tokens_received += 1 assert chunk.choices[0].text + # Count actual tokens from logprobs since multiple tokens + # can be batched into a single chunk + assert chunk.choices[0].logprobs and chunk.choices[0].logprobs.tokens + tokens_received += len(chunk.choices[0].logprobs.tokens) if chunk.choices[0].finish_reason is not None: finished = True diff --git a/tests/entrypoints/pooling/score/test_utils.py 
b/tests/entrypoints/pooling/score/test_utils.py index 356fd0ad6678..0a57e53be20a 100644 --- a/tests/entrypoints/pooling/score/test_utils.py +++ b/tests/entrypoints/pooling/score/test_utils.py @@ -51,9 +51,9 @@ def llm_reranker_model_config(): CROSS_ENCODER_MODEL_ID, runner="pooling", ) - # use_pad_token is a property that reads from hf_config, + # use_sep_token is a property that reads from hf_config, # so we set it there to override the default (True) - config.hf_config.use_pad_token = False + config.hf_config.use_sep_token = False return config @@ -230,7 +230,7 @@ def test_not_using_default_template( cross_encoder_tokenizer, full_prompt, engine_prompt ) - def test_fallback_with_pad_token( + def test_fallback_with_sep_token( self, cross_encoder_model_config, cross_encoder_tokenizer, @@ -238,7 +238,7 @@ def test_fallback_with_pad_token( mock_model_no_score_template, ): """Test fallback path when ChatTemplateResolutionError - and use_pad_token=True.""" + and use_sep_token=True.""" with ( patch( "vllm.model_executor.model_loader.get_model_cls", @@ -250,7 +250,7 @@ def test_fallback_with_pad_token( ), ): full_prompt, engine_prompt = get_score_prompt( - cross_encoder_model_config, # use_pad_token=True + cross_encoder_model_config, # use_sep_token=True cross_encoder_tokenizer, tokenization_kwargs, "query", @@ -281,7 +281,7 @@ def test_fallback_with_pad_token( add_special_tokens=False, ) - def test_fallback_without_pad_token( + def test_fallback_without_sep_token( self, llm_reranker_model_config, cross_encoder_tokenizer, @@ -289,7 +289,7 @@ def test_fallback_without_pad_token( mock_model_no_score_template, ): """Test fallback path when ChatTemplateResolutionError - and use_pad_token=False.""" + and use_sep_token=False.""" with ( patch( "vllm.model_executor.model_loader.get_model_cls", @@ -301,7 +301,7 @@ def test_fallback_without_pad_token( ), ): full_prompt, engine_prompt = get_score_prompt( - llm_reranker_model_config, # use_pad_token=False + llm_reranker_model_config, # use_sep_token=False cross_encoder_tokenizer, tokenization_kwargs, "query", diff --git a/tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml b/tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml new file mode 100644 index 000000000000..f351a1722064 --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-R1-DP.yaml @@ -0,0 +1,11 @@ +model_name: "deepseek-ai/DeepSeek-R1" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --data-parallel-size 8 + --enable-expert-parallel + --speculative-config '{"method":"mtp","num_speculative_tokens":1}' diff --git a/tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml b/tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml new file mode 100644 index 000000000000..ba3463463b5e --- /dev/null +++ b/tests/evals/gsm8k/configs/DeepSeek-R1-TP.yaml @@ -0,0 +1,11 @@ +model_name: "deepseek-ai/DeepSeek-R1" +accuracy_threshold: 0.95 +num_questions: 1319 +num_fewshot: 5 +startup_max_wait_seconds: 1200 +server_args: >- + --enforce-eager + --max-model-len 4096 + --tensor-parallel-size 8 + --enable-expert-parallel + --speculative-config '{"method":"mtp","num_speculative_tokens":1}' diff --git a/tests/evals/gsm8k/configs/models-h200.txt b/tests/evals/gsm8k/configs/models-h200.txt new file mode 100644 index 000000000000..ec936533bb66 --- /dev/null +++ b/tests/evals/gsm8k/configs/models-h200.txt @@ -0,0 +1,2 @@ +DeepSeek-R1-TP.yaml +DeepSeek-R1-DP.yaml diff --git 
a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml new file mode 100644 index 000000000000..4c9a01274d99 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml @@ -0,0 +1,8 @@ +model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" +accuracy_threshold: 0.92 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_FP8: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml new file mode 100644 index 000000000000..17f067215eb5 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml @@ -0,0 +1,8 @@ +model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" +accuracy_threshold: 0.92 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_FP8: "1" + VLLM_FLASHINFER_MOE_BACKEND: "latency" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-marlin.yaml new file mode 100644 index 000000000000..be8192f2a89a --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-marlin.yaml @@ -0,0 +1,7 @@ +model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" +accuracy_threshold: 0.92 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_TEST_FORCE_FP8_MARLIN: "1" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml new file mode 100644 index 000000000000..80e279edc971 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Llama-4-Scout-Fp8-ModelOpt-triton.yaml @@ -0,0 +1,5 @@ +model_name: "nvidia/Llama-4-Scout-17B-16E-Instruct-FP8" +accuracy_threshold: 0.92 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-fi-cutlass.yaml new file mode 100644 index 000000000000..b9c6a1997dc3 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-fi-cutlass.yaml @@ -0,0 +1,9 @@ +# TODO(rob): enable +# model_name: "amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV" +# accuracy_threshold: 0.62 +# num_questions: 1319 +# num_fewshot: 5 +# server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +# env: +# VLLM_USE_FLASHINFER_MOE_FP8: "1" +# VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-triton.yaml new file mode 100644 index 000000000000..f730e2e2fb1a --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Mixtral-8x7B-Fp8-AutoFp8-triton.yaml @@ -0,0 +1,5 @@ +model_name: "amd/Mixtral-8x7B-Instruct-v0.1-FP8-KV" +accuracy_threshold: 0.62 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" 
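The moe-refactor eval configs in this block all follow the same schema as the other GSM8K configs added in this change: model_name, accuracy_threshold, num_questions, num_fewshot, server_args, plus an optional env mapping that selects the MoE backend. As a rough illustration of how a harness could turn one of these YAML files into a server invocation, here is a minimal sketch; load_eval_config, build_server_invocation, and the way server_args is tokenized are assumptions made for this example only and are not the actual test_gsm8k_correctness.py code.

# Illustrative sketch only: shows how a config such as
# Llama-4-Scout-Fp8-ModelOpt-triton.yaml could be consumed.
# load_eval_config / build_server_invocation are hypothetical helpers.
import os
import shlex

import yaml  # assumes PyYAML is installed


def load_eval_config(path: str) -> dict:
    """Read one eval config YAML into a plain dict."""
    with open(path) as f:
        return yaml.safe_load(f)


def build_server_invocation(config: dict) -> tuple[list[str], dict[str, str]]:
    """Split server_args into CLI tokens and overlay the optional env section."""
    args = [config["model_name"], *shlex.split(config.get("server_args", ""))]
    env = {**os.environ, **{k: str(v) for k, v in (config.get("env") or {}).items()}}
    return args, env


if __name__ == "__main__":
    cfg = load_eval_config("Llama-4-Scout-Fp8-ModelOpt-triton.yaml")
    cli_args, env = build_server_invocation(cfg)
    print(cli_args, cfg["accuracy_threshold"], cfg["num_fewshot"])
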
diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml new file mode 100644 index 000000000000..b6cff0abc9d3 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml @@ -0,0 +1,8 @@ +model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_DEEP_GEMM_MOE: "1" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml new file mode 100644 index 000000000000..080c8d338e58 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml @@ -0,0 +1,10 @@ +model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "0" + VLLM_USE_DEEP_GEMM_MOE: "0" + VLLM_USE_FLASHINFER_MOE_FP8: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml new file mode 100644 index 000000000000..a656cc7c37f1 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml @@ -0,0 +1,10 @@ +model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "0" + VLLM_USE_DEEP_GEMM_MOE: "0" + VLLM_USE_FLASHINFER_MOE_FP8: "1" + VLLM_FLASHINFER_MOE_BACKEND: "latency" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml new file mode 100644 index 000000000000..f2273bf2c96c --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml @@ -0,0 +1,9 @@ +model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "0" + VLLM_USE_DEEP_GEMM_MOE: "0" + VLLM_TEST_FORCE_FP8_MARLIN: "1" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml new file mode 100644 index 000000000000..ed61e9b89978 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml @@ -0,0 +1,8 @@ +model_name: "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "0" + VLLM_USE_DEEP_GEMM_MOE: "0" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml new file mode 100644 index 000000000000..f7ddd30342b3 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml @@ -0,0 +1,8 @@ +model_name: 
"RedHatAI/Qwen3-30B-A3B-FP8-block" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "1" + VLLM_USE_DEEP_GEMM_MOE: "1" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml new file mode 100644 index 000000000000..db18dd01bb23 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-fi-cutlass.yaml @@ -0,0 +1,10 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "0" + VLLM_USE_DEEP_GEMM_MOE: "0" + VLLM_USE_FLASHINFER_MOE_FP8: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml new file mode 100644 index 000000000000..3d82d2e22c1a --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml @@ -0,0 +1,9 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "0" + VLLM_USE_DEEP_GEMM_MOE: "0" + VLLM_TEST_FORCE_FP8_MARLIN: "1" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-vllm-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-vllm-cutlass.yaml new file mode 100644 index 000000000000..5621217de83a --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Block-vllm-cutlass.yaml @@ -0,0 +1,8 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-FP8-block" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_DEEP_GEMM: "0" + VLLM_USE_DEEP_GEMM_MOE: "0" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml new file mode 100644 index 000000000000..8ed6410c36b5 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml @@ -0,0 +1,7 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-FP8-dynamic" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_TEST_FORCE_FP8_MARLIN: "1" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml new file mode 100644 index 000000000000..d6adbfc5fba0 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml @@ -0,0 +1,5 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-FP8-dynamic" +accuracy_threshold: 0.85 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass-dp-ep.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass-dp-ep.yaml new file mode 100644 index 000000000000..53fd62bac839 --- /dev/null +++ 
b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass-dp-ep.yaml @@ -0,0 +1,8 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml new file mode 100644 index 000000000000..6edacc32975c --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml @@ -0,0 +1,8 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml new file mode 100644 index 000000000000..8e0b155fa70d --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml @@ -0,0 +1,8 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "latency" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-marlin.yaml new file mode 100644 index 000000000000..8199e6563495 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-marlin.yaml @@ -0,0 +1,7 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_TEST_FORCE_FP8_MARLIN: "1" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml new file mode 100644 index 000000000000..b1ccadeddbba --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml @@ -0,0 +1,5 @@ +model_name: "RedHatAI/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml new file mode 100644 index 000000000000..44f8700e4b46 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml @@ -0,0 +1,8 @@ +model_name: "nvidia/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --data-parallel-size 2 --enable-expert-parallel" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml new file mode 
100644 index 000000000000..09e76e21ab43 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml @@ -0,0 +1,8 @@ +model_name: "nvidia/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "throughput" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml new file mode 100644 index 000000000000..a98afafbcde9 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml @@ -0,0 +1,8 @@ +model_name: "nvidia/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_USE_FLASHINFER_MOE_FP4: "1" + VLLM_FLASHINFER_MOE_BACKEND: "latency" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml new file mode 100644 index 000000000000..4156cec89761 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml @@ -0,0 +1,7 @@ +model_name: "nvidia/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" +env: + VLLM_TEST_FORCE_FP8_MARLIN: "1" diff --git a/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml new file mode 100644 index 000000000000..49a1589fcfea --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml @@ -0,0 +1,5 @@ +model_name: "nvidia/Qwen3-30B-A3B-NVFP4" +accuracy_threshold: 0.88 +num_questions: 1319 +num_fewshot: 5 +server_args: "--enforce-eager --max-model-len 8192 --tensor-parallel-size 2" diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt new file mode 100644 index 000000000000..bf02f1363be3 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/config-b200.txt @@ -0,0 +1,12 @@ +Llama-4-Scout-Fp8-ModelOpt-fi-trtllm.yaml +Qwen3-30B-A3B-Fp8-AutoFp8-fi-trtllm.yaml +Qwen3-30B-A3B-NvFp4-CT-vllm-cutlass.yaml +Qwen3-30B-A3B-NvFp4-CT-marlin.yaml +Qwen3-30B-A3B-NvFp4-CT-fi-trtllm.yaml +Qwen3-30B-A3B-NvFp4-CT-fi-cutlass.yaml +Qwen3-30B-A3B-NvFp4-CT-fi-cutlass-dp-ep.yaml +Qwen3-30B-A3B-NvFp4-ModelOpt-vllm-cutlass.yaml +Qwen3-30B-A3B-NvFp4-ModelOpt-marlin.yaml +Qwen3-30B-A3B-NvFp4-ModelOpt-fi-trtllm.yaml +Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass.yaml +Qwen3-30B-A3B-NvFp4-ModelOpt-fi-cutlass-dp-ep.yaml diff --git a/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt new file mode 100644 index 000000000000..9725db7c8be2 --- /dev/null +++ b/tests/evals/gsm8k/configs/moe-refactor/config-h100.txt @@ -0,0 +1,13 @@ +Mixtral-8x7B-Fp8-AutoFp8-triton.yaml +Qwen3-30B-A3B-Fp8-AutoFp8-deepgemm.yaml +Qwen3-30B-A3B-Fp8-AutoFp8-fi-cutlass.yaml +Qwen3-30B-A3B-Fp8-AutoFp8-marlin.yaml +Qwen3-30B-A3B-Fp8-AutoFp8-triton.yaml +Qwen3-30B-A3B-Fp8-CT-Block-deepgemm.yaml +Qwen3-30B-A3B-Fp8-CT-Block-marlin.yaml +Qwen3-30B-A3B-Fp8-CT-Block-vllm-cutlass.yaml 
+Qwen3-30B-A3B-Fp8-CT-Channel-marlin.yaml +Qwen3-30B-A3B-Fp8-CT-Channel-vllm-cutlass.yaml +Llama-4-Scout-Fp8-ModelOpt-fi-cutlass.yaml +Llama-4-Scout-Fp8-ModelOpt-marlin.yaml +Llama-4-Scout-Fp8-ModelOpt-triton.yaml diff --git a/tests/evals/gsm8k/test_gsm8k_correctness.py b/tests/evals/gsm8k/test_gsm8k_correctness.py index dd0d3ae0cca4..991b905211ff 100644 --- a/tests/evals/gsm8k/test_gsm8k_correctness.py +++ b/tests/evals/gsm8k/test_gsm8k_correctness.py @@ -78,7 +78,7 @@ def test_gsm8k_correctness(config_filename): eval_config["model_name"], server_args, env_dict=env_dict, - max_wait_seconds=600, + max_wait_seconds=eval_config.get("startup_max_wait_seconds", 600), ) as remote_server: server_url = remote_server.url_for("v1") print(f"Server started at: {server_url}") diff --git a/tests/kernels/attention/test_aiter_flash_attn.py b/tests/kernels/attention/test_aiter_flash_attn.py index 8f58c470d217..68ffb1ee34ad 100644 --- a/tests/kernels/attention/test_aiter_flash_attn.py +++ b/tests/kernels/attention/test_aiter_flash_attn.py @@ -8,6 +8,7 @@ import vllm.v1.attention.backends.rocm_aiter_fa # noqa: F401 from vllm.attention.utils.fa_utils import is_flash_attn_varlen_func_available from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] @@ -104,7 +105,7 @@ def test_varlen_with_paged_kv( if not is_flash_attn_varlen_func_available(): pytest.skip("flash_attn_varlen_func required to run this test.") torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/attention/test_attention.py b/tests/kernels/attention/test_attention.py index 96bdcf16d568..24b058ed24fa 100644 --- a/tests/kernels/attention/test_attention.py +++ b/tests/kernels/attention/test_attention.py @@ -13,6 +13,7 @@ from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.platforms import current_platform from vllm.utils.mem_utils import get_max_shared_memory_bytes +from vllm.utils.torch_utils import set_random_seed FLOAT32_BYTES = torch.finfo(torch.float).bits // 8 # This will change depending on the compute capability. @@ -150,7 +151,7 @@ def test_paged_attention( global PARTITION_SIZE - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) scale = float(1.0 / (head_size**0.5)) num_query_heads, num_kv_heads = num_heads diff --git a/tests/kernels/attention/test_cache.py b/tests/kernels/attention/test_cache.py index 3f76033254d3..19892ce26b6b 100644 --- a/tests/kernels/attention/test_cache.py +++ b/tests/kernels/attention/test_cache.py @@ -9,6 +9,7 @@ from tests.kernels.utils import DEFAULT_OPCHECK_TEST_UTILS, opcheck from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed COPYING_DIRECTION = [("cuda", "cpu"), ("cuda", "cuda"), ("cpu", "cuda")] DTYPES = [torch.bfloat16, torch.float] @@ -64,7 +65,7 @@ def test_reshape_and_cache( ) -> None: if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) # Create a random slot mapping. 
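The kernel-test hunks around this point, and those that follow, replace current_platform.seed_everything(seed) with set_random_seed(seed) imported from vllm.utils.torch_utils. For orientation, here is a minimal sketch of the seeding behaviour such a helper typically provides; the real body of vLLM's set_random_seed may differ, so the function below is deliberately named set_random_seed_sketch and labelled as an assumption.

# Assumption-labelled sketch; not the actual vllm.utils.torch_utils.set_random_seed.
import random

import numpy as np
import torch


def set_random_seed_sketch(seed: int) -> None:
    """Seed Python's random module, NumPy, and all torch (CUDA) generators."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


set_random_seed_sketch(0)  # mirrors the set_random_seed(0) calls in these tests
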
@@ -185,7 +186,7 @@ def test_reshape_and_cache_flash( kv_cache_layout: str, implementation: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) assert implementation in ["cuda", "triton"] @@ -355,7 +356,7 @@ def test_swap_blocks( if kv_cache_dtype == "fp8" and head_size % 16: pytest.skip() - current_platform.seed_everything(seed) + set_random_seed(seed) src_device = device if direction[0] == "cuda" else "cpu" dst_device = device if direction[1] == "cuda" else "cpu" @@ -444,7 +445,7 @@ def test_fp8_e4m3_conversion( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) low = -224.0 high = 224.0 @@ -507,7 +508,7 @@ def test_concat_and_cache_mla( device: str, kv_cache_dtype: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) @@ -584,7 +585,7 @@ def test_concat_and_cache_ds_mla( if dtype.itemsize != 2: pytest.skip("ds_mla only supports 16-bit input") kv_cache_dtype = "fp8_ds_mla" - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) @@ -695,7 +696,7 @@ def test_swap_blocks_mla( device: str, kv_cache_dtype: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) torch.cuda.set_device(device) @@ -947,7 +948,7 @@ def test_concat_and_cache_mla_cpu( ) -> None: device = "cpu" kv_cache_dtype = "auto" - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) total_slots = num_blocks * block_size diff --git a/tests/kernels/attention/test_cascade_flash_attn.py b/tests/kernels/attention/test_cascade_flash_attn.py index d86041d71feb..80c5c853debb 100755 --- a/tests/kernels/attention/test_cascade_flash_attn.py +++ b/tests/kernels/attention/test_cascade_flash_attn.py @@ -6,6 +6,7 @@ import torch from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from vllm.v1.attention.backends.flash_attn import cascade_attention, merge_attn_states try: @@ -39,7 +40,7 @@ def test_merge_kernel( dtype: torch.dtype, ): torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] assert num_query_heads % num_kv_heads == 0 @@ -103,7 +104,7 @@ def test_cascade( f'to: "{fa_version_unsupported_reason(fa_version)}"' ) - current_platform.seed_everything(0) + set_random_seed(0) window_size = (-1, -1) scale = head_size**-0.5 diff --git a/tests/kernels/attention/test_cpu_attn.py b/tests/kernels/attention/test_cpu_attn.py index be5d66197f6e..ef0099f635a5 100644 --- a/tests/kernels/attention/test_cpu_attn.py +++ b/tests/kernels/attention/test_cpu_attn.py @@ -8,6 +8,7 @@ import torch from vllm.platforms import CpuArchEnum, current_platform +from vllm.utils.torch_utils import set_random_seed from vllm.v1.attention.backends.cpu_attn import _get_attn_isa if not current_platform.is_cpu(): @@ -190,7 +191,7 @@ def varlen_with_paged_kv( use_sink: bool, isa: str, ) -> None: - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/attention/test_flash_attn.py b/tests/kernels/attention/test_flash_attn.py index bbd5df5419f8..2714cd81819e 100644 --- a/tests/kernels/attention/test_flash_attn.py +++ 
b/tests/kernels/attention/test_flash_attn.py @@ -6,6 +6,7 @@ import torch from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed try: from vllm.vllm_flash_attn import ( @@ -129,7 +130,7 @@ def test_varlen_with_paged_kv( "Flash attention with quantized inputs is only " "supported on version 3 with bfloat16 base type" ) - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/attention/test_flashinfer.py b/tests/kernels/attention/test_flashinfer.py index eedeec33e0d4..570bf7fc865a 100644 --- a/tests/kernels/attention/test_flashinfer.py +++ b/tests/kernels/attention/test_flashinfer.py @@ -5,6 +5,7 @@ import pytest from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed try: import flashinfer @@ -101,7 +102,7 @@ def test_flashinfer_decode_with_paged_kv( sliding_window: int | None, ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] @@ -196,7 +197,7 @@ def test_flashinfer_prefill_with_paged_kv( sliding_window: int | None, ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -299,7 +300,7 @@ def test_flashinfer_prefill_with_paged_fp8_kv( ) -> None: pytest.skip("TODO: fix the accuracy issue") torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] @@ -409,7 +410,7 @@ def test_flashinfer_decode_with_paged_fp8_kv( ) -> None: # test doesn't work for num_heads = (16,16) torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(kv_lens) num_query_heads = num_heads[0] num_kv_heads = num_heads[1] diff --git a/tests/kernels/attention/test_flashinfer_trtllm_attention.py b/tests/kernels/attention/test_flashinfer_trtllm_attention.py index 220d827b9d5f..1edb3dd1671f 100644 --- a/tests/kernels/attention/test_flashinfer_trtllm_attention.py +++ b/tests/kernels/attention/test_flashinfer_trtllm_attention.py @@ -10,6 +10,7 @@ ) from vllm.platforms import current_platform from vllm.utils.math_utils import round_up +from vllm.utils.torch_utils import set_random_seed if not current_platform.is_device_capability_family(100): pytest.skip( @@ -80,7 +81,7 @@ def test_flashinfer_trtllm_decode_with_baseline( has_sinks: bool, ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(42) + set_random_seed(42) q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes q_quant_dtype = q_quant_dtype or dtype @@ -279,7 +280,7 @@ def test_flashinfer_trtllm_prefill_with_baseline( has_sinks: bool, ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(42) + set_random_seed(42) q_quant_dtype, kv_quant_dtype, o_quant_dtype = quant_dtypes q_quant_dtype = q_quant_dtype or dtype diff --git a/tests/kernels/attention/test_lightning_attn.py b/tests/kernels/attention/test_lightning_attn.py index ec938caff2c6..37fd85ccec04 100644 --- a/tests/kernels/attention/test_lightning_attn.py +++ b/tests/kernels/attention/test_lightning_attn.py @@ -5,7 +5,7 @@ import torch from vllm.model_executor.layers.lightning_attn import 
linear_decode_forward_triton -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_HEADS = [4, 8] HEAD_SIZES = [64] @@ -124,7 +124,7 @@ def test_linear_decode_forward_triton( torch.set_default_device("cuda") torch.manual_seed(42) torch.cuda.manual_seed_all(42) - current_platform.seed_everything(42) + set_random_seed(42) base = 0.01 q = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) k = base * torch.randn(batch_size, num_heads, 1, head_size, dtype=dtype) @@ -167,7 +167,7 @@ def test_linear_decode_forward_triton_with_padding( torch.set_default_device("cuda") torch.manual_seed(42) torch.cuda.manual_seed_all(42) - current_platform.seed_everything(42) + set_random_seed(42) batch_size = 4 base = 0.01 @@ -231,7 +231,7 @@ def test_lightning_attention_reference( torch.set_default_device("cuda") torch.manual_seed(42) torch.cuda.manual_seed_all(42) - current_platform.seed_everything(42) + set_random_seed(42) base = 0.01 q = base * torch.randn(batch_size, num_heads, seq_len, head_size, dtype=dtype) diff --git a/tests/kernels/attention/test_mha_attn.py b/tests/kernels/attention/test_mha_attn.py index 7405e4d41da9..32aba1a45747 100644 --- a/tests/kernels/attention/test_mha_attn.py +++ b/tests/kernels/attention/test_mha_attn.py @@ -19,6 +19,7 @@ from vllm.platforms.cpu import CpuPlatform from vllm.platforms.cuda import CudaPlatform from vllm.platforms.rocm import RocmPlatform +from vllm.utils.torch_utils import set_random_seed @pytest.fixture(autouse=True) @@ -123,7 +124,7 @@ def test_mha_attn_forward( dtype: torch.dtype, device: str, ): - current_platform.seed_everything(0) + set_random_seed(0) torch.set_default_device(device) torch.set_default_dtype(dtype) @@ -168,7 +169,7 @@ def test_mha_attn_varlen_forward( dtype: torch.dtype, device: str, ): - current_platform.seed_everything(0) + set_random_seed(0) torch.set_default_device(device) torch.set_default_dtype(dtype) diff --git a/tests/kernels/attention/test_prefix_prefill.py b/tests/kernels/attention/test_prefix_prefill.py index e041e8c8d2ff..b2c955b4901a 100644 --- a/tests/kernels/attention/test_prefix_prefill.py +++ b/tests/kernels/attention/test_prefix_prefill.py @@ -13,7 +13,7 @@ from vllm.attention.ops.chunked_prefill_paged_decode import chunked_prefill_paged_decode from vllm.attention.ops.prefix_prefill import context_attention_fwd from vllm.platforms import current_platform -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed NUM_HEADS = [64] NUM_QUERIES_PER_KV = [1, 64] @@ -125,7 +125,7 @@ def test_contexted_kv_attention( ): pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache") - current_platform.seed_everything(0) + set_random_seed(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process @@ -346,7 +346,7 @@ def test_contexted_kv_attention_alibi( ): pytest.skip("ROCm custom paged attention does not support fp8_e5m2 KV cache") - current_platform.seed_everything(0) + set_random_seed(0) torch.set_default_device(device) # Need this, otherwise when we capture the graph the process diff --git a/tests/kernels/attention/test_triton_prefill_attention.py b/tests/kernels/attention/test_triton_prefill_attention.py new file mode 100644 index 000000000000..67c52cbfd452 --- /dev/null +++ b/tests/kernels/attention/test_triton_prefill_attention.py @@ -0,0 +1,225 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright 
contributors to the vLLM project + +import pytest +import torch +import torch.nn.functional as F + +from vllm.attention.ops.triton_prefill_attention import context_attention_fwd + + +def ref_masked_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + is_causal: bool = True, + sliding_window_q: int | None = None, + sliding_window_k: int | None = None, +) -> torch.Tensor: + """Reference implementation using PyTorch SDPA.""" + # q, k, v: [total_tokens, num_heads, head_dim] + # SDPA expects [batch, num_heads, seq_len, head_dim] + + total_tokens = q.shape[0] + + # Add batch dimension and transpose + q = q.unsqueeze(0).transpose(1, 2) # [1, num_heads, total_tokens, head_dim] + k = k.unsqueeze(0).transpose(1, 2) # [1, num_heads, total_tokens, head_dim] + v = v.unsqueeze(0).transpose(1, 2) # [1, num_heads, total_tokens, head_dim] + + # Create attention mask if needed + attn_mask = None + use_causal = is_causal + + # If we have sliding window or need custom masking, create explicit mask + sliding_window_q = sliding_window_q if sliding_window_q is not None else 0 + sliding_window_k = sliding_window_k if sliding_window_k is not None else 0 + if (sliding_window_q > 0) or (sliding_window_k > 0): + # Position indices + pos_q = torch.arange(total_tokens, device=q.device).unsqueeze(1) + pos_k = torch.arange(total_tokens, device=q.device).unsqueeze(0) + + # Start with valid mask (False = no masking) + mask = torch.ones( + (total_tokens, total_tokens), dtype=torch.bool, device=q.device + ) + + # Apply causal mask + if is_causal: + mask = mask & (pos_q >= pos_k) + + # Apply sliding window masks + sliding_window_mask = torch.ones_like(mask) + if sliding_window_q > 0: + sliding_window_mask &= pos_q - pos_k <= sliding_window_q + + if sliding_window_k > 0: + sliding_window_mask &= pos_k - pos_q <= sliding_window_k + + mask = mask & sliding_window_mask + + attn_mask = torch.where(mask, 0.0, float("-inf")).to(q.dtype) + use_causal = False # Don't use is_causal when providing explicit mask + + # Use SDPA + output = F.scaled_dot_product_attention( + q, k, v, attn_mask=attn_mask, is_causal=use_causal, dropout_p=0.0 + ) + + # Convert back to original shape: [total_tokens, num_heads, head_dim] + output = output.transpose(1, 2).squeeze(0) + + return output + + +@pytest.mark.parametrize("B", [5]) +@pytest.mark.parametrize("max_seq_len", [1024]) +@pytest.mark.parametrize("H_Q", [32]) +@pytest.mark.parametrize("H_KV", [32, 8]) +@pytest.mark.parametrize("D", [128]) +@pytest.mark.parametrize("is_causal", [True, False]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_context_attention( + B: int, + max_seq_len: int, + H_Q: int, + H_KV: int, + D: int, + is_causal: bool, + dtype: torch.dtype, +): + """Test basic context attention without sliding window.""" + torch.manual_seed(42) + + # Generate random sequence lengths for each batch + seq_lens = torch.randint(max_seq_len // 2, max_seq_len + 1, (B,), device="cuda") + total_tokens = seq_lens.sum().item() + + # Create batch start locations + b_start_loc = torch.zeros(B, dtype=torch.int32, device="cuda") + b_start_loc[1:] = torch.cumsum(seq_lens[:-1], dim=0) + + # Create input tensors + q = torch.randn(total_tokens, H_Q, D, dtype=dtype, device="cuda") + k = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda") + v = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda") + o = torch.zeros_like(q) + + # Call Triton kernel + context_attention_fwd( + q, + k, + v, + o, + b_start_loc, + seq_lens, + max_seq_len, + 
is_causal=is_causal, + sliding_window_q=None, + sliding_window_k=None, + ) + + # Compute reference output for each sequence in batch + o_ref = torch.zeros_like(q) + for i in range(B): + start = b_start_loc[i].item() + end = start + seq_lens[i].item() + + q_seq = q[start:end] + k_seq = k[start:end] + v_seq = v[start:end] + + # Expand KV heads if using GQA + if H_Q != H_KV: + kv_group_num = H_Q // H_KV + k_seq = k_seq.repeat_interleave(kv_group_num, dim=1) + v_seq = v_seq.repeat_interleave(kv_group_num, dim=1) + + o_ref[start:end] = ref_masked_attention( + q_seq, + k_seq, + v_seq, + is_causal=is_causal, + sliding_window_q=None, + sliding_window_k=None, + ) + + # Compare outputs + torch.testing.assert_close(o, o_ref, rtol=1e-2, atol=1e-2) + + +@pytest.mark.parametrize("B", [4]) +@pytest.mark.parametrize("max_seq_len", [1024]) +@pytest.mark.parametrize("H_Q", [32]) +@pytest.mark.parametrize("H_KV", [32, 8]) +@pytest.mark.parametrize("D", [128]) +@pytest.mark.parametrize("sliding_window", [(32, 32), (32, 0), (0, 32)]) +@pytest.mark.parametrize("dtype", [torch.float32, torch.bfloat16]) +def test_context_attention_sliding_window( + B: int, + max_seq_len: int, + H_Q: int, + H_KV: int, + D: int, + sliding_window: tuple[int, int], + dtype: torch.dtype, +): + sliding_window_q, sliding_window_k = sliding_window + """Test context attention with sliding window.""" + torch.manual_seed(42) + + # Generate random sequence lengths for each batch + seq_lens = torch.randint(max_seq_len // 2, max_seq_len + 1, (B,), device="cuda") + total_tokens = seq_lens.sum().item() + + # Create batch start locations + b_start_loc = torch.zeros(B, dtype=torch.int32, device="cuda") + b_start_loc[1:] = torch.cumsum(seq_lens[:-1], dim=0) + + # Create input tensors + q = torch.randn(total_tokens, H_Q, D, dtype=dtype, device="cuda") + k = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda") + v = torch.randn(total_tokens, H_KV, D, dtype=dtype, device="cuda") + o = torch.zeros_like(q) + + # Call Triton kernel + context_attention_fwd( + q, + k, + v, + o, + b_start_loc, + seq_lens, + max_seq_len, + is_causal=False, + sliding_window_q=sliding_window_q, + sliding_window_k=sliding_window_k, + ) + + # Compute reference output for each sequence in batch + o_ref = torch.zeros_like(q) + for i in range(B): + start = b_start_loc[i].item() + end = start + seq_lens[i].item() + + q_seq = q[start:end] + k_seq = k[start:end] + v_seq = v[start:end] + + # Expand KV heads if using GQA + if H_Q != H_KV: + kv_group_num = H_Q // H_KV + k_seq = k_seq.repeat_interleave(kv_group_num, dim=1) + v_seq = v_seq.repeat_interleave(kv_group_num, dim=1) + + o_ref[start:end] = ref_masked_attention( + q_seq, + k_seq, + v_seq, + is_causal=False, + sliding_window_q=sliding_window_q if sliding_window_q > 0 else None, + sliding_window_k=sliding_window_k if sliding_window_k > 0 else None, + ) + + # Compare outputs + torch.testing.assert_close(o, o_ref, rtol=2e-2, atol=2e-2) diff --git a/tests/kernels/attention/test_triton_unified_attention.py b/tests/kernels/attention/test_triton_unified_attention.py index 7fb08e5780f5..55e3593481cb 100644 --- a/tests/kernels/attention/test_triton_unified_attention.py +++ b/tests/kernels/attention/test_triton_unified_attention.py @@ -8,6 +8,7 @@ from vllm.attention.ops.triton_unified_attention import unified_attention from vllm.platforms import current_platform from vllm.utils.math_utils import next_power_of_2 +from vllm.utils.torch_utils import set_random_seed NUM_HEADS = [(4, 4), (8, 2)] HEAD_SIZES = [128, 256] @@ -113,7 
+114,7 @@ def test_triton_unified_attn( ) -> None: torch.set_default_device("cuda") - current_platform.seed_everything(0) + set_random_seed(0) num_seqs = len(seq_lens) query_lens = [x[0] for x in seq_lens] kv_lens = [x[1] for x in seq_lens] diff --git a/tests/kernels/core/test_activation.py b/tests/kernels/core/test_activation.py index e8777ec4f59e..1055c4745d4e 100644 --- a/tests/kernels/core/test_activation.py +++ b/tests/kernels/core/test_activation.py @@ -18,7 +18,7 @@ SiluAndMul, SwigluOAIAndMul, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 2048] # Arbitrary values for testing @@ -52,7 +52,7 @@ def test_act_and_mul( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, 2 * d, dtype=dtype) if activation == "silu_and_mul": @@ -129,7 +129,7 @@ def test_activation( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) x = torch.randn(num_tokens, d, dtype=dtype) layer = activation[0]() diff --git a/tests/kernels/core/test_fused_qk_norm_rope.py b/tests/kernels/core/test_fused_qk_norm_rope.py index 05d61ec02fd2..02cd470e7704 100644 --- a/tests/kernels/core/test_fused_qk_norm_rope.py +++ b/tests/kernels/core/test_fused_qk_norm_rope.py @@ -8,6 +8,7 @@ from vllm.model_executor.layers.layernorm import RMSNorm from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.bfloat16, torch.float16] IS_NEOX = [True, False] @@ -64,7 +65,7 @@ def test_fused_qk_norm_rope_matches_reference( rotary_ratio: float, ): torch.set_default_device(device) - current_platform.seed_everything(seed) + set_random_seed(seed) num_heads, num_kv_heads, head_dim = 16, 4, 128 num_tokens = 4 diff --git a/tests/kernels/core/test_layernorm.py b/tests/kernels/core/test_layernorm.py index 49bd77f6795f..5ad032cbacac 100644 --- a/tests/kernels/core/test_layernorm.py +++ b/tests/kernels/core/test_layernorm.py @@ -7,7 +7,7 @@ from tests.kernels.quant_utils import FP8_DTYPE from tests.kernels.utils import opcheck from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.half, torch.bfloat16, torch.float] NUM_TOKENS = [7, 83, 4096] # Arbitrary values for testing @@ -34,7 +34,7 @@ def test_rms_norm( device: str, strided_input: bool, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) layer = RMSNorm(hidden_size).to(dtype=dtype) layer.weight.data.normal_(mean=1.0, std=0.1) @@ -88,7 +88,7 @@ def test_fused_rms_norm_quant( device: str, strided_input: bool, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) weight = torch.empty(hidden_size, dtype=dtype).normal_(mean=1.0, std=0.1) diff --git a/tests/kernels/core/test_mrope.py b/tests/kernels/core/test_mrope.py index ba5d593b2d35..c091ea49d92c 100644 --- a/tests/kernels/core/test_mrope.py +++ b/tests/kernels/core/test_mrope.py @@ -10,6 +10,7 @@ from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.platforms import current_platform from vllm.transformers_utils.config import get_config +from vllm.utils.torch_utils import 
set_random_seed device = torch.device("cuda" if torch.cuda.is_available() else "cpu") @@ -24,7 +25,7 @@ def generate_test_data( device: torch.device, ): """Generate test data for given configuration.""" - current_platform.seed_everything(42) + set_random_seed(42) # Create 2D positions (3, num_tokens) for multimodal case positions = torch.randint( 0, max_position_embeddings // 4, (3, num_tokens), device=device diff --git a/tests/kernels/core/test_pos_encoding.py b/tests/kernels/core/test_pos_encoding.py index d18f01314c8f..c7715ca35e72 100644 --- a/tests/kernels/core/test_pos_encoding.py +++ b/tests/kernels/core/test_pos_encoding.py @@ -9,7 +9,7 @@ from tests.kernels.allclose_default import get_default_atol, get_default_rtol from vllm.model_executor.layers.rotary_embedding import get_rope -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed IS_NEOX_STYLE = [True, False] DTYPES = [torch.bfloat16, torch.float] @@ -79,7 +79,7 @@ def test_rotary_embedding( if rotary_dim is None: rotary_dim = head_size - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) if rotary_dim is None: rotary_dim = head_size diff --git a/tests/kernels/mamba/test_causal_conv1d.py b/tests/kernels/mamba/test_causal_conv1d.py index 4647b97c4771..d16205694971 100644 --- a/tests/kernels/mamba/test_causal_conv1d.py +++ b/tests/kernels/mamba/test_causal_conv1d.py @@ -12,7 +12,7 @@ causal_conv1d_fn, causal_conv1d_update, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed def causal_conv1d_ref( @@ -154,7 +154,7 @@ def test_causal_conv1d_update(dim, width, seqlen, has_bias, silu_activation, ity if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) batch = 2 x = torch.randn(batch, dim, seqlen, device=device, dtype=itype) x_ref = x.clone() @@ -201,7 +201,7 @@ def test_causal_conv1d_update_with_batch_gather( rtol, atol = 1e-2, 5e-2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) padding = 5 if with_padding else 0 padded_batch_size = batch_size + padding @@ -278,7 +278,7 @@ def test_causal_conv1d_varlen( if itype == torch.bfloat16: rtol, atol = 1e-2, 5e-2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) seqlens = [] batch_size = batch padding = 3 if with_padding else 0 diff --git a/tests/kernels/mamba/test_mamba_mixer2.py b/tests/kernels/mamba/test_mamba_mixer2.py index 6fca33acd48a..98879ff6ed7f 100644 --- a/tests/kernels/mamba/test_mamba_mixer2.py +++ b/tests/kernels/mamba/test_mamba_mixer2.py @@ -12,8 +12,8 @@ initialize_model_parallel, ) from vllm.model_executor.layers.mamba.mamba_mixer2 import Mixer2RMSNormGated -from vllm.platforms import current_platform from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed @multi_gpu_test(num_gpus=2) @@ -68,7 +68,7 @@ def mixer2_gated_norm_tensor_parallel( dtype: torch.dtype, device: str, ): - current_platform.seed_everything(0) + set_random_seed(0) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/kernels/mamba/test_mamba_ssm.py b/tests/kernels/mamba/test_mamba_ssm.py index 50e48aad6eba..f50ab5344b15 100644 --- a/tests/kernels/mamba/test_mamba_ssm.py +++ b/tests/kernels/mamba/test_mamba_ssm.py @@ -13,7 +13,7 @@ selective_scan_fn, selective_state_update, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import 
set_random_seed def selective_state_update_ref( @@ -271,7 +271,7 @@ def test_selective_scan( rtolw = max(rtolw, rtol) atolw = max(atolw, atol) # set seed - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 1 dim = 4 dstate = 8 @@ -401,7 +401,7 @@ def test_selective_state_update(dim, dstate, has_z, itype): if torch.version.hip: atol *= 2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 1 state = torch.randn(batch_size, dim, dstate, dtype=itype, device=device) x = torch.randn(batch_size, dim, device=device, dtype=itype) @@ -438,7 +438,7 @@ def test_selective_state_update_varlen(dim, dstate, has_z, itype, max_seq_len): if torch.version.hip: atol *= 2 # set seed - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 4 token_counts = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) total_tokens = int(token_counts.sum().item()) @@ -857,7 +857,7 @@ def test_selective_state_update_with_num_accepted_tokens( if torch.version.hip: atol *= 2 - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 4 tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) @@ -983,7 +983,7 @@ def test_selective_state_update_varlen_with_num_accepted( if torch.version.hip: atol *= 2 - current_platform.seed_everything(0) + set_random_seed(0) batch_size = 4 tokens_per_seq = torch.randint(1, max_seq_len + 1, (batch_size,), device=device) diff --git a/tests/kernels/mamba/test_mamba_ssm_ssd.py b/tests/kernels/mamba/test_mamba_ssm_ssd.py index 0b0b82e484a1..40aa3d017d78 100644 --- a/tests/kernels/mamba/test_mamba_ssm_ssd.py +++ b/tests/kernels/mamba/test_mamba_ssm_ssd.py @@ -9,7 +9,7 @@ from vllm.model_executor.layers.mamba.ops.ssd_combined import ( mamba_chunk_scan_combined_varlen, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from vllm.v1.attention.backends.mamba2_attn import compute_varlen_chunk_metadata # Added by the IBM Team, 2024 @@ -82,7 +82,7 @@ def ssd_minimal_discrete(X, A, B, C, block_len, initial_states=None): def generate_random_inputs(batch_size, seqlen, n_heads, d_head, itype, device="cuda"): - current_platform.seed_everything(0) + set_random_seed(0) A = -torch.exp(torch.rand(n_heads, dtype=itype, device=device)) dt = F.softplus( torch.randn(batch_size, seqlen, n_heads, dtype=itype, device=device) - 4 diff --git a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py index 95db6327c4f1..08e50c52cbed 100644 --- a/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py +++ b/tests/kernels/moe/modular_kernel_tools/make_feature_matrix.py @@ -10,7 +10,7 @@ from vllm.config import VllmConfig, set_current_vllm_config from vllm.model_executor.layers.fused_moe.config import FUSED_MOE_UNQUANTIZED_CONFIG -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from .common import ( Config, @@ -40,7 +40,7 @@ def rank_worker( config: Config, weights: WeightTensors, ): - current_platform.seed_everything(pgi.rank) + set_random_seed(pgi.rank) # sanity check from vllm import envs diff --git a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py index a3e264c5f5e2..3cdc7b82130b 100644 --- a/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py +++ b/tests/kernels/moe/modular_kernel_tools/profile_modular_kernel.py @@ -9,7 +9,7 @@ import torch from 
vllm.config import VllmConfig -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from .common import Config, RankTensors, WeightTensors, make_modular_kernel from .parallel_utils import ProcessGroupInfo, parallel_launch_with_config @@ -82,7 +82,7 @@ def rank_worker( config: Config, weights: WeightTensors, ): - current_platform.seed_everything(pgi.rank) + set_random_seed(pgi.rank) # sanity check from vllm import envs diff --git a/tests/kernels/moe/test_batched_moe.py b/tests/kernels/moe/test_batched_moe.py index 2ef170f1ab30..c9d425b5b990 100644 --- a/tests/kernels/moe/test_batched_moe.py +++ b/tests/kernels/moe/test_batched_moe.py @@ -21,6 +21,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.platforms import current_platform from vllm.triton_utils import tl +from vllm.utils.torch_utils import set_random_seed MNK_FACTORS = [ (1, 128, 128), @@ -115,7 +116,7 @@ def test_batched_mm( ): """Note: float8_e4m3fn is not supported on CUDA architecture < 89, and those tests will be skipped on unsupported hardware.""" - current_platform.seed_everything(7) + set_random_seed(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn @@ -252,7 +253,7 @@ def test_fused_moe_batched_experts( ): """Note: float8_e4m3fn is not supported on CUDA architecture < 89, and those tests will be skipped on unsupported hardware.""" - current_platform.seed_everything(7) + set_random_seed(7) use_fp8_w8a8 = dtype == torch.float8_e4m3fn diff --git a/tests/kernels/moe/test_cpu_fused_moe.py b/tests/kernels/moe/test_cpu_fused_moe.py index 4dda45a6c740..f2c1d0382981 100644 --- a/tests/kernels/moe/test_cpu_fused_moe.py +++ b/tests/kernels/moe/test_cpu_fused_moe.py @@ -8,6 +8,7 @@ from vllm._custom_ops import cpu_fused_moe, cpu_prepack_moe_weight from vllm.model_executor.layers.activation import SiluAndMul, SwigluOAIAndMul from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not current_platform.is_cpu(): pytest.skip("skipping CPU-only tests", allow_module_level=True) @@ -114,7 +115,7 @@ def test_cpu_fused_moe( act: str, isa: str, ): - current_platform.seed_everything(0) + set_random_seed(0) topk_num = max(expert_num // 2, 1) up_dim = 2 * intermediate_size diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index 0160694d7bb5..4a57affdfbf4 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -20,6 +20,7 @@ from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.model_executor.layers.fused_moe.utils import moe_kernel_quantize_input from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_EXPERTS = [40, 64] TOP_KS = [6, 8] @@ -277,7 +278,7 @@ def test_cutlass_moe_8_bit_no_graph( workspace_init, ep_size: int | None = None, ): - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit(m, k, n, e, per_act_token, per_out_ch) @@ -332,7 +333,7 @@ def test_cutlass_moe_8_bit_cuda_graph( monkeypatch, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): dtype = torch.half @@ -469,7 +470,7 @@ def test_run_cutlass_moe_fp8( ep_size: int, workspace_init, ): - current_platform.seed_everything(7) + 
set_random_seed(7) with set_current_vllm_config(vllm_config): mt = MOETensors8Bit.make_moe_tensors_8bit( m, k, n, e, per_act_token, per_out_channel diff --git a/tests/kernels/moe/test_deepep_deepgemm_moe.py b/tests/kernels/moe/test_deepep_deepgemm_moe.py index f427734ef09e..8987b688ab4a 100644 --- a/tests/kernels/moe/test_deepep_deepgemm_moe.py +++ b/tests/kernels/moe/test_deepep_deepgemm_moe.py @@ -22,13 +22,13 @@ ) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel -from vllm.platforms import current_platform from vllm.utils.deep_gemm import ( get_mk_alignment_for_contiguous_layout, is_deep_gemm_e8m0_used, is_deep_gemm_supported, ) from vllm.utils.import_utils import has_deep_ep, has_deep_gemm +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import init_workspace_manager from ...utils import multi_gpu_test @@ -367,7 +367,7 @@ def _test_deepep_deepgemm_moe( device = torch.device(f"cuda:{pgi.local_rank}") init_workspace_manager(device) - current_platform.seed_everything(pgi.rank) + set_random_seed(pgi.rank) w1 = w1.to(device=torch.cuda.current_device()) w2 = w2.to(device=torch.cuda.current_device()) @@ -456,7 +456,7 @@ def test_ht_deepep_deepgemm_moe( """ m, n, k = mnk - current_platform.seed_everything(7) + set_random_seed(7) if topk > num_experts: pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") @@ -531,7 +531,7 @@ def test_ll_deepep_deepgemm_moe( assert not is_deep_gemm_e8m0_used() m, n, k = mnk - current_platform.seed_everything(7) + set_random_seed(7) if topk > num_experts: pytest.skip(f"Skipping test: topk={topk} > E={num_experts}") diff --git a/tests/kernels/moe/test_deepep_moe.py b/tests/kernels/moe/test_deepep_moe.py index e698ca92a151..e57e0d72067e 100644 --- a/tests/kernels/moe/test_deepep_moe.py +++ b/tests/kernels/moe/test_deepep_moe.py @@ -20,8 +20,8 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import ( per_token_group_quant_fp8, ) -from vllm.platforms import current_platform from vllm.utils.import_utils import has_deep_ep +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import init_workspace_manager from ...utils import multi_gpu_test @@ -446,7 +446,7 @@ def test_deep_ep_moe( low_latency_mode = False use_fp8_dispatch = False - current_platform.seed_everything(7) + set_random_seed(7) world_size, dp_size = world_dp_size config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts) @@ -507,7 +507,7 @@ def test_low_latency_deep_ep_moe( f"hidden sizes {DeepEPLLPrepareAndFinalize.SUPPORTED_HIDDEN_SIZES}" ) - current_platform.seed_everything(7) + set_random_seed(7) world_size, dp_size = world_dp_size config = TestConfig(dtype=dtype, topk=topk, m=m, k=k, n=n, num_experts=num_experts) diff --git a/tests/kernels/moe/test_flashinfer.py b/tests/kernels/moe/test_flashinfer.py index bf4ef2d30466..c23107965340 100644 --- a/tests/kernels/moe/test_flashinfer.py +++ b/tests/kernels/moe/test_flashinfer.py @@ -15,13 +15,14 @@ from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( apply_flashinfer_per_tensor_scale_fp8, flashinfer_cutlass_moe_fp8, - register_moe_scaling_factors, + register_scales_for_trtllm_fp8_per_tensor_moe, rotate_flashinfer_fp8_moe_weights, swap_w13_to_w31, ) from vllm.model_executor.layers.quantization.utils.fp8_utils import input_to_float8 from vllm.model_executor.models.llama4 import Llama4MoE from vllm.platforms import 
current_platform +from vllm.utils.torch_utils import set_random_seed try: from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe @@ -84,7 +85,7 @@ class TestData: @staticmethod def make_moe_tensors_8bit( - m: int, k: int, n: int, e: int, reorder: bool, activation: str = "silu" + m: int, k: int, n: int, e: int, is_trtllm: bool, activation: str = "silu" ) -> "TestData": is_gated = activation != "relu2_no_mul" @@ -122,12 +123,17 @@ def make_moe_tensors_8bit( all2all_backend="naive", ) - register_moe_scaling_factors(layer) - # flashinfer expects swapped rows for w13 layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) - if reorder: + if is_trtllm: rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight) + register_scales_for_trtllm_fp8_per_tensor_moe( + layer, + layer.w13_weight_scale, + layer.w13_input_scale, + layer.w2_weight_scale, + layer.w2_input_scale, + ) layer.custom_routing_function = Llama4MoE.custom_routing_function layer.intermediate_size_per_partition = n layer.ep_rank = 0 @@ -158,10 +164,10 @@ def test_flashinfer_per_tensor_moe_fp8_no_graph( ): if not current_platform.has_device_capability(100): pytest.skip("Test is only supported for sm >= 100") - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): - td = TestData.make_moe_tensors_8bit(m, k, n, e, reorder=True) + td = TestData.make_moe_tensors_8bit(m, k, n, e, is_trtllm=True) score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) topk_weights, topk_ids = Llama4MoE.custom_routing_function( @@ -222,11 +228,11 @@ def test_flashinfer_cutlass_moe_fp8_no_graph( monkeypatch, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", "8192") with set_current_vllm_config(vllm_config): td = TestData.make_moe_tensors_8bit( - m, k, n, e, reorder=False, activation=activation + m, k, n, e, is_trtllm=False, activation=activation ) score = torch.randn((m, e), device="cuda", dtype=torch.bfloat16) diff --git a/tests/kernels/moe/test_flashinfer_moe.py b/tests/kernels/moe/test_flashinfer_moe.py index 133a8a4a30a6..1262eea70bab 100644 --- a/tests/kernels/moe/test_flashinfer_moe.py +++ b/tests/kernels/moe/test_flashinfer_moe.py @@ -23,6 +23,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe +from vllm.utils.torch_utils import set_random_seed if not has_flashinfer_cutlass_fused_moe() or not current_platform.has_device_capability( 100 @@ -60,7 +61,7 @@ def test_flashinfer_fp4_moe_no_graph( activation: str, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): diff --git a/tests/kernels/moe/test_grouped_topk.py b/tests/kernels/moe/test_grouped_topk.py index d26fe50b815b..96d793d85dc2 100644 --- a/tests/kernels/moe/test_grouped_topk.py +++ b/tests/kernels/moe/test_grouped_topk.py @@ -8,11 +8,18 @@ import pytest import torch +from vllm.config import ( + CompilationConfig, + VllmConfig, + get_cached_compilation_config, + set_current_vllm_config, +) from vllm.model_executor.layers.fused_moe.fused_moe import ( GroupedTopk, fused_grouped_topk, ) from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed @pytest.mark.skipif( @@ -41,14 +48,19 
@@ def test_grouped_topk( routed_scaling_factor: float, dtype: torch.dtype, ): - current_platform.seed_everything(0) + vllm_config = VllmConfig( + compilation_config=CompilationConfig(custom_ops=["all", "+grouped_topk"]) + ) + get_cached_compilation_config.cache_clear() + + set_random_seed(0) hidden_states = torch.randn((n_token, n_hidden), dtype=dtype, device="cuda") gating_output = torch.randn((n_token, n_expert), dtype=dtype, device="cuda") e_score_correction_bias = torch.randn( (n_expert,), dtype=torch.float32, device="cuda" ) - with monkeypatch.context() as m: + with set_current_vllm_config(vllm_config), monkeypatch.context() as m: m.setenv("VLLM_USE_FUSED_MOE_GROUPED_TOPK", "0") grouped_topk = GroupedTopk( topk=topk, @@ -58,6 +70,7 @@ def test_grouped_topk( scoring_func=scoring_func, routed_scaling_factor=routed_scaling_factor, ) + assert grouped_topk._forward_method.__name__ == "forward_cuda" baseline_topk_weights, baseline_topk_ids = grouped_topk( hidden_states=hidden_states, gating_output=gating_output, diff --git a/tests/kernels/moe/test_modular_kernel_combinations.py b/tests/kernels/moe/test_modular_kernel_combinations.py index 6ebf1016c166..ec31e66140a1 100644 --- a/tests/kernels/moe/test_modular_kernel_combinations.py +++ b/tests/kernels/moe/test_modular_kernel_combinations.py @@ -15,7 +15,7 @@ from vllm.platforms import current_platform from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe from vllm.utils.import_utils import has_deep_ep, has_deep_gemm, has_pplx -from vllm.utils.torch_utils import cuda_device_count_stateless +from vllm.utils.torch_utils import cuda_device_count_stateless, set_random_seed from vllm.v1.worker.workspace import init_workspace_manager from .modular_kernel_tools.common import ( @@ -82,7 +82,7 @@ def rank_worker( device = torch.device(f"cuda:{pgi.local_rank}") init_workspace_manager(device) - current_platform.seed_everything(pgi.rank) + set_random_seed(pgi.rank) # sanity check from vllm import envs diff --git a/tests/kernels/moe/test_modular_oai_triton_moe.py b/tests/kernels/moe/test_modular_oai_triton_moe.py index 1abb08f878b2..8733ba4d8e31 100644 --- a/tests/kernels/moe/test_modular_oai_triton_moe.py +++ b/tests/kernels/moe/test_modular_oai_triton_moe.py @@ -34,6 +34,7 @@ ) from vllm.model_executor.layers.utils import shuffle_weight from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed MNK = [ (1, 512, 384), @@ -211,7 +212,7 @@ def test_oai_triton_moe( unfused: bool, workspace_init, ): - current_platform.seed_everything(0) + set_random_seed(0) ( w1, w2, diff --git a/tests/kernels/moe/test_moe.py b/tests/kernels/moe/test_moe.py index fd6ce6bfbd78..0c814bbde4cb 100644 --- a/tests/kernels/moe/test_moe.py +++ b/tests/kernels/moe/test_moe.py @@ -60,6 +60,7 @@ from vllm.model_executor.models.mixtral import MixtralMoE from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import init_workspace_manager NUM_EXPERTS = [8, 64, 192] @@ -234,7 +235,7 @@ def test_fused_moe( monkeypatch, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) monkeypatch.setenv("VLLM_FUSED_MOE_CHUNK_SIZE", str(chunk_size)) diff --git a/tests/kernels/moe/test_moe_align_block_size.py b/tests/kernels/moe/test_moe_align_block_size.py index 1abfc11fb460..652a2ee21614 100644 --- a/tests/kernels/moe/test_moe_align_block_size.py +++ b/tests/kernels/moe/test_moe_align_block_size.py @@ 
-14,12 +14,13 @@ ) from vllm.platforms import current_platform from vllm.utils.math_utils import round_up +from vllm.utils.torch_utils import set_random_seed NUM_TOKENS = [1, 3, 256, 2256, 4096] NUM_EXPERTS = [32, 160, 256, 257] TOP_KS = [1, 2, 16, 32] BLOCK_SIZES = [32, 128] -current_platform.seed_everything(0) +set_random_seed(0) def _group_tokens_by_expert( diff --git a/tests/kernels/moe/test_moe_permute_unpermute.py b/tests/kernels/moe/test_moe_permute_unpermute.py index 12dd322dccc5..45127ce0ac63 100644 --- a/tests/kernels/moe/test_moe_permute_unpermute.py +++ b/tests/kernels/moe/test_moe_permute_unpermute.py @@ -17,11 +17,12 @@ moe_unpermute, ) from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_EXPERTS = [16, 64, 256] TOP_KS = [2, 6, 8] EP_SIZE = [1, 4, 16] -current_platform.seed_everything(0) +set_random_seed(0) if current_platform.is_rocm(): pytest.skip( @@ -226,7 +227,7 @@ def test_moe_permute_unpermute( n_local_expert, expert_map, _ = determine_expert_map(ep_size, ep_rank, n_expert) expert_map = expert_map.cuda() start_expert = n_local_expert * ep_rank - current_platform.seed_everything(0) + set_random_seed(0) hidden_states = torch.randn((n_token, n_hidden), device="cuda").to(dtype) gating_output = torch.randn((n_token, n_expert), device="cuda").to(dtype) topk_weights, topk_ids, token_expert_indices = fused_topk( diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index e67bd76a1618..fd7388e1cff8 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -16,6 +16,7 @@ from vllm.model_executor.layers.fused_moe.cutlass_moe import cutlass_moe_fp4 from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -42,7 +43,7 @@ def test_cutlass_fp4_moe_no_graph( m: int, n: int, k: int, e: int, topk: int, dtype: torch.dtype, workspace_init ): - current_platform.seed_everything(7) + set_random_seed(7) with set_current_vllm_config( VllmConfig(parallel_config=ParallelConfig(pipeline_parallel_size=1)) ): diff --git a/tests/kernels/moe/test_pplx_cutlass_moe.py b/tests/kernels/moe/test_pplx_cutlass_moe.py index dd4eb4da913b..3a5801ae4996 100644 --- a/tests/kernels/moe/test_pplx_cutlass_moe.py +++ b/tests/kernels/moe/test_pplx_cutlass_moe.py @@ -14,6 +14,7 @@ from vllm.model_executor.layers.fused_moe.modular_kernel import FusedMoEModularKernel from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv +from vllm.utils.torch_utils import set_random_seed from ...utils import multi_gpu_test from .parallel_utils import ProcessGroupInfo, parallel_launch @@ -290,7 +291,7 @@ def test_cutlass_moe_pplx( world_dp_size: tuple[int, int], use_internode: bool, ): - current_platform.seed_everything(7) + set_random_seed(7) with set_current_vllm_config(vllm_config): dtype = torch.half diff --git a/tests/kernels/moe/test_pplx_moe.py b/tests/kernels/moe/test_pplx_moe.py index 35e554e16cb3..c08a54f0e9f6 100644 --- a/tests/kernels/moe/test_pplx_moe.py +++ b/tests/kernels/moe/test_pplx_moe.py @@ -44,8 +44,8 @@ from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceDelegate, ) -from vllm.platforms import current_platform from vllm.utils.math_utils import round_up +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.workspace import 
init_workspace_manager from ...utils import multi_gpu_test @@ -184,7 +184,7 @@ def test_fused_moe_batched_experts( dtype: torch.dtype, workspace_init, ): - current_platform.seed_everything(7) + set_random_seed(7) a = torch.randn((m, k), device="cuda", dtype=dtype) / 10 w1 = torch.randn((e, 2 * n, k), device="cuda", dtype=dtype) / 10 @@ -491,7 +491,7 @@ def test_pplx_prepare_finalize_slow( if per_act_token_quant and block_shape is not None: pytest.skip("Skip illegal quantization combination") - current_platform.seed_everything(7) + set_random_seed(7) m, n, k = mnk world_size, dp_size = world_dp_size device = "cuda" @@ -809,7 +809,7 @@ def test_pplx_moe_slow( block_shape: list[int] | None, use_internode: bool, ): - current_platform.seed_everything(7) + set_random_seed(7) m, n, k = mnk world_size, dp_size = world_dp_size @@ -888,7 +888,7 @@ def format_result(msg, ex=None): new_vllm_config.parallel_config.enable_expert_parallel = True _set_vllm_config(new_vllm_config, pgi.world_size, pgi.rank, pgi.local_rank) - current_platform.seed_everything(7) + set_random_seed(7) combos = itertools.product( PPLX_COMBOS, NUM_EXPERTS, TOP_KS, DTYPES, [False, True], [None, [128, 128]] ) @@ -982,7 +982,7 @@ def test_pplx_prepare_finalize( world_dp_size: tuple[int, int], use_internode: bool, ): - current_platform.seed_everything(7) + set_random_seed(7) world_size, dp_size = world_dp_size parallel_launch( world_size * dp_size, @@ -1005,7 +1005,7 @@ def test_pplx_moe( use_internode: bool, use_shared_experts: bool, ): - current_platform.seed_everything(7) + set_random_seed(7) world_size, dp_size = world_dp_size parallel_launch( world_size, diff --git a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py index b220205759e2..62b7ecb17fbe 100644 --- a/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py +++ b/tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py @@ -13,6 +13,7 @@ from vllm.platforms import current_platform from vllm.utils.deep_gemm import DeepGemmQuantScaleFMT, has_deep_gemm from vllm.utils.math_utils import cdiv, round_up +from vllm.utils.torch_utils import set_random_seed if current_platform.is_fp8_fnuz(): pytest.skip( @@ -201,7 +202,7 @@ def token_random(E, T, H2, tokens_per_expert): @torch.inference_mode() def test_silu_mul_fp8_quant_deep_gemm(E: int, T: int, H: int, fp8_type: torch.dtype): group_size = 128 - current_platform.seed_everything(42) + set_random_seed(42) tokens_per_expert = torch.randint( low=0, diff --git a/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py index ace0794fea69..cca02928b498 100644 --- a/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py +++ b/tests/kernels/moe/test_silu_mul_per_token_group_quant_fp8_colmajor.py @@ -11,6 +11,7 @@ from vllm.platforms import current_platform from vllm.triton_utils import triton from vllm.utils.deep_gemm import is_deep_gemm_e8m0_used +from vllm.utils.torch_utils import set_random_seed FLOAT8_DTYPE = torch.float8_e4m3fn GROUP_SIZE = 128 @@ -72,7 +73,7 @@ def reference(x: torch.Tensor, use_ue8m0: bool) -> tuple[torch.Tensor, torch.Ten reason="ROCm does not support DeepGemm.", ) def test_silu_mul_fp8_quant_deep_gemm(T: int, N: int): - current_platform.seed_everything(42) + set_random_seed(42) input = torch.rand((T, N), dtype=torch.bfloat16, device="cuda") diff --git a/tests/kernels/quantization/test_awq_triton.py b/tests/kernels/quantization/test_awq_triton.py index 
069bd7435534..337bc177e6df 100644 --- a/tests/kernels/quantization/test_awq_triton.py +++ b/tests/kernels/quantization/test_awq_triton.py @@ -13,7 +13,7 @@ awq_dequantize_triton, awq_gemm_triton, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed device = "cuda" @@ -86,7 +86,7 @@ def test_dequantize(qweight_rows, qweight_cols, group_size): zeros_cols = qweight_cols zeros_dtype = torch.int32 - current_platform.seed_everything(0) + set_random_seed(0) qweight = torch.randint( 0, @@ -141,7 +141,7 @@ def test_gemm(N, K, M, splitK, group_size): qzeros_rows = scales_rows qzeros_cols = qweight_cols - current_platform.seed_everything(0) + set_random_seed(0) input = torch.rand((input_rows, input_cols), dtype=input_dtype, device=device) qweight = torch.randint( diff --git a/tests/kernels/quantization/test_cutlass_w4a8_moe.py b/tests/kernels/quantization/test_cutlass_w4a8_moe.py index a855f7333b61..de0e347d8fe7 100644 --- a/tests/kernels/quantization/test_cutlass_w4a8_moe.py +++ b/tests/kernels/quantization/test_cutlass_w4a8_moe.py @@ -17,6 +17,7 @@ ) from vllm.platforms import current_platform from vllm.scalar_type import ScalarType, scalar_types +from vllm.utils.torch_utils import set_random_seed IS_SUPPORTED_BY_GPU = ( current_platform.is_cuda() and current_platform.get_device_capability()[0] >= 9 @@ -248,7 +249,7 @@ def compute_moe_reference_output(setup: MoETestSetup) -> torch.Tensor: @pytest.mark.parametrize("random_zero", [True, False]) def test_cutlass_w4a8_moe_mm_end_to_end(shape, random_zero): num_experts, N, K = shape - current_platform.seed_everything(42) + set_random_seed(42) setup = make_moe_test_setup( num_experts=num_experts, K=K, N=N, max_blocks=64, random_zero=random_zero ) @@ -308,7 +309,7 @@ def forward(self, a: torch.Tensor) -> torch.Tensor: reason="W4A8 Grouped GEMM is not supported on this GPU type.", ) def test_cutlass_w4a8_moe_mm_cuda_graph(): - current_platform.seed_everything(42) + set_random_seed(42) # Fixed config for CUDA graph test (single parameter point). 
num_experts = 8 K = 512 diff --git a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py index 1e5c7dafb0f5..94fa38b5aae4 100644 --- a/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py +++ b/tests/kernels/quantization/test_flashinfer_nvfp4_scaled_mm.py @@ -12,6 +12,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils.flashinfer import flashinfer_scaled_fp4_mm +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -72,7 +73,7 @@ def test_flashinfer_nvfp4_gemm( if backend == "trtllm" and dtype == torch.float16: pytest.skip("Only torch.bfloat16 is supported for TRTLLM FP4 GEMM operations") - current_platform.seed_everything(seed) + set_random_seed(seed) m, n, packed_k = shape k = packed_k * 2 block_size = 16 diff --git a/tests/kernels/quantization/test_flashinfer_scaled_mm.py b/tests/kernels/quantization/test_flashinfer_scaled_mm.py index b30821b6895b..2c945ffcc4cd 100644 --- a/tests/kernels/quantization/test_flashinfer_scaled_mm.py +++ b/tests/kernels/quantization/test_flashinfer_scaled_mm.py @@ -6,6 +6,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.utils.flashinfer import flashinfer_scaled_fp8_mm +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -38,7 +39,7 @@ def test_flashinfer_fp8_gemm( device: str, autotune: bool, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) m, n, k = shape a = torch.randn((m, k), dtype=dtype, device=device) b = torch.randn((n, k), dtype=dtype, device=device) / k diff --git a/tests/kernels/quantization/test_fp8_quant.py b/tests/kernels/quantization/test_fp8_quant.py index 19aa21b96a57..452206495299 100644 --- a/tests/kernels/quantization/test_fp8_quant.py +++ b/tests/kernels/quantization/test_fp8_quant.py @@ -11,7 +11,7 @@ ref_dynamic_per_token_quant, ) from tests.kernels.utils import opcheck -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.bfloat16, torch.float] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] @@ -51,7 +51,7 @@ def opcheck_fp8_quant( def test_dynamic_per_token_fp8_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, scale_ub: bool, seed: int ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) x = ( torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") + 1e-6 @@ -81,7 +81,7 @@ def test_dynamic_per_token_fp8_quant( def test_dynamic_per_tensor_fp8_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") @@ -101,7 +101,7 @@ def test_dynamic_per_tensor_fp8_quant( @torch.inference_mode() @pytest.mark.parametrize("seed", SEEDS) def test_fp8_quant_large(seed: int) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) num_tokens = 1024000 # Mistral-Nemo's max_position_embeddings hidden_size = 1152 # Smallest hidden_size to reproduce the error diff --git a/tests/kernels/quantization/test_fp8_quant_group.py b/tests/kernels/quantization/test_fp8_quant_group.py index f5e1cde94b6e..989bcf81a01c 100644 --- a/tests/kernels/quantization/test_fp8_quant_group.py +++ b/tests/kernels/quantization/test_fp8_quant_group.py @@ -7,7 +7,7 @@ from 
vllm.model_executor.layers.quantization.input_quant_fp8 import QuantFP8 from vllm.model_executor.layers.quantization.utils.quant_utils import GroupShape -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed @pytest.mark.parametrize( @@ -30,7 +30,7 @@ def test_quantfp8_group_functionality( Tests both CUDA and native implementations, column-major scales, and verifies consistency between implementations. """ - current_platform.seed_everything(seed) + set_random_seed(seed) x = torch.randn((batch_size, hidden_dim), dtype=torch.bfloat16, device="cuda") * 8 expected_num_groups = (hidden_dim + group_size - 1) // group_size @@ -83,7 +83,7 @@ def test_quantfp8_group_functionality( @pytest.mark.parametrize("use_ue8m0", [True, False]) @torch.inference_mode() def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) group_size = 64 @@ -136,7 +136,7 @@ def test_quantfp8_group_multidimensional(seed: int, use_ue8m0: bool) -> None: @pytest.mark.parametrize("seed", [42]) @torch.inference_mode() def test_quantfp8_group_edge_cases(seed: int) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) batch_size = 16 group_size = 64 diff --git a/tests/kernels/quantization/test_gguf.py b/tests/kernels/quantization/test_gguf.py index 0988ba01759f..912d5fee4e59 100644 --- a/tests/kernels/quantization/test_gguf.py +++ b/tests/kernels/quantization/test_gguf.py @@ -11,7 +11,7 @@ import vllm._custom_ops as ops from vllm.model_executor.layers.fused_moe import fused_experts from vllm.model_executor.layers.quantization.gguf import _fused_moe_gguf -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample") GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample") @@ -91,7 +91,7 @@ def test_dequantize( @pytest.mark.parametrize("quant_type", QUANT_TYPES) @torch.inference_mode() def test_mmvq(hidden_size: int, dtype: torch.dtype, quant_type: GGMLQuantizationType): - current_platform.seed_everything(0) + set_random_seed(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") @@ -134,7 +134,7 @@ def test_mmq( dtype: torch.dtype, quant_type: GGMLQuantizationType, ): - current_platform.seed_everything(0) + set_random_seed(0) tensors = get_gguf_sample_tensors(hidden_size, quant_type) x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") @@ -169,7 +169,7 @@ def test_moe( quant_type: GGMLQuantizationType, top_k: int, ): - current_platform.seed_everything(0) + set_random_seed(0) H, E = 1024, 256 x = torch.rand((num_tokens, H), dtype=dtype, device="cuda") diff --git a/tests/kernels/quantization/test_int8_quant.py b/tests/kernels/quantization/test_int8_quant.py index 48e947db5fa7..cb2cd55facfd 100644 --- a/tests/kernels/quantization/test_int8_quant.py +++ b/tests/kernels/quantization/test_int8_quant.py @@ -7,7 +7,7 @@ from tests.kernels.quant_utils import ref_dynamic_per_token_quant from tests.kernels.utils import opcheck from vllm._custom_ops import scaled_int8_quant -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed DTYPES = [torch.bfloat16, torch.float] HIDDEN_SIZES = [17, 1024, 1025, 1026, 5137, 8193] @@ -46,7 +46,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True): def test_dynamic_scaled_int8_quant( num_tokens: int, hidden_size: int, 
dtype: torch.dtype, seed: int ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -70,7 +70,7 @@ def test_dynamic_scaled_int8_quant( def test_dynamic_scaled_int8_azp_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 @@ -111,7 +111,7 @@ def test_dynamic_scaled_int8_azp_quant( def test_static_scaled_int8_quant( num_tokens: int, hidden_size: int, dtype: torch.dtype, seed: int, scale: float ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 @@ -144,7 +144,7 @@ def test_static_scaled_int8_azp_quant( scale: float, azp: int, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) int8_traits = torch.iinfo(torch.int8) x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000 - 300 diff --git a/tests/kernels/quantization/test_mxfp4_qutlass.py b/tests/kernels/quantization/test_mxfp4_qutlass.py index 0bacbef2046b..0ad8e48ab159 100644 --- a/tests/kernels/quantization/test_mxfp4_qutlass.py +++ b/tests/kernels/quantization/test_mxfp4_qutlass.py @@ -24,6 +24,7 @@ from vllm._custom_ops import fusedQuantizeMx, matmul_mxf4_bf16_tn from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not torch.cuda.is_available(): pytest.skip("CUDA required for these tests.", allow_module_level=True) @@ -205,7 +206,7 @@ def _forward_quantize_ref( @pytest.fixture(autouse=True) def _seed_each_test(): - current_platform.seed_everything(0) + set_random_seed(0) np.random.seed(0) torch.random.manual_seed(0) diff --git a/tests/kernels/quantization/test_nvfp4_quant.py b/tests/kernels/quantization/test_nvfp4_quant.py index 12f1008ecf27..d17c69663fbd 100644 --- a/tests/kernels/quantization/test_nvfp4_quant.py +++ b/tests/kernels/quantization/test_nvfp4_quant.py @@ -6,6 +6,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform from vllm.scalar_type import scalar_types +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -134,7 +135,7 @@ def test_quantize_to_fp4( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device(device) m, n = shape @@ -156,7 +157,7 @@ def test_quantize_to_fp4( @torch.inference_mode() def test_quantize_to_fp4_padded(pad_shape: tuple[int, int]) -> None: dtype = torch.float16 - current_platform.seed_everything(42) + set_random_seed(42) torch.set_default_device("cuda:0") m, n = pad_shape diff --git a/tests/kernels/quantization/test_nvfp4_qutlass.py b/tests/kernels/quantization/test_nvfp4_qutlass.py index 3824a080f504..bb25c4ab9aaf 100644 --- a/tests/kernels/quantization/test_nvfp4_qutlass.py +++ b/tests/kernels/quantization/test_nvfp4_qutlass.py @@ -25,6 +25,7 @@ from vllm._custom_ops import fusedQuantizeNv from vllm.model_executor.layers.quantization.qutlass_utils import to_blocked from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not torch.cuda.is_available(): pytest.skip("CUDA required for these 
tests.", allow_module_level=True) @@ -193,7 +194,7 @@ def _forward_quantize_ref(x: torch.Tensor, h: torch.Tensor, rot_size: int): @pytest.fixture(autouse=True) def _seed_each_test(): - current_platform.seed_everything(0) + set_random_seed(0) np.random.seed(0) torch.random.manual_seed(0) diff --git a/tests/kernels/quantization/test_nvfp4_scaled_mm.py b/tests/kernels/quantization/test_nvfp4_scaled_mm.py index 434564737c88..e7e16817593b 100644 --- a/tests/kernels/quantization/test_nvfp4_scaled_mm.py +++ b/tests/kernels/quantization/test_nvfp4_scaled_mm.py @@ -6,6 +6,7 @@ from vllm import _custom_ops as ops from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -59,7 +60,7 @@ def test_nvfp4_gemm( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) m, n, packed_k = shape k = packed_k * 2 block_size = 16 diff --git a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py index 4617464a3978..1c9140007f38 100644 --- a/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py +++ b/tests/kernels/quantization/test_silu_mul_nvfp4_quant.py @@ -11,6 +11,7 @@ from vllm._custom_ops import scaled_fp4_quant from vllm.model_executor.layers.activation import SiluAndMul from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed if not current_platform.has_device_capability(100): pytest.skip( @@ -33,7 +34,7 @@ def test_silu_mul_nvfp4_quant( dtype: torch.dtype, shape: tuple[int, int], ) -> None: - current_platform.seed_everything(42) + set_random_seed(42) device = "cuda:0" torch.set_default_device(device) diff --git a/tests/kernels/quantization/test_triton_scaled_mm.py b/tests/kernels/quantization/test_triton_scaled_mm.py index 6633a8bbd3c6..1cef5eb93a5c 100644 --- a/tests/kernels/quantization/test_triton_scaled_mm.py +++ b/tests/kernels/quantization/test_triton_scaled_mm.py @@ -11,6 +11,7 @@ import torch from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed device = "cuda" @@ -85,7 +86,7 @@ def test_scaled_mm( ): is_floating_point_type = lambda t: torch.tensor([1, 1], dtype=t).is_floating_point() - current_platform.seed_everything(0) + set_random_seed(0) # NOTE: There are cases, where if the matrix is large enough, an output # like 65504.4 can be produced, and can easily turn into inf when diff --git a/tests/kernels/test_apply_repetition_penalties.py b/tests/kernels/test_apply_repetition_penalties.py index a4619f5846b1..8270cf885f60 100644 --- a/tests/kernels/test_apply_repetition_penalties.py +++ b/tests/kernels/test_apply_repetition_penalties.py @@ -9,6 +9,7 @@ apply_repetition_penalties_torch, ) from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025] # [stress, stress, stress, Qwen, llama 4] @@ -38,7 +39,7 @@ def test_apply_repetition_penalties( Test the apply_repetition_penalties custom op against a reference implementation. 
""" - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device("cuda:0") # Create test data @@ -95,7 +96,7 @@ def test_apply_repetition_penalties_zero_seqs() -> None: dtype = torch.float32 seed = 0 - current_platform.seed_everything(seed) + set_random_seed(seed) torch.set_default_device("cuda:0") # Create test data diff --git a/tests/kernels/test_fla_layernorm_guard.py b/tests/kernels/test_fla_layernorm_guard.py index f944c6dcfa73..2ece5497cb06 100644 --- a/tests/kernels/test_fla_layernorm_guard.py +++ b/tests/kernels/test_fla_layernorm_guard.py @@ -10,7 +10,7 @@ layernorm_fn, rms_norm_ref, ) -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed def layer_norm_ref( @@ -114,7 +114,7 @@ def test_layer_norm_fwd_basic( is_rms_norm: bool, ) -> None: """Test basic layer norm forward pass without z (gate) tensor.""" - current_platform.seed_everything(seed) + set_random_seed(seed) device = torch.device("cuda:0") # Create inputs @@ -156,7 +156,7 @@ def test_layer_norm_fwd_with_gate( is_rms_norm: bool, ) -> None: """Test layer norm forward pass with z (gate) tensor.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") # Create inputs @@ -213,7 +213,7 @@ def test_layer_norm_fwd_with_groups( f"hidden_size {hidden_size} not divisible by group_size {group_size}" ) - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") # Create inputs @@ -253,7 +253,7 @@ def test_layer_norm_rows_per_block( dtype: torch.dtype, ) -> None: """Test that rows_per_block logic works correctly for various M sizes.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") hidden_size = 1024 @@ -278,7 +278,7 @@ def test_layer_norm_rows_per_block( def test_strided_input(dtype: torch.dtype) -> None: """Test that the kernel handles non-contiguous (strided) inputs correctly.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") num_tokens = 128 hidden_size = 1024 @@ -318,7 +318,7 @@ def test_output_buffer_provided( dtype: torch.dtype, ) -> None: """Test that the kernel works when an output buffer is provided.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") # Create inputs @@ -359,7 +359,7 @@ def test_multidimensional_input( dtype: torch.dtype, ) -> None: """Test that the autograd function handles multidimensional inputs.""" - current_platform.seed_everything(42) + set_random_seed(42) device = torch.device("cuda:0") hidden_size = shape[-1] diff --git a/tests/lora/test_fused_moe_lora_kernel.py b/tests/lora/test_fused_moe_lora_kernel.py index 91c8b861c3c5..a4d314be095c 100644 --- a/tests/lora/test_fused_moe_lora_kernel.py +++ b/tests/lora/test_fused_moe_lora_kernel.py @@ -18,8 +18,8 @@ get_tensor_model_parallel_world_size, ) from vllm.lora.ops.triton_ops import fused_moe_lora -from vllm.platforms import current_platform from vllm.utils.network_utils import get_open_port +from vllm.utils.torch_utils import set_random_seed @pytest.fixture(autouse=True) @@ -265,7 +265,7 @@ def test_fused_moe_lora_kernel( seed, ): torch.set_default_device(device) - current_platform.seed_everything(seed) + set_random_seed(seed) # the number of randomly generated sentences. 
num_sequences = 10 # generate data @@ -358,7 +358,7 @@ def test_fused_moe_lora_kernel_fully_sharded( seed, column_parallel, ): - current_platform.seed_everything(seed) + set_random_seed(seed) # the number of randomly generated sentences. num_sequences = 10 # generate data @@ -415,7 +415,7 @@ def use_fused_moe_lora_kernel_tensor_parallel( def _get_shard_slice(shard_size): return slice(local_rank * shard_size, (local_rank + 1) * shard_size) - current_platform.seed_everything(seed) + set_random_seed(seed) device = torch.device(f"cuda:{local_rank}") torch.cuda.set_device(device) diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py index dfec4236835a..611204b22f47 100644 --- a/tests/lora/test_layers.py +++ b/tests/lora/test_layers.py @@ -43,8 +43,8 @@ VocabParallelEmbedding, get_masked_input_and_mask, ) -from vllm.model_executor.utils import set_random_seed from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from .utils import DummyLoRAManager diff --git a/tests/lora/test_punica_ops.py b/tests/lora/test_punica_ops.py index e4df9751077d..5083f500c5cd 100644 --- a/tests/lora/test_punica_ops.py +++ b/tests/lora/test_punica_ops.py @@ -9,7 +9,7 @@ import vllm.lora.ops.triton_ops as triton_ops from vllm.lora.ops.triton_ops import LoRAKernelMeta from vllm.lora.ops.triton_ops.utils import _LORA_A_PTR_DICT, _LORA_B_PTR_DICT -from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from .utils import PunicaTensors, assert_close, generate_data_for_nslices @@ -395,7 +395,7 @@ def test_kernels( Tests LoRA kernels. """ torch.set_default_device(device) - current_platform.seed_everything(seed) + set_random_seed(seed) if op_type == "shrink": check_lora_shrink_kernel( @@ -447,7 +447,7 @@ def test_kernels_hidden_size( Tests SGMV and LoRA kernels. 
""" torch.set_default_device(device) - current_platform.seed_everything(seed) + set_random_seed(seed) if op_type == "shrink": check_lora_shrink_kernel( diff --git a/tests/models/fixtures/qwen2_5_math_prm_reward_step.json b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json new file mode 100644 index 000000000000..dc0f3010cc3a --- /dev/null +++ b/tests/models/fixtures/qwen2_5_math_prm_reward_step.json @@ -0,0 +1 @@ +[[[0.0006361007690429688, 0.99951171875], [0.81884765625, 0.1812744140625], [0.025543212890625, 0.974609375], [0.0004382133483886719, 0.99951171875]]] \ No newline at end of file diff --git a/tests/models/language/generation/conftest.py b/tests/models/language/generation/conftest.py new file mode 100644 index 000000000000..f423b656b2f2 --- /dev/null +++ b/tests/models/language/generation/conftest.py @@ -0,0 +1,28 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Pytest configuration for vLLM language generation tests.""" + +import warnings + +import torch + +from vllm.platforms import current_platform + + +def pytest_sessionstart(session): + """Configure ROCm-specific settings before test session starts.""" + if not current_platform.is_rocm(): + return + + # Disable Flash/MemEfficient SDP on ROCm to avoid HF Transformers + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + # TODO: Remove once ROCm SDP accuracy issues are resolved on HuggingFace + torch.backends.cuda.enable_flash_sdp(False) + torch.backends.cuda.enable_mem_efficient_sdp(False) + torch.backends.cuda.enable_math_sdp(True) + warnings.warn( + "ROCm: Disabled flash_sdp and mem_efficient_sdp, enabled math_sdp " + "to avoid HuggingFace Transformers accuracy issues", + UserWarning, + stacklevel=1, + ) diff --git a/tests/models/language/generation/test_phimoe.py b/tests/models/language/generation/test_phimoe.py index e640655784cc..1f03cf9cddf9 100644 --- a/tests/models/language/generation/test_phimoe.py +++ b/tests/models/language/generation/test_phimoe.py @@ -60,6 +60,19 @@ def test_phimoe_routing_function(): assert torch.equal(topk_ids, ground_truth[test_id]["topk_ids"]) +# There is a known issue that triggers `AttributeError: 'DynamicCache' +# object has no attribute 'seen_tokens'` when running: +# `tests/models/language/generation/test_phimoe.py::test_models +# [5-64-bfloat16-microsoft/Phi-3.5-MoE-instruct]` +# This issue is being investigated and tracked in: +# https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58 +# It is platform-agnostic. Therefore, we skip this test on all platforms for now. +@pytest.mark.skip( + reason="Skipping due to known issue: " + "'DynamicCache' object has no attribute 'seen_tokens'. 
See: " + "https://huggingface.co/microsoft/Phi-3.5-MoE-instruct/discussions/58 " + "for details.", +) @pytest.mark.skipif( condition=current_platform.is_cpu(), reason="This test takes a lot time to run on CPU, " diff --git a/tests/models/language/pooling/test_reward.py b/tests/models/language/pooling/test_reward.py index c42186c7db9a..22e0539a9890 100644 --- a/tests/models/language/pooling/test_reward.py +++ b/tests/models/language/pooling/test_reward.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import json +from typing import TYPE_CHECKING import pytest import torch @@ -9,7 +11,18 @@ from vllm.platforms import current_platform from ....conftest import HfRunner -from ...utils import check_transformers_version +from ....utils import VLLM_PATH +from ...registry import HF_EXAMPLE_MODELS + +if TYPE_CHECKING: + from _typeshed import StrPath + + +FIXTURES_PATH = VLLM_PATH / "tests/models/fixtures" +assert FIXTURES_PATH.exists() +FIXTURE_REWARD_RESULT = { + "Qwen/Qwen2.5-Math-PRM-7B": FIXTURES_PATH / "qwen2_5_math_prm_reward_step.json", +} @pytest.fixture @@ -60,6 +73,16 @@ def reward(prompts: list[str]) -> list[list[float]]: return hf_model +def dump_reward_outputs(outputs: list[list[float]], filename: "StrPath"): + with open(filename, "w", encoding="utf-8") as f: + json.dump(outputs, f) + + +def load_reward_outputs(filename: "StrPath") -> list[list[float]]: + with open(filename, encoding="utf-8") as f: + return json.load(f) + + @pytest.mark.parametrize( "model", [ @@ -77,9 +100,8 @@ def test_prm_models( model: str, dtype: str, ) -> None: - check_transformers_version( - "Qwen/Qwen2.5-Math-PRM-7B", max_transformers_version="4.53.2" - ) + model_info = HF_EXAMPLE_MODELS.find_hf_info(model) + model_info.check_transformers_version(on_fail="skip") if current_platform.is_cpu(): pytest.skip("CPU only supports V1") @@ -91,9 +113,46 @@ def test_prm_models( hf_model = step_reward_patch_hf_model(hf_model) hf_outputs = hf_model.reward(math_step_prompts) + dump_reward_outputs( + hf_outputs, + FIXTURE_REWARD_RESULT[model], + ) + # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): hf_output = torch.tensor(hf_output).float() vllm_output = torch.tensor(vllm_output).float() assert torch.allclose(hf_output, vllm_output, 1.5e-2) + + +@pytest.mark.parametrize( + "model", + [ + pytest.param( + "Qwen/Qwen2.5-Math-PRM-7B", + marks=[pytest.mark.core_model, pytest.mark.cpu_model], + ), + ], +) +@pytest.mark.parametrize("dtype", ["half"]) +def test_prm_models_with_golden_outputs( + vllm_runner, + math_step_prompts, + model: str, + dtype: str, +) -> None: + if not FIXTURE_REWARD_RESULT.get(model): + pytest.skip(f"No available golden outputs for {model}.") + + with vllm_runner(model, max_model_len=1024, dtype=dtype) as vllm_model: + vllm_outputs = vllm_model.reward(math_step_prompts) + + golden_outputs = load_reward_outputs(FIXTURE_REWARD_RESULT[model]) + + # check logits difference + for golden_output, vllm_output in zip(golden_outputs, vllm_outputs): + golden_output = torch.tensor(golden_output).float() + vllm_output = torch.tensor(vllm_output).float() + + assert torch.allclose(golden_output, vllm_output, 1.5e-2) diff --git a/tests/models/language/pooling/test_token_classification.py b/tests/models/language/pooling/test_token_classification.py index 64d42432c74b..b84f05ae3099 100644 --- a/tests/models/language/pooling/test_token_classification.py +++ 
b/tests/models/language/pooling/test_token_classification.py @@ -34,8 +34,8 @@ def test_bert_models( # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): - hf_output = torch.tensor(hf_output).cpu().float() - vllm_output = torch.tensor(vllm_output).cpu().float() + hf_output = hf_output.detach().clone().cpu().float() + vllm_output = vllm_output.detach().clone().cpu().float() assert torch.allclose(hf_output, vllm_output, 1e-2) @@ -49,11 +49,22 @@ def test_modernbert_models( model: str, dtype: str, ) -> None: + from vllm.platforms import current_platform + with vllm_runner(model, max_model_len=None, dtype=dtype) as vllm_model: vllm_outputs = vllm_model.token_classify(example_prompts) + # Use eager attention on ROCm to avoid HF Transformers flash attention + # accuracy issues: https://github.com/vllm-project/vllm/issues/30167 + hf_model_kwargs = {} + if current_platform.is_rocm(): + hf_model_kwargs["attn_implementation"] = "eager" + with hf_runner( - model, dtype=dtype, auto_cls=AutoModelForTokenClassification + model, + dtype=dtype, + auto_cls=AutoModelForTokenClassification, + model_kwargs=hf_model_kwargs, ) as hf_model: tokenizer = hf_model.tokenizer hf_outputs = [] @@ -65,9 +76,9 @@ def test_modernbert_models( # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): - hf_output = torch.tensor(hf_output).cpu().float() - vllm_output = torch.tensor(vllm_output).cpu().float() - assert torch.allclose(hf_output, vllm_output, atol=1e-2) + hf_output = hf_output.detach().clone().cpu().float() + vllm_output = vllm_output.detach().clone().cpu().float() + torch.testing.assert_close(hf_output, vllm_output, atol=1.2e-2, rtol=1e-3) @pytest.mark.parametrize("model", ["bd2lcco/Qwen3-0.6B-finetuned"]) @@ -96,6 +107,6 @@ def test_auto_conversion( # check logits difference for hf_output, vllm_output in zip(hf_outputs, vllm_outputs): - hf_output = torch.tensor(hf_output).cpu().float() - vllm_output = torch.tensor(vllm_output).cpu().float() + hf_output = hf_output.detach().clone().cpu().float() + vllm_output = vllm_output.detach().clone().cpu().float() assert torch.allclose(hf_output, vllm_output, atol=1e-2) diff --git a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py index a0b469f93064..e048318e9316 100644 --- a/tests/models/language/pooling_mteb_test/mteb_embed_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_embed_utils.py @@ -19,9 +19,9 @@ # - Model implementation and minor changes in tensor dtype # results in differences less than 1e-4 # - Different model results in differences more than 1e-3 -# 1e-4 is a good tolerance threshold +# 5e-4 is a good tolerance threshold MTEB_EMBED_TASKS = ["STS12"] -MTEB_EMBED_TOL = 1e-4 +MTEB_EMBED_TOL = 5e-4 _empty_model_meta = ModelMeta( diff --git a/tests/models/language/pooling_mteb_test/mteb_score_utils.py b/tests/models/language/pooling_mteb_test/mteb_score_utils.py index 6c1350231773..c5c23b153fcd 100644 --- a/tests/models/language/pooling_mteb_test/mteb_score_utils.py +++ b/tests/models/language/pooling_mteb_test/mteb_score_utils.py @@ -3,13 +3,16 @@ import tempfile from pathlib import Path +from typing import Any import mteb import numpy as np import requests +import torch from mteb.models import ModelMeta from torch.utils.data import DataLoader +from tests.conftest import HfRunner from tests.models.utils import ( RerankModelInfo, get_vllm_extra_kwargs, @@ -67,6 +70,12 @@ def predict( queries = [text for batch in inputs1 
for text in batch["text"]] corpus = [text for batch in inputs2 for text in batch["text"]] + # Hoping to discover potential scheduling + # issues by randomizing the order. + r = self.rng.permutation(len(queries)) + queries = [queries[i] for i in r] + corpus = [corpus[i] for i in r] + outputs = self.llm.score( queries, corpus, @@ -75,6 +84,7 @@ def predict( chat_template=self.chat_template, ) scores = np.array(outputs) + scores = scores[np.argsort(r)] return scores @@ -84,7 +94,6 @@ class ScoreClientMtebEncoder(MtebCrossEncoderMixin): def __init__(self, model_name: str, url): self.model_name = model_name self.url = url - self.rng = np.random.default_rng(seed=42) def predict( self, @@ -130,6 +139,50 @@ def get_score(self, query, corpus): return response["results"][0]["relevance_score"] +class HFMtebCrossEncoder(MtebCrossEncoderMixin, HfRunner): + chat_template: str | None = None + + def __init__(self, model_name: str, dtype: str = "auto", **kwargs: Any) -> None: + HfRunner.__init__( + self, model_name=model_name, is_cross_encoder=True, dtype=dtype, **kwargs + ) + + @torch.no_grad + def predict( + self, + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for text in batch["text"]] + + if self.chat_template is not None: + tokenizer = self.model.tokenizer + prompts = [] + for query, document in zip(queries, corpus): + conversation = [ + {"role": "query", "content": query}, + {"role": "document", "content": document}, + ] + + prompt = tokenizer.apply_chat_template( + conversation=conversation, + tools=None, + chat_template=self.chat_template, + tokenize=False, + ) + prompts.append(prompt) + outputs_list = HfRunner.classify(self, prompts) + scores = np.array(outputs_list).squeeze(-1) + return scores + else: + prompts = list(zip(queries, corpus)) + outputs_tensor = HfRunner.predict(self, prompts, show_progress_bar=False) + return outputs_tensor.cpu().numpy() + + def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): with tempfile.TemporaryDirectory() as prediction_folder: bm25s = mteb.get_model("bm25s") @@ -168,31 +221,21 @@ def run_mteb_rerank(cross_encoder: mteb.CrossEncoderProtocol, tasks, languages): return main_score -def mteb_test_rerank_models_hf( - hf_runner, model_name, hf_dtype="float32", hf_model_callback=None -): - with hf_runner(model_name, is_cross_encoder=True, dtype=hf_dtype) as hf_model: - if hf_model_callback is not None: - hf_model_callback(hf_model) - - st_main_score = run_mteb_rerank( - hf_model, tasks=MTEB_RERANK_TASKS, languages=MTEB_RERANK_LANGS - ) - st_dtype = next(hf_model.model.model.parameters()).dtype - return st_main_score, st_dtype - - def mteb_test_rerank_models( - hf_runner, vllm_runner, model_info: RerankModelInfo, + hf_runner=HFMtebCrossEncoder, vllm_extra_kwargs=None, - hf_model_callback=None, vllm_mteb_encoder=VllmMtebCrossEncoder, atol=MTEB_RERANK_TOL, ): vllm_extra_kwargs = get_vllm_extra_kwargs(model_info, vllm_extra_kwargs) + # Maybe load chat_template. 
+ chat_template: str | None = None + if model_info.chat_template_name is not None: + chat_template = (template_home / model_info.chat_template_name).read_text() + with vllm_runner( model_info.name, runner="pooling", @@ -201,6 +244,7 @@ def mteb_test_rerank_models( **vllm_extra_kwargs, ) as vllm_model: model_config = vllm_model.llm.llm_engine.model_config + vllm_model.chat_template = chat_template # Confirm whether vllm is using the correct architecture if model_info.architecture: @@ -209,12 +253,6 @@ def mteb_test_rerank_models( # Score API is only enabled for num_labels == 1 assert model_config.hf_config.num_labels == 1 - # Maybe load chat_template. - chat_template: str | None = None - if model_info.chat_template_name is not None: - chat_template = (template_home / model_info.chat_template_name).read_text() - vllm_model.chat_template = chat_template - # Confirm whether the important configs in model_config are correct. if model_info.pooling_type is not None: assert model_config.pooler_config.pooling_type == model_info.pooling_type @@ -242,9 +280,14 @@ def mteb_test_rerank_models( # Accelerate mteb test by setting # SentenceTransformers mteb score to a constant if model_info.mteb_score is None: - st_main_score, st_dtype = mteb_test_rerank_models_hf( - hf_runner, model_info.name, model_info.hf_dtype, hf_model_callback - ) + with hf_runner(model_info.name, dtype=model_info.hf_dtype) as hf_model: + hf_model.chat_template = chat_template + st_main_score = run_mteb_rerank( + hf_model, + tasks=MTEB_RERANK_TASKS, + languages=MTEB_RERANK_LANGS, + ) + st_dtype = next(hf_model.model.model.parameters()).dtype else: st_main_score = model_info.mteb_score st_dtype = "Constant" diff --git a/tests/models/language/pooling_mteb_test/test_baai.py b/tests/models/language/pooling_mteb_test/test_baai.py index 2e55622a5d48..2a639f550e19 100644 --- a/tests/models/language/pooling_mteb_test/test_baai.py +++ b/tests/models/language/pooling_mteb_test/test_baai.py @@ -65,7 +65,6 @@ "BAAI/bge-code-v1", architecture="Qwen2Model", mteb_score=0.75724465, - dtype="float32", pooling_type="LAST", attn_type="decoder", is_prefix_caching_supported=True, @@ -112,7 +111,5 @@ def test_embed_models_correctness( @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb( - hf_runner, vllm_runner, model_info: RerankModelInfo -) -> None: - mteb_test_rerank_models(hf_runner, vllm_runner, model_info) +def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(vllm_runner, model_info) diff --git a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py index 00f2d33546ef..3e58d5999f89 100644 --- a/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py +++ b/tests/models/language/pooling_mteb_test/test_bge_reranker_v2_gemma.py @@ -11,40 +11,60 @@ from tests.conftest import HfRunner from tests.models.utils import RerankModelInfo -from .mteb_score_utils import VllmMtebCrossEncoder, mteb_test_rerank_models +from .mteb_score_utils import ( + MtebCrossEncoderMixin, + mteb_test_rerank_models, +) RERANK_MODELS = [ RerankModelInfo( "BAAI/bge-reranker-v2-gemma", architecture="GemmaForSequenceClassification", - mteb_score=0.33757, hf_overrides={ "architectures": ["GemmaForSequenceClassification"], "classifier_from_token": ["Yes"], "method": "no_post_processing", }, + mteb_score=0.33757, pooling_type="LAST", attn_type="decoder", is_prefix_caching_supported=True, 
is_chunked_prefill_supported=True, + chat_template_name="bge-reranker-v2-gemma.jinja", ), ] PROMPT = "Given a query A and a passage B, determine whether the passage contains an answer to the query by providing a prediction of either 'Yes' or 'No'." # noqa: E501 -class GemmaRerankerHfRunner(HfRunner): +class GemmaRerankerHfRunner(MtebCrossEncoderMixin, HfRunner): def __init__( self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any ) -> None: from transformers import AutoModelForCausalLM, AutoTokenizer - super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) + HfRunner.__init__( + self, + model_name=model_name, + auto_cls=AutoModelForCausalLM, + dtype=dtype, + **kwargs, + ) + self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") self.yes_loc = self.tokenizer.convert_tokens_to_ids("Yes") - @torch.no_grad() - def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: + @torch.no_grad + def predict( + self, + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for text in batch["text"]] + def get_inputs(pairs, tokenizer, prompt=None): if prompt is None: prompt = PROMPT @@ -89,8 +109,8 @@ def get_inputs(pairs, tokenizer, prompt=None): ) scores = [] - for query, doc, *_ in prompts: - pairs = [(query, doc)] + for query, document in zip(queries, corpus): + pairs = [(query, document)] inputs = get_inputs(pairs, self.tokenizer) inputs = inputs.to(self.model.device) _n_tokens = inputs["input_ids"].shape[1] @@ -107,41 +127,10 @@ def get_inputs(pairs, tokenizer, prompt=None): return torch.Tensor(scores) -class GemmaMtebEncoder(VllmMtebCrossEncoder): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.query_template = "A: {query}\n" - self.document_template = "B: {doc}\n{prompt}" - - def predict( - self, - inputs1: DataLoader[mteb.types.BatchedInput], - inputs2: DataLoader[mteb.types.BatchedInput], - *args, - **kwargs, - ) -> np.ndarray: - queries = [ - self.query_template.format(query=text) - for batch in inputs1 - for text in batch["text"] - ] - corpus = [ - self.document_template.format(doc=text, prompt=PROMPT) - for batch in inputs2 - for text in batch["text"] - ] - outputs = self.llm.score( - queries, corpus, truncate_prompt_tokens=-1, use_tqdm=False - ) - scores = np.array(outputs) - return scores - - @pytest.mark.parametrize("model_info", RERANK_MODELS) def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: mteb_test_rerank_models( - GemmaRerankerHfRunner, vllm_runner, model_info, - vllm_mteb_encoder=GemmaMtebEncoder, + hf_runner=GemmaRerankerHfRunner, ) diff --git a/tests/models/language/pooling_mteb_test/test_cross_encoder.py b/tests/models/language/pooling_mteb_test/test_cross_encoder.py index 8bca49bb5b02..fb7b0fff36a4 100644 --- a/tests/models/language/pooling_mteb_test/test_cross_encoder.py +++ b/tests/models/language/pooling_mteb_test/test_cross_encoder.py @@ -11,27 +11,26 @@ RERANK_MODELS = [ RerankModelInfo( "cross-encoder/ms-marco-TinyBERT-L-2-v2", - mteb_score=0.32898, architecture="BertForSequenceClassification", pooling_type="CLS", attn_type="encoder_only", is_prefix_caching_supported=False, is_chunked_prefill_supported=False, + mteb_score=0.32898, ), RerankModelInfo( "tomaarsen/Qwen3-Reranker-0.6B-seq-cls", - mteb_score=0.25736, architecture="Qwen3ForSequenceClassification", 
pooling_type="LAST", attn_type="decoder", is_prefix_caching_supported=True, is_chunked_prefill_supported=True, + chat_template_name="qwen3_reranker.jinja", + mteb_score=0.33459, ), ] @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb( - hf_runner, vllm_runner, model_info: RerankModelInfo -) -> None: - mteb_test_rerank_models(hf_runner, vllm_runner, model_info) +def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(vllm_runner, model_info) diff --git a/tests/models/language/pooling_mteb_test/test_gte.py b/tests/models/language/pooling_mteb_test/test_gte.py index 3d1d5aa84091..2a5b2090b458 100644 --- a/tests/models/language/pooling_mteb_test/test_gte.py +++ b/tests/models/language/pooling_mteb_test/test_gte.py @@ -89,7 +89,6 @@ "Qwen/Qwen3-Embedding-0.6B", mteb_score=0.771163695, architecture="Qwen3ForCausalLM", - dtype="float32", pooling_type="LAST", attn_type="decoder", is_prefix_caching_supported=True, @@ -99,7 +98,6 @@ EmbedModelInfo( "Qwen/Qwen3-Embedding-4B", architecture="Qwen3ForCausalLM", - dtype="float32", enable_test=False, ), ] @@ -143,7 +141,5 @@ def test_embed_models_correctness( @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb( - hf_runner, vllm_runner, model_info: RerankModelInfo -) -> None: - mteb_test_rerank_models(hf_runner, vllm_runner, model_info) +def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(vllm_runner, model_info) diff --git a/tests/models/language/pooling_mteb_test/test_jina.py b/tests/models/language/pooling_mteb_test/test_jina.py index b98ac91b9757..cf6ba1851879 100644 --- a/tests/models/language/pooling_mteb_test/test_jina.py +++ b/tests/models/language/pooling_mteb_test/test_jina.py @@ -28,7 +28,6 @@ attn_type="encoder_only", is_prefix_caching_supported=False, is_chunked_prefill_supported=False, - dtype="float32", ) ] @@ -72,10 +71,8 @@ def hf_model_callback(model): @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb( - hf_runner, vllm_runner, model_info: RerankModelInfo -) -> None: - mteb_test_rerank_models(hf_runner, vllm_runner, model_info) +def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: + mteb_test_rerank_models(vllm_runner, model_info) @pytest.mark.parametrize("model_info", EMBEDDING_MODELS) diff --git a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py index 50dc6a0bd0ad..b03f599622cd 100644 --- a/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py +++ b/tests/models/language/pooling_mteb_test/test_mxbai_rerank.py @@ -2,13 +2,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from typing import Any +import mteb +import numpy as np import pytest import torch +from torch.utils.data import DataLoader from tests.conftest import HfRunner from tests.models.utils import RerankModelInfo -from .mteb_score_utils import mteb_test_rerank_models +from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models mxbai_rerank_hf_overrides = { "architectures": ["Qwen2ForSequenceClassification"], @@ -21,50 +24,69 @@ "mixedbread-ai/mxbai-rerank-base-v2", architecture="Qwen2ForSequenceClassification", hf_overrides=mxbai_rerank_hf_overrides, - mteb_score=0.273, pooling_type="LAST", attn_type="decoder", is_prefix_caching_supported=True, is_chunked_prefill_supported=True, + chat_template_name="mxbai_rerank_v2.jinja", + 
mteb_score=0.33651, enable_test=True, ), RerankModelInfo( "mixedbread-ai/mxbai-rerank-large-v2", architecture="Qwen2ForSequenceClassification", hf_overrides=mxbai_rerank_hf_overrides, + chat_template_name="mxbai_rerank_v2.jinja", enable_test=False, ), ] -class MxbaiRerankerHfRunner(HfRunner): +class MxbaiRerankerHfRunner(MtebCrossEncoderMixin, HfRunner): def __init__( self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any ) -> None: from transformers import AutoModelForCausalLM, AutoTokenizer - super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) + HfRunner.__init__( + self, + model_name=model_name, + auto_cls=AutoModelForCausalLM, + dtype=dtype, + **kwargs, + ) self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") self.yes_loc = self.tokenizer.convert_tokens_to_ids("1") self.no_loc = self.tokenizer.convert_tokens_to_ids("0") - def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: - def process_inputs(pairs): - inputs = self.tokenizer( - pairs, - padding=False, - truncation="longest_first", - return_attention_mask=False, + @torch.no_grad + def predict( + self, + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for text in batch["text"]] + + tokenizer = self.tokenizer + prompts = [] + for query, document in zip(queries, corpus): + conversation = [ + {"role": "query", "content": query}, + {"role": "document", "content": document}, + ] + + prompt = tokenizer.apply_chat_template( + conversation=conversation, + tools=None, + chat_template=self.chat_template, + tokenize=False, ) - for i, ele in enumerate(inputs["input_ids"]): - inputs["input_ids"][i] = ele - inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt") - for key in inputs: - inputs[key] = inputs[key].to(self.model.device) - return inputs - - @torch.no_grad() + prompts.append(prompt) + def compute_logits(inputs): logits = self.model(**inputs).logits[:, -1, :] yes_logits = logits[:, self.yes_loc] @@ -74,9 +96,9 @@ def compute_logits(inputs): return scores scores = [] - for query, doc, *_ in prompts: - pairs = [(query, doc)] - inputs = process_inputs(pairs) + for prompt in prompts: + inputs = tokenizer([prompt], return_tensors="pt") + inputs = self.wrap_device(inputs) score = compute_logits(inputs) scores.append(score[0].item()) return torch.Tensor(scores) @@ -84,4 +106,4 @@ def compute_logits(inputs): @pytest.mark.parametrize("model_info", RERANK_MODELS) def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: - mteb_test_rerank_models(MxbaiRerankerHfRunner, vllm_runner, model_info) + mteb_test_rerank_models(vllm_runner, model_info, hf_runner=MxbaiRerankerHfRunner) diff --git a/tests/models/language/pooling_mteb_test/test_nemotron.py b/tests/models/language/pooling_mteb_test/test_nemotron.py index c91616c9ec01..4e8304dde0b4 100644 --- a/tests/models/language/pooling_mteb_test/test_nemotron.py +++ b/tests/models/language/pooling_mteb_test/test_nemotron.py @@ -46,7 +46,5 @@ def test_embed_models_mteb(hf_runner, vllm_runner, model_info: EmbedModelInfo) - @pytest.mark.parametrize("model_info", RERANK_MODELS) -def test_rerank_models_mteb( - hf_runner, vllm_runner, model_info: RerankModelInfo -) -> None: - mteb_test_rerank_models(hf_runner, vllm_runner, model_info) +def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: 
+ mteb_test_rerank_models(vllm_runner, model_info) diff --git a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py index a8e79c839107..228ae457b355 100644 --- a/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py +++ b/tests/models/language/pooling_mteb_test/test_qwen3_reranker.py @@ -1,15 +1,19 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 from typing import Any +import mteb +import numpy as np import pytest import torch +from torch.utils.data import DataLoader from tests.conftest import HfRunner from tests.models.utils import RerankModelInfo from tests.utils import multi_gpu_test -from .mteb_score_utils import mteb_test_rerank_models +from .mteb_score_utils import MtebCrossEncoderMixin, mteb_test_rerank_models qwen3_reranker_hf_overrides = { "architectures": ["Qwen3ForSequenceClassification"], @@ -21,51 +25,71 @@ RerankModelInfo( "Qwen/Qwen3-Reranker-0.6B", architecture="Qwen3ForSequenceClassification", - mteb_score=0.25736, hf_overrides=qwen3_reranker_hf_overrides, + chat_template_name="qwen3_reranker.jinja", pooling_type="LAST", attn_type="decoder", is_prefix_caching_supported=True, is_chunked_prefill_supported=True, + mteb_score=0.33459, enable_test=True, ), RerankModelInfo( "Qwen/Qwen3-Reranker-4B", architecture="Qwen3ForSequenceClassification", + chat_template_name="qwen3_reranker.jinja", hf_overrides=qwen3_reranker_hf_overrides, enable_test=False, ), ] -class Qwen3RerankerHfRunner(HfRunner): +class Qwen3RerankerHfRunner(MtebCrossEncoderMixin, HfRunner): def __init__( self, model_name: str, dtype: str = "auto", *args: Any, **kwargs: Any ) -> None: from transformers import AutoModelForCausalLM, AutoTokenizer - super().__init__(model_name, dtype, auto_cls=AutoModelForCausalLM) + HfRunner.__init__( + self, + model_name=model_name, + auto_cls=AutoModelForCausalLM, + dtype=dtype, + **kwargs, + ) self.tokenizer = AutoTokenizer.from_pretrained(model_name, padding_side="left") self.token_false_id = self.tokenizer.convert_tokens_to_ids("no") self.token_true_id = self.tokenizer.convert_tokens_to_ids("yes") - - def predict(self, prompts: list[list[str]], *args, **kwargs) -> torch.Tensor: - def process_inputs(pairs): - inputs = self.tokenizer( - pairs, - padding=False, - truncation="longest_first", - return_attention_mask=False, + self.max_length = 40960 + + @torch.no_grad + def predict( + self, + inputs1: DataLoader[mteb.types.BatchedInput], + inputs2: DataLoader[mteb.types.BatchedInput], + *args, + **kwargs, + ) -> np.ndarray: + queries = [text for batch in inputs1 for text in batch["text"]] + corpus = [text for batch in inputs2 for text in batch["text"]] + + tokenizer = self.tokenizer + prompts = [] + for query, document in zip(queries, corpus): + conversation = [ + {"role": "query", "content": query}, + {"role": "document", "content": document}, + ] + + prompt = tokenizer.apply_chat_template( + conversation=conversation, + tools=None, + chat_template=self.chat_template, + tokenize=False, ) - for i, ele in enumerate(inputs["input_ids"]): - inputs["input_ids"][i] = ele - inputs = self.tokenizer.pad(inputs, padding=True, return_tensors="pt") - for key in inputs: - inputs[key] = inputs[key].to(self.model.device) - return inputs - - @torch.no_grad() + prompts.append(prompt) + def compute_logits(inputs): batch_scores = self.model(**inputs).logits[:, -1, :] true_vector = batch_scores[:, self.token_true_id] @@ -76,9 +100,9 @@ def 
compute_logits(inputs): return scores scores = [] - for query, doc, *_ in prompts: - pairs = [(query, doc)] - inputs = process_inputs(pairs) + for prompt in prompts: + inputs = tokenizer([prompt], return_tensors="pt") + inputs = self.wrap_device(inputs) score = compute_logits(inputs) scores.append(score[0].item()) return torch.Tensor(scores) @@ -86,7 +110,7 @@ def compute_logits(inputs): @pytest.mark.parametrize("model_info", RERANK_MODELS) def test_rerank_models_mteb(vllm_runner, model_info: RerankModelInfo) -> None: - mteb_test_rerank_models(Qwen3RerankerHfRunner, vllm_runner, model_info) + mteb_test_rerank_models(vllm_runner, model_info, hf_runner=Qwen3RerankerHfRunner) @pytest.mark.parametrize("model_info", RERANK_MODELS) @@ -99,5 +123,8 @@ def test_rerank_models_mteb_tp(vllm_runner, model_info: RerankModelInfo) -> None } mteb_test_rerank_models( - Qwen3RerankerHfRunner, vllm_runner, model_info, vllm_extra_kwargs + vllm_runner, + model_info, + vllm_extra_kwargs=vllm_extra_kwargs, + hf_runner=Qwen3RerankerHfRunner, ) diff --git a/tests/models/language/pooling_mteb_test/test_st_projector.py b/tests/models/language/pooling_mteb_test/test_st_projector.py index c1fd61b8e227..4ce7a4aed58e 100644 --- a/tests/models/language/pooling_mteb_test/test_st_projector.py +++ b/tests/models/language/pooling_mteb_test/test_st_projector.py @@ -29,7 +29,6 @@ is_prefix_caching_supported=False, is_chunked_prefill_supported=False, enable_test=True, - dtype="float32", ), ] diff --git a/tests/models/multimodal/generation/test_common.py b/tests/models/multimodal/generation/test_common.py index b2cc57cb68da..b1be3a3760c8 100644 --- a/tests/models/multimodal/generation/test_common.py +++ b/tests/models/multimodal/generation/test_common.py @@ -121,10 +121,6 @@ ), auto_cls=AutoModelForImageTextToText, vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output, - dtype="bfloat16", - marks=[ - pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask") - ], ), "qwen2_5_vl": VLMTestInfo( models=["Qwen/Qwen2.5-VL-3B-Instruct"], diff --git a/tests/models/multimodal/generation/test_nemotron_parse.py b/tests/models/multimodal/generation/test_nemotron_parse.py new file mode 100644 index 000000000000..1b05d336c10b --- /dev/null +++ b/tests/models/multimodal/generation/test_nemotron_parse.py @@ -0,0 +1,89 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from collections.abc import Sequence + +import pytest +from transformers import AutoModel + +from tests.models.utils import check_logprobs_close +from vllm.assets.image import ImageAsset + +from ....conftest import HfRunner, PromptImageInput, VllmRunner +from ....utils import create_new_process_for_each_test + +IMAGE = ImageAsset("paper-11").pil_image_ext(ext="png").convert("RGB") +PROMPT = "" + + +def run_test( + hf_runner: type[HfRunner], + vllm_runner: type[VllmRunner], + inputs: Sequence[tuple[list[str], PromptImageInput]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, +) -> None: + """Verify that the inference result is the same between hf and vllm.""" + with vllm_runner( + model, + dtype=dtype, + max_num_seqs=64, + limit_mm_per_prompt={"image": 1}, + trust_remote_code=True, + ) as vllm_model: + vllm_outputs_per_case = [ + vllm_model.generate_greedy_logprobs( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + ) + for prompts, images in inputs + ] + + with hf_runner(model, dtype=dtype, auto_cls=AutoModel) as hf_model: + hf_outputs_per_case = [ 
+ hf_model.generate_greedy_logprobs_limit( + prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images, + use_cache=False, # HF Nemotron Parse crashes here without this + ) + for prompts, images in inputs + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case): + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", + ) + + +@pytest.mark.core_model +@pytest.mark.parametrize("model", ["nvidia/NVIDIA-Nemotron-Parse-v1.1"]) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("num_logprobs", [5]) +@create_new_process_for_each_test("spawn") +def test_models( + hf_runner, vllm_runner, model: str, dtype: str, num_logprobs: int +) -> None: + run_test( + hf_runner, + vllm_runner, + inputs=[ + ( + [PROMPT] * 10, + [IMAGE] * 10, + ), + ], + model=model, + dtype=dtype, + max_tokens=100, + num_logprobs=num_logprobs, + ) diff --git a/tests/models/multimodal/generation/test_whisper.py b/tests/models/multimodal/generation/test_whisper.py index b206995a9cec..23459963f090 100644 --- a/tests/models/multimodal/generation/test_whisper.py +++ b/tests/models/multimodal/generation/test_whisper.py @@ -114,7 +114,7 @@ def check_model_available(model: str) -> None: @pytest.mark.core_model @pytest.mark.cpu_model @pytest.mark.parametrize("model", ["openai/whisper-large-v3-turbo"]) -@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("dtype", ["half", "float"]) @pytest.mark.parametrize("num_logprobs", [5]) @pytest.mark.parametrize("enforce_eager", [True, False]) @create_new_process_for_each_test("spawn") diff --git a/tests/models/multimodal/pooling/test_radio.py b/tests/models/multimodal/pooling/test_radio.py index 1f5baed83fa6..8b19b5630462 100644 --- a/tests/models/multimodal/pooling/test_radio.py +++ b/tests/models/multimodal/pooling/test_radio.py @@ -40,15 +40,15 @@ def run_radio_test( for image in images ] - config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) + hf_config = AutoConfig.from_pretrained(model_id, trust_remote_code=True) # RADIO model on HF does not properly handle torch_dtype argument # And relies on args["dtype"] which we have to patch manually: - config.args["dtype"] = torch_dtype + hf_config.args["dtype"] = torch_dtype hf_model = AutoModel.from_pretrained( model_id, - config=config, + config=hf_config, dtype=torch_dtype, trust_remote_code=True, ).to("cuda") @@ -62,13 +62,14 @@ def run_radio_test( hf_model.make_preprocessor_external() hf_outputs_per_image = [ - hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values + hf_model(pixel_value.to("cuda")) for pixel_value in pixel_values ] - radio_config = RadioConfig( - model_name=config.args["model"], reg_tokens=config.args["register_multiple"] + vllm_config = RadioConfig( + model_name=hf_config.args["model"], + **hf_config.args, ) - vllm_model = RadioModel(radio_config) + vllm_model = RadioModel(vllm_config) vllm_model.load_weights(hf_model.state_dict()) vllm_model = vllm_model.to("cuda", torch_dtype) @@ -80,7 +81,8 @@ def run_radio_test( cos_similar = nn.CosineSimilarity(dim=-1) for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image): - assert cos_similar(vllm_output, hf_output).mean() > 0.99 + assert cos_similar(vllm_output[0], hf_output[0]).mean() > 0.99 + assert cos_similar(vllm_output[1], hf_output[1]).mean() > 0.99 @pytest.mark.parametrize( diff --git a/tests/models/multimodal/processing/test_common.py 
b/tests/models/multimodal/processing/test_common.py index b170b29c241f..271920ef0e6a 100644 --- a/tests/models/multimodal/processing/test_common.py +++ b/tests/models/multimodal/processing/test_common.py @@ -102,6 +102,7 @@ def glmasr_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict: # incorrect token ids. So we need use `add_special_tokens=False` here # to leave bos_token to be added by the processor. _ADD_SPECIAL_TOKENS_OVERRIDES = { + "nemotron_parse": False, "ovis": False, "ovis2_5": False, "paligemma": False, diff --git a/tests/models/multimodal/processing/test_qwen3_omni.py b/tests/models/multimodal/processing/test_qwen3_omni.py new file mode 100644 index 000000000000..c80944a52390 --- /dev/null +++ b/tests/models/multimodal/processing/test_qwen3_omni.py @@ -0,0 +1,285 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Tests for Qwen3 Omni audio processing and sample rate handling.""" + +from typing import Any + +import numpy as np +import pytest + +from vllm.multimodal import MULTIMODAL_REGISTRY + +from ...utils import build_model_context + + +@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]) +@pytest.mark.parametrize( + ("audio_sample_rate", "audio_duration_sec"), + [ + (16000, 1.0), # Native Whisper sample rate, 1 second + (16000, 2.0), # Native Whisper sample rate, 2 seconds + ], +) +def test_processor_with_audio_sample_rate( + model_id: str, + audio_sample_rate: int, + audio_duration_sec: float, +) -> None: + """ + Test that vLLM's processor generates expected outputs with audio_sample_rate. + + This validates the reviewer's request that we test the actual processor + can handle different audio_sample_rate values and generate audio tokens. + """ + # Setup: Build model context and processor + ctx = build_model_context( + model_id, + limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + tokenizer = processor.info.get_tokenizer() + + # Create audio data at the specified sample rate + audio_length = int(audio_sample_rate * audio_duration_sec) + rng = np.random.RandomState(42) + audio_data = rng.rand(audio_length).astype(np.float32) + + # Build prompt with audio placeholder + prompt = "<|audio_start|><|audio_pad|><|audio_end|>" + mm_data = {"audio": [(audio_data, audio_sample_rate)]} + + # Execute: Apply processor with audio_sample_rate in mm_kwargs + hf_processor_mm_kwargs: dict[str, Any] = { + "audio_sample_rate": audio_sample_rate, + } + processed_inputs = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) + + # Assert: Verify audio tokens are generated + hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs) + audio_token_id = tokenizer.convert_tokens_to_ids(hf_processor.audio_token) + aud_tok_count = processed_inputs["prompt_token_ids"].count(audio_token_id) + + # Audio should generate at least 1 token + assert aud_tok_count >= 1, ( + f"Expected at least 1 audio token but got {aud_tok_count}. " + f"sample_rate: {audio_sample_rate}Hz, duration: {audio_duration_sec}s" + ) + + +@pytest.mark.parametrize("model_id", ["Qwen/Qwen3-Omni-30B-A3B-Instruct"]) +def test_longer_audio_generates_more_tokens(model_id: str) -> None: + """ + Test that longer audio generates more tokens than shorter audio. + + This validates that audio_sample_rate is being used correctly by checking + that audio duration affects token count as expected. 
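A note on why the sample rate matters for the token-count checks around here: the same clip length in seconds contains a different number of raw samples at different rates, so the processor needs audio_sample_rate to recover the true duration before computing audio tokens. A minimal sketch of that relationship using plain numpy only (the 16 kHz target is an assumption based on the Whisper-style front end referenced in these tests, not an API from this diff):

import numpy as np

def resample_linear(audio: np.ndarray, orig_sr: int, target_sr: int = 16000) -> np.ndarray:
    # Naive linear-interpolation resample, illustration only.
    if orig_sr == target_sr:
        return audio
    duration = audio.shape[0] / orig_sr          # clip length in seconds
    n_target = int(round(duration * target_sr))  # samples at the target rate
    x_old = np.linspace(0.0, duration, num=audio.shape[0], endpoint=False)
    x_new = np.linspace(0.0, duration, num=n_target, endpoint=False)
    return np.interp(x_new, x_old, audio).astype(np.float32)

# One second of 44.1 kHz audio maps to ~16000 samples at 16 kHz: the duration,
# and therefore the audio token count, is what is preserved.
clip_44k = np.random.rand(44100).astype(np.float32)
assert abs(resample_linear(clip_44k, 44100).shape[0] - 16000) <= 1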
+ """ + ctx = build_model_context( + model_id, + limit_mm_per_prompt={"audio": 1, "image": 0, "video": 0}, + ) + processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config) + tokenizer = processor.info.get_tokenizer() + + audio_sample_rate = 16000 + rng = np.random.RandomState(42) + + def get_token_count(duration: float) -> int: + audio_length = int(audio_sample_rate * duration) + audio_data = rng.rand(audio_length).astype(np.float32) + prompt = "<|audio_start|><|audio_pad|><|audio_end|>" + mm_data = {"audio": [(audio_data, audio_sample_rate)]} + hf_processor_mm_kwargs: dict[str, Any] = { + "audio_sample_rate": audio_sample_rate, + } + processed = processor.apply(prompt, mm_data, hf_processor_mm_kwargs) + hf_proc = processor.info.get_hf_processor(**hf_processor_mm_kwargs) + audio_token_id = tokenizer.convert_tokens_to_ids(hf_proc.audio_token) + return processed["prompt_token_ids"].count(audio_token_id) + + # Get token counts for different durations + short_tokens = get_token_count(1.0) + long_tokens = get_token_count(2.0) + + # Longer audio should produce more tokens + assert long_tokens > short_tokens, ( + f"Expected longer audio (2s) to have more tokens than shorter (1s). " + f"Got short={short_tokens}, long={long_tokens}" + ) + + +class TestQwen3OmniAudioSampleRatePreservation: + """Test that audio_sample_rate is preserved during kwargs restructuring. + + These tests validate the fix for the audio_sample_rate bug in Qwen3 Omni + where the parameter was lost during kwargs restructuring. + """ + + @staticmethod + def _process_kwargs( + mm_kwargs: dict[str, Any], + tok_kwargs: dict[str, Any], + transformers_version: str = "4.57.0", + ) -> dict[str, Any]: + """ + Helper method to simulate kwargs processing logic from production code. + + This method simulates the kwargs restructuring that happens in the + Qwen3 Omni model when transformers < 4.58.0. By centralizing this + logic, we make tests easier to maintain if the production logic changes. + + Args: + mm_kwargs: Multimodal kwargs (e.g., audio_sample_rate, truncation) + tok_kwargs: Tokenizer kwargs (e.g., truncation) + transformers_version: Version string to test against (default: "4.57.0") + + Returns: + Processed kwargs dictionary with restructured audio_kwargs and text_kwargs + """ + from packaging.version import Version + + mm_kwargs_copy = dict(mm_kwargs) + tok_kwargs_copy = dict(tok_kwargs) + + if Version(transformers_version) < Version("4.58.0"): + # Extract audio_sample_rate before restructuring (THE FIX) + audio_sample_rate = mm_kwargs_copy.pop("audio_sample_rate", None) + + # Restructure kwargs + mm_kwargs_copy["audio_kwargs"] = { + "truncation": mm_kwargs_copy.pop("truncation", False) + } + mm_kwargs_copy["text_kwargs"] = { + "truncation": tok_kwargs_copy.pop("truncation", False) + } + + # Put audio_sample_rate into audio_kwargs (THE FIX) + if audio_sample_rate is not None: + mm_kwargs_copy["audio_kwargs"]["audio_sample_rate"] = audio_sample_rate + + return mm_kwargs_copy + + def test_audio_sample_rate_preserved_in_audio_kwargs(self) -> None: + """ + Test that audio_sample_rate is moved from top-level mm_kwargs + into audio_kwargs during kwargs restructuring. + + This is the core fix: when transformers < 4.58.0, the code + restructures kwargs into audio_kwargs and text_kwargs, and + audio_sample_rate must be preserved in audio_kwargs. 
+ """ + # Setup: Create mm_kwargs with audio_sample_rate at top level + mm_kwargs: dict[str, Any] = { + "audio_sample_rate": 16000, + "truncation": True, + } + tok_kwargs: dict[str, Any] = { + "truncation": False, + } + + # Execute: Process kwargs using helper method + result = self._process_kwargs(mm_kwargs, tok_kwargs) + + # Assert: Verify audio_sample_rate is in audio_kwargs + assert "audio_kwargs" in result + assert "audio_sample_rate" in result["audio_kwargs"] + assert result["audio_kwargs"]["audio_sample_rate"] == 16000 + + # Assert: Verify truncation is also in audio_kwargs + assert result["audio_kwargs"]["truncation"] is True + + # Assert: Verify text_kwargs is created correctly + assert "text_kwargs" in result + assert result["text_kwargs"]["truncation"] is False + + def test_audio_sample_rate_absent_when_not_provided(self) -> None: + """ + Test that when audio_sample_rate is not provided in mm_kwargs, + the restructured audio_kwargs doesn't contain it. + """ + # Setup: Create mm_kwargs WITHOUT audio_sample_rate + mm_kwargs: dict[str, Any] = { + "truncation": True, + } + tok_kwargs: dict[str, Any] = { + "truncation": False, + } + + # Execute: Process kwargs using helper method + result = self._process_kwargs(mm_kwargs, tok_kwargs) + + # Assert: Verify audio_sample_rate is NOT in audio_kwargs + assert "audio_kwargs" in result + assert "audio_sample_rate" not in result["audio_kwargs"] + + # Assert: Verify truncation is still in audio_kwargs + assert result["audio_kwargs"]["truncation"] is True + + @pytest.mark.parametrize("sample_rate", [8000, 16000, 22050, 24000, 44100, 48000]) + def test_various_audio_sample_rates_preserved(self, sample_rate: int) -> None: + """ + Test that various common audio sample rates are preserved. + + Common sample rates: + - 8000: Telephone quality + - 16000: Wideband speech (Qwen3 Omni default) + - 22050: Low-quality audio + - 24000: High-quality speech + - 44100: CD quality + - 48000: Professional audio + """ + # Setup: Create mm_kwargs with specific sample rate + mm_kwargs: dict[str, Any] = { + "audio_sample_rate": sample_rate, + "truncation": True, + } + tok_kwargs: dict[str, Any] = {"truncation": False} + + # Execute: Process kwargs using helper method + result = self._process_kwargs(mm_kwargs, tok_kwargs) + + # Assert: Verify the specific sample rate is preserved + assert result["audio_kwargs"]["audio_sample_rate"] == sample_rate + + def test_kwargs_unchanged_for_newer_transformers_version(self) -> None: + """ + Test that kwargs structure remains unchanged for transformers >= 4.58.0. + + This test ensures that when transformers version is 4.58.0 or higher, + the kwargs restructuring is bypassed and audio_sample_rate remains + at the top level as originally passed. 
+ """ + from packaging.version import Version + + # Setup: Create mm_kwargs with audio_sample_rate at top level + mm_kwargs: dict[str, Any] = { + "audio_sample_rate": 16000, + "truncation": True, + } + tok_kwargs: dict[str, Any] = { + "truncation": False, + } + + # Execute: Simulate with transformers >= 4.58.0 + mm_kwargs_copy = dict(mm_kwargs) + tok_kwargs_copy = dict(tok_kwargs) + + transformers_ver = "4.58.0" # Version that bypasses restructuring + if Version(transformers_ver) < Version("4.58.0"): + # This block should NOT execute for >= 4.58.0 + audio_sample_rate = mm_kwargs_copy.pop("audio_sample_rate", None) + mm_kwargs_copy["audio_kwargs"] = { + "truncation": mm_kwargs_copy.pop("truncation", False) + } + mm_kwargs_copy["text_kwargs"] = { + "truncation": tok_kwargs_copy.pop("truncation", False) + } + if audio_sample_rate is not None: + mm_kwargs_copy["audio_kwargs"]["audio_sample_rate"] = audio_sample_rate + + # Assert: Verify kwargs structure is unchanged + assert "audio_kwargs" not in mm_kwargs_copy + assert "text_kwargs" not in mm_kwargs_copy + assert mm_kwargs_copy["audio_sample_rate"] == 16000 + assert mm_kwargs_copy["truncation"] is True + assert tok_kwargs_copy["truncation"] is False diff --git a/tests/models/registry.py b/tests/models/registry.py index 884501b8fce6..570bcc734146 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -907,6 +907,9 @@ def check_available_online( is_available_online=False, ), # [Encoder-decoder] + "NemotronParseForConditionalGeneration": _HfExamplesInfo( + "nvidia/NVIDIA-Nemotron-Parse-v1.1", trust_remote_code=True + ), "WhisperForConditionalGeneration": _HfExamplesInfo( "openai/whisper-large-v3-turbo", extras={"v3": "openai/whisper-large-v3"}, diff --git a/tests/models/test_vision.py b/tests/models/test_vision.py index 82ba958a58c4..24e49e9d61c8 100644 --- a/tests/models/test_vision.py +++ b/tests/models/test_vision.py @@ -21,6 +21,7 @@ from vllm.platforms import current_platform from vllm.utils.network_utils import get_open_port from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed pytestmark = pytest.mark.cpu_test @@ -98,7 +99,7 @@ def run_dp_sharded_vision_model_vs_direct( """ # Set random seed for reproducibility - current_platform.seed_everything(0) + set_random_seed(0) device = f"{current_platform.device_name}:{local_rank}" current_platform.set_device(device) @@ -284,7 +285,7 @@ def run_dp_sharded_mrope_vision_model_vs_direct( calling the model directly. 
""" # Set random seed for reproducibility - current_platform.seed_everything(0) + set_random_seed(0) device = f"{current_platform.device_name}:{local_rank}" current_platform.set_device(device) torch.set_default_device(device) @@ -408,7 +409,7 @@ def run_dp_sharded_mrope_vision_model_uneven_load_worker( ): """Test run_dp_sharded_mrope_vision_model with uneven load distribution.""" # Set up distributed environment - current_platform.seed_everything(123) + set_random_seed(123) device = f"{current_platform.device_name}:{local_rank}" current_platform.set_device(device) torch.set_default_device(device) diff --git a/tests/models/utils.py b/tests/models/utils.py index a39a4c187cb7..bd9fcf31deab 100644 --- a/tests/models/utils.py +++ b/tests/models/utils.py @@ -471,12 +471,16 @@ def dummy_hf_overrides( "num_kv_shared_layers": 1, } + _hf_config = hf_config + class DummyConfig: + hf_config = _hf_config hf_text_config = text_config + model_arch_config = ModelConfig.get_model_arch_config(DummyConfig) # Only set MoE related config when the model has MoE layers. # Otherwise all models detected as MoE by _get_transformers_backend_cls. - if ModelConfig.get_num_experts(DummyConfig) > 0: + if model_arch_config.num_experts > 0: update_dict.update( { "num_experts": num_experts, diff --git a/tests/multimodal/test_processing.py b/tests/multimodal/test_processing.py index 262ea42e4d0f..64bb88960e86 100644 --- a/tests/multimodal/test_processing.py +++ b/tests/multimodal/test_processing.py @@ -1021,9 +1021,8 @@ def test_hf_processor_init_kwargs( DummyProcessor, # type: ignore[arg-type] **inference_kwargs, ) - - for k, v in expected_kwargs.items(): - assert getattr(processor, k) == v + assert processor.a == expected_kwargs["a"] + assert processor.b == expected_kwargs["b"] @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"]) # Dummy diff --git a/tests/rocm/aiter/test_mla_fp8_support_check.py b/tests/rocm/aiter/test_mla_fp8_support_check.py new file mode 100644 index 000000000000..e3dc0f8ea13d --- /dev/null +++ b/tests/rocm/aiter/test_mla_fp8_support_check.py @@ -0,0 +1,118 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Unit tests for AITER MLA FP8 support detection. + +These tests verify that the _check_aiter_mla_fp8_support() function +correctly handles various error conditions without crashing. 
+""" + +from unittest.mock import patch + +import pytest + + +class TestAiterMlaFp8SupportCheck: + """Test cases for _check_aiter_mla_fp8_support() function.""" + + def setup_method(self): + """Reset the global cache before each test.""" + import vllm._aiter_ops as aiter_ops + + aiter_ops._AITER_MLA_SUPPORTS_FP8 = None + + @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True) + def test_import_error_handling(self, mock_supported): + """Test that ImportError is handled gracefully.""" + import vllm._aiter_ops as aiter_ops + from vllm._aiter_ops import _check_aiter_mla_fp8_support + + aiter_ops._AITER_MLA_SUPPORTS_FP8 = None + + # Should return False without raising + with patch( + "vllm._aiter_ops.inspect.signature", + side_effect=ImportError("No module"), + ): + result = _check_aiter_mla_fp8_support() + assert result is False + + @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True) + def test_module_not_found_error_handling(self, mock_supported): + """Test that ModuleNotFoundError is handled gracefully.""" + import vllm._aiter_ops as aiter_ops + from vllm._aiter_ops import _check_aiter_mla_fp8_support + + aiter_ops._AITER_MLA_SUPPORTS_FP8 = None + + with patch( + "vllm._aiter_ops.inspect.signature", + side_effect=ModuleNotFoundError("Module not found"), + ): + # Should return False without raising + assert _check_aiter_mla_fp8_support() is False + # Cache should be set to False + assert aiter_ops._AITER_MLA_SUPPORTS_FP8 is False + + @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True) + def test_attribute_error_handling(self, mock_supported): + """Test that AttributeError is handled gracefully.""" + import vllm._aiter_ops as aiter_ops + from vllm._aiter_ops import _check_aiter_mla_fp8_support + + aiter_ops._AITER_MLA_SUPPORTS_FP8 = None + + with patch( + "vllm._aiter_ops.inspect.signature", + side_effect=AttributeError("No attribute"), + ): + assert _check_aiter_mla_fp8_support() is False + assert aiter_ops._AITER_MLA_SUPPORTS_FP8 is False + + @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True) + def test_value_error_handling(self, mock_supported): + """Test that ValueError is handled gracefully (no signature).""" + import vllm._aiter_ops as aiter_ops + from vllm._aiter_ops import _check_aiter_mla_fp8_support + + aiter_ops._AITER_MLA_SUPPORTS_FP8 = None + + with patch( + "vllm._aiter_ops.inspect.signature", + side_effect=ValueError("No signature"), + ): + assert _check_aiter_mla_fp8_support() is False + assert aiter_ops._AITER_MLA_SUPPORTS_FP8 is False + + @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True) + def test_type_error_handling(self, mock_supported): + """Test that TypeError is handled gracefully (not callable).""" + import vllm._aiter_ops as aiter_ops + from vllm._aiter_ops import _check_aiter_mla_fp8_support + + aiter_ops._AITER_MLA_SUPPORTS_FP8 = None + + with patch( + "vllm._aiter_ops.inspect.signature", + side_effect=TypeError("Not a callable"), + ): + assert _check_aiter_mla_fp8_support() is False + assert aiter_ops._AITER_MLA_SUPPORTS_FP8 is False + + @patch("vllm._aiter_ops.is_aiter_found_and_supported", return_value=True) + def test_result_caching(self, mock_supported): + """Test that the result is cached after first check.""" + import vllm._aiter_ops as aiter_ops + + # Set cache to True + aiter_ops._AITER_MLA_SUPPORTS_FP8 = True + + from vllm._aiter_ops import _check_aiter_mla_fp8_support + + # Should return cached value without re-checking + result = 
_check_aiter_mla_fp8_support() + assert result is True + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/test_config.py b/tests/test_config.py index ee706ab3d9c8..905ad3fa25f9 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -1,5 +1,6 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import logging import os from dataclasses import MISSING, Field, asdict, dataclass, field @@ -205,8 +206,8 @@ def test_default_pooling_type(model_id, default_pooling_type, pooling_type): ) def test_moe_model_detection(model_id, expected_is_moe_model): model_config = ModelConfig(model_id) - # Just check that is_moe_model field exists and is a boolean - assert model_config.is_model_moe() == expected_is_moe_model + # Just check that is_moe field exists and is a boolean + assert model_config.is_moe == expected_is_moe_model @pytest.mark.parametrize( @@ -224,7 +225,7 @@ def test_moe_model_detection(model_id, expected_is_moe_model): def test_is_quantized(model_id, quantized): model_config = ModelConfig(model_id) # Just check that quantized field exists and is a boolean - assert model_config.is_quantized() == quantized + assert model_config.is_quantized == quantized @pytest.mark.skipif( @@ -925,7 +926,7 @@ def test_vllm_config_callable_defaults(): model_config=quantized_model, optimization_level=OptimizationLevel.O2 ) enable_if_quantized = lambda cfg: ( - cfg.model_config is not None and cfg.model_config.is_quantized() + cfg.model_config is not None and cfg.model_config.is_quantized ) assert enable_if_quantized(config_quantized) is True assert enable_if_quantized(config_no_model) is False @@ -936,7 +937,7 @@ def test_vllm_config_callable_defaults(): model_config=moe_model, optimization_level=OptimizationLevel.O2 ) enable_if_sequential = lambda cfg: ( - cfg.model_config is not None and not cfg.model_config.is_model_moe() + cfg.model_config is not None and not cfg.model_config.is_moe ) assert enable_if_sequential(config_moe) is False assert enable_if_sequential(config_quantized) is True @@ -1050,3 +1051,46 @@ def test_scheduler_config_init(): with pytest.raises(AttributeError): # InitVar does not become an attribute print(SchedulerConfig.default_factory().max_model_len) + + +@pytest.mark.parametrize( + ( + "model_id", + "data_parallel_size", + "external_lb", + "expected_needs_coordinator", + ), + [ + # Non-MoE model with DP=1 should not need coordinator + ("facebook/opt-125m", 1, False, False), + # Non-MoE model with DP>1 internal LB should need coordinator + ("facebook/opt-125m", 2, False, True), + # Non-MoE model with DP>1 external LB should not need coordinator + ("facebook/opt-125m", 2, True, False), + # MoE model with DP=1 should not need coordinator + ("mistralai/Mixtral-8x7B-Instruct-v0.1", 1, False, False), + # MoE model with DP>1 internal LB should need both coordinator + # and wave coordination + ("mistralai/Mixtral-8x7B-Instruct-v0.1", 2, False, True), + # MoE model with DP>1 external LB needs coordinator for wave coordination + # (wave coordination runs in coordinator process) + ("mistralai/Mixtral-8x7B-Instruct-v0.1", 2, True, True), + ], +) +def test_needs_dp_coordination( + model_id, + data_parallel_size, + external_lb, + expected_needs_coordinator, +): + """Test that DP coordinator and wave coordination are configured correctly.""" + from vllm.config import ParallelConfig + + model_config = ModelConfig(model_id) + parallel_config = ParallelConfig( + data_parallel_size=data_parallel_size, + 
data_parallel_external_lb=external_lb, + ) + vllm_config = VllmConfig(model_config=model_config, parallel_config=parallel_config) + + assert vllm_config.needs_dp_coordinator == expected_needs_coordinator diff --git a/tests/v1/attention/test_attention_backends.py b/tests/v1/attention/test_attention_backends.py index f4f40babaf79..2b9b0dc1e6e4 100644 --- a/tests/v1/attention/test_attention_backends.py +++ b/tests/v1/attention/test_attention_backends.py @@ -15,11 +15,16 @@ create_vllm_config, try_get_attention_backend, ) +from vllm.attention.backends.abstract import AttentionType from vllm.attention.backends.registry import AttentionBackendEnum from vllm.config import ModelConfig from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, is_torch_equal_or_newer +from vllm.utils.torch_utils import ( + STR_DTYPE_TO_TORCH_DTYPE, + is_torch_equal_or_newer, + set_random_seed, +) from vllm.v1.attention.backends.utils import ( CommonAttentionMetadata, set_kv_cache_layout, @@ -79,6 +84,13 @@ def _convert_dtype_to_torch(dtype): ), "single_decode": BatchSpec(seq_lens=[1024], query_lens=[1]), "single_prefill": BatchSpec(seq_lens=[1024], query_lens=[64]), + # encoder-only + "small_encoder_prefill": BatchSpec( + seq_lens=[32, 64, 128, 256], query_lens=[32, 64, 128, 256] + ), + "medium_encoder_prefill": BatchSpec( + seq_lens=[256, 512, 1024, 2048], query_lens=[256, 512, 1024, 2048] + ), } @@ -114,12 +126,12 @@ def create_and_prepopulate_kv_cache( Tuple of (kv_cache, updated_block_table) """ batch_size = len(k_contexts) - seq_lens = common_attn_metadata.seq_lens_cpu + seq_lens = common_attn_metadata.seq_lens.cpu() query_lens = ( common_attn_metadata.query_start_loc_cpu[1:] - common_attn_metadata.query_start_loc_cpu[:-1] ) - context_lens = common_attn_metadata.num_computed_tokens_cpu + context_lens = seq_lens - query_lens block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping @@ -205,6 +217,7 @@ def run_attention_backend( key: torch.Tensor, value: torch.Tensor, kv_cache: torch.Tensor, + attn_type: AttentionType = AttentionType.DECODER, sliding_window: int | None = None, ) -> torch.Tensor: """Run attention computation using the specified backend's AttentionImpl.""" @@ -272,6 +285,7 @@ def mock_get_per_layer_parameters(vllm_config, layer_names, impl_cls): num_kv_heads=num_kv_heads, alibi_slopes=None, sliding_window=sliding_window, + attn_type=attn_type, kv_cache_dtype="auto", ) @@ -295,6 +309,7 @@ def _test_backend_correctness( backend_to_test: list[AttentionBackendEnum | str], mask_mod, *, + attn_type: AttentionType = AttentionType.DECODER, block_size: int = 16, atol: float = 1e-2, rtol: float = 1e-2, @@ -320,7 +335,7 @@ def _test_backend_correctness( multiple GPUs. This tests that backends work correctly with different head counts. """ - current_platform.seed_everything(42) + set_random_seed(42) hf_config_override = None if tensor_parallel_size > 1: @@ -432,6 +447,9 @@ def _test_backend_correctness( common_attn_metadata = create_common_attn_metadata( batch_spec, vllm_config.cache_config.block_size, device ) + if attn_type == AttentionType.ENCODER_ONLY: + # For encoder-only, all tokens are prefill tokens + common_attn_metadata.causal = False # 3. 
Simulate Paged KV Cache and a realistic slot_mapping kv_cache = create_and_prepopulate_kv_cache( @@ -487,6 +505,7 @@ def _test_backend_correctness( value_vllm, kv_cache_for_backend, sliding_window=sliding_window, + attn_type=attn_type, ) finally: if reset_kv_cache_layout: @@ -672,3 +691,45 @@ def sliding_window_mask_mod( block_size=128, tensor_parallel_size=tensor_parallel_size, ) + + +@pytest.mark.parametrize( + "batch_spec_name", + [ + "small_encoder_prefill", + "medium_encoder_prefill", + ], +) +@pytest.mark.parametrize("model", ["google/embeddinggemma-300m"]) +@pytest.mark.parametrize("tensor_parallel_size", [1, 2]) +def test_sliding_window_encoder_backend_correctness( + batch_spec_name: str, model: str, tensor_parallel_size: int +): + """Test backend's correctness with sliding window attention.""" + + def bidi_sliding_window_mask_mod( + b: torch.Tensor, + h: torch.Tensor, + q_idx: torch.Tensor, + kv_idx: torch.Tensor, + *, + context_len: int, + sliding_window: int, + ): + return torch.abs(q_idx + context_len - kv_idx) < sliding_window + + batch_spec = BATCH_SPECS[batch_spec_name] + model_config = ModelConfig(model=model, max_model_len=max(batch_spec.seq_lens)) + sliding_window = model_config.get_sliding_window() + sliding_window_mask_mod_fn = partial( + bidi_sliding_window_mask_mod, sliding_window=sliding_window + ) + + _test_backend_correctness( + batch_spec, + model, + SLIDING_WINDOW_BACKENDS_TO_TEST, + sliding_window_mask_mod_fn, + attn_type=AttentionType.ENCODER_ONLY, + tensor_parallel_size=tensor_parallel_size, + ) diff --git a/tests/v1/attention/test_chunked_local_attention.py b/tests/v1/attention/test_chunked_local_attention.py index 4529c2cfc29b..3b5af45956a1 100644 --- a/tests/v1/attention/test_chunked_local_attention.py +++ b/tests/v1/attention/test_chunked_local_attention.py @@ -1,13 +1,119 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass +from unittest.mock import MagicMock import numpy as np import pytest import torch -from tests.v1.attention.utils import BatchSpec, create_common_attn_metadata -from vllm.v1.attention.backends.utils import make_local_attention_virtual_batches +from tests.v1.attention.utils import ( + BatchSpec, + create_common_attn_metadata, + create_vllm_config, +) +from vllm.attention.layers.chunked_local_attention import ( + create_chunked_local_attention_backend, +) +from vllm.v1.attention.backends.utils import ( + AttentionMetadataBuilder, + CommonAttentionMetadata, +) + + +def create_mock_underlying_backend(device: torch.device): + """Create a mock underlying attention backend for testing.""" + + class MockMetadata: + """Minimal metadata that captures what was passed to build().""" + + pass + + class MockUnderlyingBuilder(AttentionMetadataBuilder[MockMetadata]): + def __init__( + self, + kv_cache_spec, + layer_names, + vllm_config, + device, + ): + self.kv_cache_spec = kv_cache_spec + self.layer_names = layer_names + self.vllm_config = vllm_config + self.device = device + # Capture what was passed to build for verification + self.last_common_attn_metadata = None + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> MockMetadata: + # Capture the metadata for test verification + self.last_common_attn_metadata = common_attn_metadata + return MockMetadata() + + def update_block_table(self, metadata, blk_table, slot_mapping): + return metadata + + class MockUnderlyingBackend: + 
@classmethod + def get_builder_cls(cls): + return MockUnderlyingBuilder + + return MockUnderlyingBackend + + +def build_chunked_local_attention( + batch_spec: BatchSpec, + attn_chunk_size: int, + block_size: int, + device: torch.device, + arange_block_indices: bool = True, +) -> CommonAttentionMetadata: + """Build chunked local attention metadata using the real builder.""" + # Create the backend + mock_backend = create_mock_underlying_backend(device) + chunked_backend = create_chunked_local_attention_backend( + mock_backend, attn_chunk_size, block_size + ) + + # Create mock kv_cache_spec + mock_kv_cache_spec = MagicMock() + mock_kv_cache_spec.block_size = block_size + + # Create vllm_config with enough capacity + vllm_config = create_vllm_config( + max_num_seqs=len(batch_spec.query_lens), + max_num_batched_tokens=max( + sum(batch_spec.query_lens), len(batch_spec.query_lens) + ), + block_size=block_size, + ) + + # Create the builder + builder_cls = chunked_backend.get_builder_cls() + builder = builder_cls( + kv_cache_spec=mock_kv_cache_spec, + layer_names=["layer0"], + vllm_config=vllm_config, + device=device, + ) + + # Create common attention metadata + common_attn_metadata = create_common_attn_metadata( + batch_spec, + block_size, + device, + arange_block_indices=arange_block_indices, + ) + + # Build and return the result + builder.build(0, common_attn_metadata) + + # The underlying builder's last_common_attn_metadata has the virtual batches + return builder.last_common_attn_metadata @dataclass @@ -159,26 +265,21 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData): expected_k_seqlens = test_data.expected_k_seqlens expected_local_block_table = test_data.expected_local_block_table - # Create common attention metadata - common_attn_metadata = create_common_attn_metadata( + # Call the builder + result = build_chunked_local_attention( batch_spec, + attn_chunk_size, block_size, device, - # Use torch.arange instead of torch.randint so we can assert on - # block table tensor values. 
The block table will have shape - # (num_batches, cdiv(max_seq_len, block_size)) and the values will be - # arranged from 0 to cdiv(max_seq_len, block_size)-1 arange_block_indices=True, ) - # Call the function - result, _ = make_local_attention_virtual_batches( - attn_chunk_size, common_attn_metadata, block_size - ) + # Get actual count (trim padding - find first zero in k_seqlens) + actual_count = len(expected_k_seqlens) - # Convert to numpy for easier comparison - actual_q_seqlens = np.diff(result.query_start_loc_cpu.numpy()) - actual_k_seqlens = result.seq_lens_cpu.numpy() + # Convert to numpy for comparison (use GPU tensors, then transfer to CPU) + actual_q_seqlens = np.diff(result.query_start_loc.cpu().numpy())[:actual_count] + actual_k_seqlens = result.seq_lens.cpu().numpy()[:actual_count] # Check that all query lengths are less than or equal to attn_chunk_size assert all(q_len <= attn_chunk_size for q_len in actual_q_seqlens) @@ -196,6 +297,8 @@ def test_local_attention_virtual_batches(test_data: LocalAttentionTestData): ) print(f"Expected block table:\n{expected_block_table_tensor}") - print(f"Actual block table:\n{result.block_table_tensor}") + print(f"Actual block table:\n{result.block_table_tensor[:actual_count]}") - torch.testing.assert_close(result.block_table_tensor, expected_block_table_tensor) + torch.testing.assert_close( + result.block_table_tensor[:actual_count], expected_block_table_tensor + ) diff --git a/tests/v1/attention/test_local_attention_triton_kernel.py b/tests/v1/attention/test_local_attention_triton_kernel.py new file mode 100644 index 000000000000..22c49a7d5622 --- /dev/null +++ b/tests/v1/attention/test_local_attention_triton_kernel.py @@ -0,0 +1,669 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Unit tests for the Triton kernel implementation of chunked local attention. + +Tests focus on: +1. Edge cases (single batch, single token, large batches) +2. Various chunk sizes and block sizes +3. Consistency between seqlens_q, seqlens_k, and cu_seqlens +4. Equivalence with the original numpy implementation +""" + +from unittest.mock import MagicMock + +import numpy as np +import pytest +import torch + +from tests.v1.attention.utils import create_vllm_config +from vllm.attention.layers.chunked_local_attention import ( + create_chunked_local_attention_backend, +) +from vllm.utils.math_utils import cdiv +from vllm.v1.attention.backends.utils import ( + AttentionMetadataBuilder, + CommonAttentionMetadata, +) + + +def _make_local_attention_virtual_batches_reference( + attn_chunk_size: int, + query_start_loc_np: np.ndarray, + seq_lens_np: np.ndarray, + block_table: torch.Tensor, + block_size: int, +) -> tuple[np.ndarray, np.ndarray, np.ndarray, torch.Tensor]: + """ + Reference implementation using numpy (the original algorithm). 
+ Returns: (cu_seqlens_q_local, seqlens_q_local, seqlens_k_local, block_table_local) + """ + q_seqlens = query_start_loc_np[1:] - query_start_loc_np[:-1] + actual_batch_size = seq_lens_np.shape[0] + + # q_tokens_in_first_block + q_tokens_in_first_block = np.minimum( + attn_chunk_size - ((seq_lens_np - q_seqlens) % attn_chunk_size), q_seqlens + ).astype(np.int32) + tokens_in_last_block = attn_chunk_size + (seq_lens_np % -attn_chunk_size) + local_blocks = 1 + cdiv(q_seqlens - q_tokens_in_first_block, attn_chunk_size) + + # Batched arange + cu_num_blocks = np.cumsum(local_blocks) + virtual_batches = cu_num_blocks[-1] + block_offsets = np.repeat(cu_num_blocks - local_blocks, local_blocks) + arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets + rarange = np.repeat(local_blocks, local_blocks) - arange - 1 + + # seqlens_q_local + seqlens_q_local = np.repeat(q_seqlens - q_tokens_in_first_block, local_blocks) + seqlens_q_local[arange == 0] = q_tokens_in_first_block + seqlens_q_local[arange > 0] = np.minimum( + seqlens_q_local - attn_chunk_size * (arange - 1), attn_chunk_size + )[arange > 0] + + # cu_seqlens_q_local + cu_seqlens_q_local = np.empty(virtual_batches + 1, dtype=np.int32) + np.cumsum(seqlens_q_local, out=cu_seqlens_q_local[1:]) + cu_seqlens_q_local[0] = 0 + + # seqlens_k_local + seqlens_k_local = np.full(cu_num_blocks[-1], attn_chunk_size, dtype=np.int32) + seqlens_k_local[cu_num_blocks - 1] = tokens_in_last_block + + # Block table + k_seqstarts_absolute = np.repeat(seq_lens_np, local_blocks) - ( + rarange * attn_chunk_size + np.repeat(tokens_in_last_block, local_blocks) + ) + block_starts = k_seqstarts_absolute // block_size + pages_per_local_batch = attn_chunk_size // block_size + + block_indices = block_starts[:, None] + np.arange( + pages_per_local_batch, dtype=np.int32 + ) + block_indices = block_indices.reshape(-1).clip(max=block_table.shape[1] - 1) + batch_indices = np.repeat( + np.arange(actual_batch_size, dtype=np.int32), + local_blocks * pages_per_local_batch, + ) + + batch_indices_torch = torch.from_numpy(batch_indices) + block_indices_torch = torch.from_numpy(block_indices) + block_table_local = block_table[batch_indices_torch, block_indices_torch].view( + virtual_batches, -1 + ) + + return cu_seqlens_q_local, seqlens_q_local, seqlens_k_local, block_table_local + + +def create_mock_underlying_backend(device: torch.device): + """Create a mock underlying attention backend for testing.""" + + class MockMetadata: + """Minimal metadata that captures what was passed to build().""" + + pass + + class MockUnderlyingBuilder(AttentionMetadataBuilder[MockMetadata]): + def __init__( + self, + kv_cache_spec, + layer_names, + vllm_config, + device, + ): + self.kv_cache_spec = kv_cache_spec + self.layer_names = layer_names + self.vllm_config = vllm_config + self.device = device + # Capture what was passed to build for verification + self.last_common_attn_metadata = None + + def build( + self, + common_prefix_len: int, + common_attn_metadata: CommonAttentionMetadata, + fast_build: bool = False, + ) -> MockMetadata: + # Capture the metadata for test verification + self.last_common_attn_metadata = common_attn_metadata + return MockMetadata() + + def update_block_table(self, metadata, blk_table, slot_mapping): + return metadata + + class MockUnderlyingBackend: + @classmethod + def get_builder_cls(cls): + return MockUnderlyingBuilder + + return MockUnderlyingBackend + + +def create_test_metadata( + query_lens: list[int], + seq_lens: list[int], + block_size: int, + device: 
torch.device, + arange_block_indices: bool = True, +) -> CommonAttentionMetadata: + """Create CommonAttentionMetadata for testing.""" + batch_size = len(query_lens) + max_seq_len = max(seq_lens) + max_blocks = cdiv(max_seq_len, block_size) + + # Create cumulative query_start_loc + query_start_loc = torch.zeros(batch_size + 1, dtype=torch.int32, device=device) + for i, q in enumerate(query_lens): + query_start_loc[i + 1] = query_start_loc[i] + q + + seq_lens_t = torch.tensor(seq_lens, dtype=torch.int32, device=device) + + # Create block table + if arange_block_indices: + block_table = torch.arange( + batch_size * max_blocks, dtype=torch.int32, device=device + ).view(batch_size, max_blocks) + else: + block_table = torch.randint( + 0, 1000, (batch_size, max_blocks), dtype=torch.int32, device=device + ) + + return CommonAttentionMetadata( + query_start_loc=query_start_loc, + query_start_loc_cpu=query_start_loc.cpu(), + seq_lens=seq_lens_t, + num_reqs=batch_size, + num_actual_tokens=sum(query_lens), + max_query_len=max(query_lens), + max_seq_len=max_seq_len, + block_table_tensor=block_table, + slot_mapping=torch.zeros(sum(query_lens), dtype=torch.int64, device=device), + ) + + +def build_virtual_batches( + query_lens: list[int], + seq_lens: list[int], + attn_chunk_size: int, + block_size: int, + device: torch.device, + arange_block_indices: bool = True, +) -> CommonAttentionMetadata: + """Build chunked local attention metadata using the real builder.""" + meta = create_test_metadata( + query_lens=query_lens, + seq_lens=seq_lens, + block_size=block_size, + device=device, + arange_block_indices=arange_block_indices, + ) + + # Create the backend + mock_backend = create_mock_underlying_backend(device) + chunked_backend = create_chunked_local_attention_backend( + mock_backend, attn_chunk_size, block_size + ) + + # Create mock kv_cache_spec + mock_kv_cache_spec = MagicMock() + mock_kv_cache_spec.block_size = block_size + + # Create vllm_config with enough capacity + vllm_config = create_vllm_config( + max_num_seqs=len(query_lens), + max_num_batched_tokens=max(sum(query_lens), len(query_lens)), + block_size=block_size, + ) + + # Create the builder + builder_cls = chunked_backend.get_builder_cls() + builder = builder_cls( + kv_cache_spec=mock_kv_cache_spec, + layer_names=["layer0"], + vllm_config=vllm_config, + device=device, + ) + + # Build and return the result + builder.build(0, meta) + + return builder.last_common_attn_metadata, meta + + +def get_actual_seqlens(result: CommonAttentionMetadata) -> tuple[list[int], list[int]]: + """Extract actual (non-padded) seqlens from result.""" + q_seqlens = (result.query_start_loc[1:] - result.query_start_loc[:-1]).cpu() + k_seqlens = result.seq_lens.cpu() + # Find actual count (first zero or end) + nonzero_mask = k_seqlens > 0 + if nonzero_mask.all(): + actual_count = len(k_seqlens) + else: + actual_count = int(nonzero_mask.int().argmin()) + if actual_count == 0 and k_seqlens[0] > 0: + actual_count = len(k_seqlens) + return q_seqlens[:actual_count].tolist(), k_seqlens[:actual_count].tolist() + + +class TestLocalAttentionKernelBasic: + """Basic correctness tests.""" + + @pytest.fixture + def device(self): + return torch.device("cuda:0") + + def test_single_batch_single_chunk(self, device): + """Single batch that fits in one chunk.""" + result, _ = build_virtual_batches( + query_lens=[3], seq_lens=[3], attn_chunk_size=4, block_size=2, device=device + ) + + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert q_seqlens == [3] + assert k_seqlens == [3] + + 
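To make the expected values in the chunking tests below easy to verify by hand, here is the per-request arithmetic from the numpy reference above applied to the next case (pure Python, mirroring the reference formulas):

# One request: 10 new query tokens, 10 total tokens in the sequence, chunk size 4.
attn_chunk_size, q_len, seq_len = 4, 10, 10

# Query tokens that land in the first (possibly partial) chunk.
q_tokens_in_first_block = min(
    attn_chunk_size - ((seq_len - q_len) % attn_chunk_size), q_len
)  # -> 4
# Number of local (virtual) batches this request expands into (cdiv via negated floor division).
local_blocks = 1 + -(-(q_len - q_tokens_in_first_block) // attn_chunk_size)  # -> 3
# Tokens in the request's last chunk.
tokens_in_last_block = attn_chunk_size + (seq_len % -attn_chunk_size)  # -> 2

# Matches the [4, 4, 2] split asserted in test_single_batch_multiple_chunks below.
assert (q_tokens_in_first_block, local_blocks, tokens_in_last_block) == (4, 3, 2)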
def test_single_batch_multiple_chunks(self, device): + """Single batch spanning multiple chunks.""" + result, _ = build_virtual_batches( + query_lens=[10], + seq_lens=[10], + attn_chunk_size=4, + block_size=2, + device=device, + ) + + # 10 tokens with chunk_size=4: chunks at [0-4), [4-8), [8-10) + # -> 3 virtual batches + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert q_seqlens == [4, 4, 2] + assert k_seqlens == [4, 4, 2] + + def test_multiple_batches_uniform(self, device): + """Multiple batches with uniform sizes.""" + result, _ = build_virtual_batches( + query_lens=[4, 4, 4], + seq_lens=[4, 4, 4], + attn_chunk_size=4, + block_size=2, + device=device, + ) + + # Each batch produces 1 virtual batch + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert q_seqlens == [4, 4, 4] + assert k_seqlens == [4, 4, 4] + + def test_docstring_example(self, device): + """Test the example from the docstring.""" + result, _ = build_virtual_batches( + query_lens=[4, 10, 5], + seq_lens=[6, 17, 9], + attn_chunk_size=4, + block_size=2, + device=device, + ) + + expected_q_seqlens = [2, 2, 1, 4, 4, 1, 4, 1] + expected_k_seqlens = [4, 2, 4, 4, 4, 1, 4, 1] + + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert q_seqlens == expected_q_seqlens + assert k_seqlens == expected_k_seqlens + + +class TestLocalAttentionKernelEdgeCases: + """Edge case tests.""" + + @pytest.fixture + def device(self): + return torch.device("cuda:0") + + def test_single_token_per_batch(self, device): + """Each batch has only one token.""" + result, _ = build_virtual_batches( + query_lens=[1, 1, 1], + seq_lens=[1, 1, 1], + attn_chunk_size=4, + block_size=2, + device=device, + ) + + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert q_seqlens == [1, 1, 1] + assert k_seqlens == [1, 1, 1] + + def test_chunk_size_larger_than_seq(self, device): + """Chunk size larger than sequence length.""" + result, _ = build_virtual_batches( + query_lens=[3], + seq_lens=[5], + attn_chunk_size=10, + block_size=2, + device=device, + ) + + # Everything fits in one chunk + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert k_seqlens == [5] + + def test_chunk_equals_block_size(self, device): + """Chunk size equals block size.""" + result, _ = build_virtual_batches( + query_lens=[8], seq_lens=[8], attn_chunk_size=4, block_size=4, device=device + ) + + # 8 tokens with chunk=block=4: 2 virtual batches + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert q_seqlens == [4, 4] + + def test_prefill_with_context(self, device): + """Query starts in the middle of a chunk (has context).""" + # seq_lens=5, query_lens=1 -> 4 context tokens + # With chunk_size=4, query starts in second chunk + result, _ = build_virtual_batches( + query_lens=[1], seq_lens=[5], attn_chunk_size=4, block_size=2, device=device + ) + + # Query is in second chunk [4-5), so 1 virtual batch + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert q_seqlens == [1] + assert k_seqlens == [1] + + +class TestLocalAttentionKernelLargeBatches: + """Tests with larger batch sizes to stress binary search.""" + + @pytest.fixture + def device(self): + return torch.device("cuda:0") + + def test_large_batch_count(self, device): + """Many small batches.""" + batch_size = 100 + result, _ = build_virtual_batches( + query_lens=[4] * batch_size, + seq_lens=[4] * batch_size, + attn_chunk_size=4, + block_size=2, + device=device, + ) + + # Each batch produces 1 virtual batch + q_seqlens, k_seqlens = get_actual_seqlens(result) + assert len(q_seqlens) == batch_size + + def 
test_large_batch_varying_sizes(self, device): + """Many batches with varying sizes.""" + batch_size = 50 + query_lens = [(i % 10) + 1 for i in range(batch_size)] + seq_lens = [(i % 10) + 5 for i in range(batch_size)] + + result, _ = build_virtual_batches( + query_lens=query_lens, + seq_lens=seq_lens, + attn_chunk_size=4, + block_size=2, + device=device, + ) + + # Verify total query tokens preserved + q_seqlens, _ = get_actual_seqlens(result) + assert sum(q_seqlens) == sum(query_lens) + + +class TestLocalAttentionKernelBlockTable: + """Tests for block table correctness.""" + + @pytest.fixture + def device(self): + return torch.device("cuda:0") + + def test_block_table_values(self, device): + """Verify block table values are correct.""" + result, _ = build_virtual_batches( + query_lens=[4, 10, 5], + seq_lens=[6, 17, 9], + attn_chunk_size=4, + block_size=2, + device=device, + arange_block_indices=True, + ) + + # Expected block table from docstring + expected = [ + [0, 1], # batch 0, k[0-4) + [2, 3], # batch 0, k[4-6) + [11, 12], # batch 1, k[4-8) + [13, 14], # batch 1, k[8-12) + [15, 16], # batch 1, k[12-16) + [17, 17], # batch 1, k[16-17) - clipped + [20, 21], # batch 2, k[4-8) + [22, 23], # batch 2, k[8-9) + ] + expected_tensor = torch.tensor(expected, dtype=torch.int32, device=device) + # Compare only actual (non-padded) entries + _, k_seqlens = get_actual_seqlens(result) + actual_count = len(k_seqlens) + torch.testing.assert_close( + result.block_table_tensor[:actual_count], expected_tensor + ) + + def test_block_table_shape(self, device): + """Verify block table has correct shape for actual entries.""" + result, _ = build_virtual_batches( + query_lens=[8], + seq_lens=[12], + attn_chunk_size=4, + block_size=4, + device=device, + ) + + # pages_per_local_batch = 4 / 4 = 1 + # 2 actual virtual batches (may have padding) + _, k_seqlens = get_actual_seqlens(result) + actual_count = len(k_seqlens) + assert actual_count == 2 + assert result.block_table_tensor.shape[1] == 1 # pages_per_local_batch + + +class TestLocalAttentionKernelInvariants: + """Tests for mathematical invariants.""" + + @pytest.fixture + def device(self): + return torch.device("cuda:0") + + @pytest.mark.parametrize("chunk_size", [4, 8, 16]) + @pytest.mark.parametrize("block_size", [2, 4]) + def test_seqlens_invariants(self, device, chunk_size, block_size): + """Verify seqlen invariants hold.""" + if chunk_size % block_size != 0: + pytest.skip("chunk_size must be divisible by block_size") + + result, _ = build_virtual_batches( + query_lens=[7, 15, 3], + seq_lens=[10, 20, 8], + attn_chunk_size=chunk_size, + block_size=block_size, + device=device, + ) + + q_seqlens, k_seqlens = get_actual_seqlens(result) + + # All q_seqlens <= chunk_size + assert all(q <= chunk_size for q in q_seqlens) + + # All k_seqlens <= chunk_size + assert all(k <= chunk_size for k in k_seqlens) + + # Total q tokens preserved + assert sum(q_seqlens) == 7 + 15 + 3 + + def test_cumsum_consistency(self, device): + """Verify cu_seqlens is consistent with seqlens.""" + result, _ = build_virtual_batches( + query_lens=[5, 12, 7], + seq_lens=[8, 15, 10], + attn_chunk_size=4, + block_size=2, + device=device, + ) + + q_seqlens_full = result.query_start_loc[1:] - result.query_start_loc[:-1] + + # Recompute cumsum and verify + expected_cu = torch.zeros( + result.num_reqs + 1, dtype=torch.int32, device=result.query_start_loc.device + ) + torch.cumsum(q_seqlens_full, dim=0, out=expected_cu[1:]) + + torch.testing.assert_close(result.query_start_loc, expected_cu) + + +class 
TestLocalAttentionVsReference: + """Tests comparing Triton implementation against original numpy reference.""" + + @pytest.fixture + def device(self): + return torch.device("cuda:0") + + def _run_comparison( + self, + query_lens: list[int], + seq_lens: list[int], + attn_chunk_size: int, + block_size: int, + device: torch.device, + ): + """Run both implementations and compare results.""" + result, meta = build_virtual_batches( + query_lens=query_lens, + seq_lens=seq_lens, + attn_chunk_size=attn_chunk_size, + block_size=block_size, + device=device, + arange_block_indices=True, + ) + + # Run reference implementation + query_start_loc_np = meta.query_start_loc.cpu().numpy() + seq_lens_np = meta.seq_lens.cpu().numpy() + ref_cu_seqlens_q, ref_seqlens_q, ref_seqlens_k, ref_block_table = ( + _make_local_attention_virtual_batches_reference( + attn_chunk_size, + query_start_loc_np, + seq_lens_np, + meta.block_table_tensor, + block_size, + ) + ) + + # Get actual count (non-padded entries) + actual_count = len(ref_seqlens_q) + + # Compare results (trim padding) + actual_seqlens_q = ( + (result.query_start_loc[1:] - result.query_start_loc[:-1]) + .cpu() + .numpy()[:actual_count] + ) + actual_seqlens_k = result.seq_lens.cpu().numpy()[:actual_count] + actual_cu_seqlens_q = result.query_start_loc.cpu().numpy()[: actual_count + 1] + actual_block_table = result.block_table_tensor[:actual_count] + + np.testing.assert_array_equal( + actual_seqlens_q, ref_seqlens_q, err_msg="seqlens_q mismatch" + ) + np.testing.assert_array_equal( + actual_seqlens_k, ref_seqlens_k, err_msg="seqlens_k mismatch" + ) + np.testing.assert_array_equal( + actual_cu_seqlens_q, ref_cu_seqlens_q, err_msg="cu_seqlens_q mismatch" + ) + torch.testing.assert_close( + actual_block_table, ref_block_table, msg="block_table mismatch" + ) + + def test_docstring_example_vs_reference(self, device): + """Test the docstring example against reference.""" + self._run_comparison( + query_lens=[4, 10, 5], + seq_lens=[6, 17, 9], + attn_chunk_size=4, + block_size=2, + device=device, + ) + + def test_single_batch_vs_reference(self, device): + """Single batch comparison.""" + self._run_comparison( + query_lens=[8], + seq_lens=[12], + attn_chunk_size=4, + block_size=2, + device=device, + ) + + def test_many_batches_vs_reference(self, device): + """Many batches comparison.""" + self._run_comparison( + query_lens=[3, 7, 2, 9, 5, 1, 8, 4], + seq_lens=[5, 10, 6, 15, 8, 3, 12, 7], + attn_chunk_size=4, + block_size=2, + device=device, + ) + + def test_large_chunk_size_vs_reference(self, device): + """Large chunk size comparison.""" + self._run_comparison( + query_lens=[5, 12, 8], + seq_lens=[10, 20, 15], + attn_chunk_size=16, + block_size=4, + device=device, + ) + + def test_chunk_equals_block_vs_reference(self, device): + """Chunk size equals block size comparison.""" + self._run_comparison( + query_lens=[8, 12], + seq_lens=[8, 12], + attn_chunk_size=4, + block_size=4, + device=device, + ) + + @pytest.mark.parametrize("batch_size", [10, 50, 100]) + def test_random_batches_vs_reference(self, device, batch_size): + """Random batch configurations comparison.""" + np.random.seed(42 + batch_size) + query_lens = np.random.randint(1, 20, size=batch_size).tolist() + # seq_lens >= query_lens + seq_lens = [q + np.random.randint(0, 10) for q in query_lens] + + self._run_comparison( + query_lens=query_lens, + seq_lens=seq_lens, + attn_chunk_size=4, + block_size=2, + device=device, + ) + + @pytest.mark.parametrize( + "chunk_size,block_size", [(4, 2), (8, 2), (8, 4), (16, 4), 
(16, 8), (32, 8)] + ) + def test_various_sizes_vs_reference(self, device, chunk_size, block_size): + """Various chunk and block size combinations.""" + self._run_comparison( + query_lens=[7, 15, 3, 22, 9], + seq_lens=[10, 25, 8, 30, 12], + attn_chunk_size=chunk_size, + block_size=block_size, + device=device, + ) + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/tests/v1/attention/test_mla_backends.py b/tests/v1/attention/test_mla_backends.py index 783e02ce89bd..bd2feac41100 100644 --- a/tests/v1/attention/test_mla_backends.py +++ b/tests/v1/attention/test_mla_backends.py @@ -154,12 +154,12 @@ def create_and_prepopulate_kv_cache( MLA KV cache tensor """ batch_size = len(kv_c_contexts) - seq_lens = common_attn_metadata.seq_lens_cpu + seq_lens = common_attn_metadata.seq_lens.cpu() query_lens = ( common_attn_metadata.query_start_loc_cpu[1:] - common_attn_metadata.query_start_loc_cpu[:-1] ) - context_lens = common_attn_metadata.num_computed_tokens_cpu + context_lens = seq_lens - query_lens block_table = common_attn_metadata.block_table_tensor slot_mapping = common_attn_metadata.slot_mapping diff --git a/tests/v1/attention/test_sparse_mla_backends.py b/tests/v1/attention/test_sparse_mla_backends.py index 9b7c5822db98..f4ca3dccfb5e 100644 --- a/tests/v1/attention/test_sparse_mla_backends.py +++ b/tests/v1/attention/test_sparse_mla_backends.py @@ -297,7 +297,7 @@ def test_sparse_backend_decode_correctness( positions = np.arange(starts[-1], dtype=np.int32) - np.repeat( starts[:-1], seg_lengths ) - seq_lengths = np.asarray(common_attn_metadata.seq_lens_cpu, dtype=np.int32) + seq_lengths = np.asarray(common_attn_metadata.seq_lens.cpu(), dtype=np.int32) prefix_lengths = seq_lengths - seg_lengths positions += np.repeat(prefix_lengths, seg_lengths) diff --git a/tests/v1/core/test_scheduler.py b/tests/v1/core/test_scheduler.py index 1999e9f6c3b9..b44c2a1c3fb0 100644 --- a/tests/v1/core/test_scheduler.py +++ b/tests/v1/core/test_scheduler.py @@ -2284,7 +2284,6 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( # 4th Schedule - this should trigger the resumption output = scheduler.schedule() scheduled_cached_reqs = output.scheduled_cached_reqs - resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption assert len(output.scheduled_new_reqs) == 0 assert scheduled_cached_reqs.num_reqs == 1 @@ -2292,14 +2291,14 @@ def test_priority_scheduling_preemption_and_resumption_when_out_of_kv( assert len(scheduler.running) == 1 # Preempted request resumed in scheduled_cached_reqs - assert len(resumed_from_preemption) == 1 - assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1 - assert resumed_from_preemption[0] + assert len(scheduled_cached_reqs.resumed_req_ids) == 1 + assert len(scheduled_cached_reqs.all_token_ids) == 1 assert scheduled_cached_reqs.req_ids[0] == request_low.request_id - assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None + assert request_low.request_id in scheduled_cached_reqs.resumed_req_ids + assert request_low.request_id in scheduled_cached_reqs.all_token_ids # Resumed tokens include 30 prompt tokens and 2 decoded tokens - assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 32 - assert scheduled_cached_reqs.resumed_req_token_ids[0][31] == 100 + assert len(scheduled_cached_reqs.all_token_ids[request_low.request_id]) == 32 + assert scheduled_cached_reqs.all_token_ids[request_low.request_id][31] == 100 @pytest.mark.parametrize( @@ -3122,7 +3121,6 @@ def 
test_priority_scheduling_ec_connector_preemption_and_resumption( # 4th Schedule - this should trigger req_low resumption from waiting output = scheduler.schedule() scheduled_cached_reqs = output.scheduled_cached_reqs - resumed_from_preemption = scheduled_cached_reqs.resumed_from_preemption assert len(output.scheduled_new_reqs) == 0 assert scheduled_cached_reqs.num_reqs == 1 @@ -3130,14 +3128,14 @@ def test_priority_scheduling_ec_connector_preemption_and_resumption( assert len(scheduler.running) == 1 # Preempted request resumed in scheduled_cached_reqs - assert len(resumed_from_preemption) == 1 - assert len(scheduled_cached_reqs.resumed_req_token_ids) == 1 - assert resumed_from_preemption[0] + assert len(scheduled_cached_reqs.resumed_req_ids) == 1 + assert len(scheduled_cached_reqs.all_token_ids) == 1 assert scheduled_cached_reqs.req_ids[0] == request_low.request_id - assert scheduled_cached_reqs.resumed_req_token_ids[0] is not None + assert request_low.request_id in scheduled_cached_reqs.resumed_req_ids + assert request_low.request_id in scheduled_cached_reqs.all_token_ids ## Resumed tokens include 94 prompt tokens and 2 decoded tokens - assert len(scheduled_cached_reqs.resumed_req_token_ids[0]) == 96 - assert scheduled_cached_reqs.resumed_req_token_ids[0][95] == 100 + assert len(scheduled_cached_reqs.all_token_ids[request_low.request_id]) == 96 + assert scheduled_cached_reqs.all_token_ids[request_low.request_id][95] == 100 assert scheduler.running[0].request_id == request_low.request_id assert request_high.request_id in output.finished_req_ids diff --git a/tests/v1/e2e/test_async_scheduling.py b/tests/v1/e2e/test_async_scheduling.py index 6447a33838d7..c18546ce45d6 100644 --- a/tests/v1/e2e/test_async_scheduling.py +++ b/tests/v1/e2e/test_async_scheduling.py @@ -154,7 +154,7 @@ def run_tests( with monkeypatch.context() as m: # lock matmul precision to full FP32 (IEEE) - m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "ieee") + m.setenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest") # m.setenv("VLLM_BATCH_INVARIANT", "1") outputs: list[tuple[str, list, list]] = [] for n, ( diff --git a/tests/v1/engine/test_engine_core_client.py b/tests/v1/engine/test_engine_core_client.py index a0e2e5e25a47..5067fc255d73 100644 --- a/tests/v1/engine/test_engine_core_client.py +++ b/tests/v1/engine/test_engine_core_client.py @@ -133,6 +133,7 @@ def setsockopt(self, *_args, **_kwargs): parallel_config = SimpleNamespace( data_parallel_size=1, data_parallel_rank=0, + data_parallel_index=0, data_parallel_size_local=1, data_parallel_rank_local=None, data_parallel_hybrid_lb=False, diff --git a/tests/v1/engine/test_output_processor.py b/tests/v1/engine/test_output_processor.py index f1185222f713..9630f8cae3ed 100644 --- a/tests/v1/engine/test_output_processor.py +++ b/tests/v1/engine/test_output_processor.py @@ -274,12 +274,28 @@ def _validate_logprobs( # the logprob token id at this sequence position decoded_token = pos_logprob_dict[lp_tok].decoded_token ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, lp_tok) - assert decoded_token == ref_decoded_token, ( - f"Sampled logprob token id {lp_tok} decodes to" - f" {ref_decoded_token} but Logprob decoded" - f" token is {decoded_token} instead" - f" (at position {idx})" - ) + + # With UTF-8 correction logic, tokens ending with "�" + # (incomplete byte sequences) are corrected to either + # empty string or proper UTF-8 characters + if ref_decoded_token.endswith("�"): + # Token needs UTF-8 correction + assert not decoded_token.endswith("�"), ( + f"Sampled logprob token 
id {lp_tok} decodes to" + f" '{ref_decoded_token}' (ends with replacement char)" + f" but corrected decoded token '{decoded_token}'" + f" still ends with replacement char" + f" (at position {idx}). UTF-8 correction should" + f" have removed it." + ) + else: + # No correction needed, should match exactly + assert decoded_token == ref_decoded_token, ( + f"Sampled logprob token id {lp_tok} decodes to" + f" {ref_decoded_token} but Logprob decoded" + f" token is {decoded_token} instead" + f" (at position {idx})" + ) ref_cumulative_logprob += pos_logprob_dict[sampled_token].logprob # Assert that cumulative logprobs are correct @@ -420,12 +436,28 @@ def _validate_logprobs( # the logprob token id at this sequence position decoded_token = pos_logprob_dict[plp_tok].decoded_token ref_decoded_token = _ref_convert_id_to_token(dtv.tokenizer, plp_tok) - assert decoded_token == ref_decoded_token, ( - f"Prompt logprob token id {plp_tok} decodes to" - f" {ref_decoded_token} but Logprob decoded" - f" token is {decoded_token} instead" - f" (at position {idx})" - ) + + # With UTF-8 correction logic, tokens ending with "�" + # (incomplete byte sequences) are corrected to either + # empty string or proper UTF-8 characters + if ref_decoded_token.endswith("�"): + # Token needs UTF-8 correction + assert not decoded_token.endswith("�"), ( + f"Prompt logprob token id {plp_tok} decodes to" + f" '{ref_decoded_token}' (ends with replacement char)" + f" but corrected decoded token '{decoded_token}'" + f" still ends with replacement char" + f" (at position {idx}). UTF-8 correction should" + f" have removed it." + ) + else: + # No correction needed, should match exactly + assert decoded_token == ref_decoded_token, ( + f"Prompt logprob token id {plp_tok} decodes to" + f" {ref_decoded_token} but Logprob decoded" + f" token is {decoded_token} instead" + f" (at position {idx})" + ) else: # Prompt logprobs disabled for this request assert prompt_logprobs is None diff --git a/tests/v1/kv_offload/test_cpu_gpu.py b/tests/v1/kv_offload/test_cpu_gpu.py index 3516c0013879..4d9d54e038e8 100644 --- a/tests/v1/kv_offload/test_cpu_gpu.py +++ b/tests/v1/kv_offload/test_cpu_gpu.py @@ -7,6 +7,7 @@ import torch from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed from vllm.v1.attention.backends.flash_attn import FlashAttentionBackend from vllm.v1.kv_offload.mediums import CPULoadStoreSpec, GPULoadStoreSpec from vllm.v1.kv_offload.worker.cpu_gpu import CpuGpuOffloadingHandlers @@ -62,7 +63,7 @@ def test_transfer( seed: int, device: str, ) -> None: - current_platform.seed_everything(seed) + set_random_seed(seed) # create per-layer GPU KV caches based on available attn_backends attn_backends_list = BACKENDS_TO_TEST diff --git a/tests/v1/kv_offload/test_cpu_offloading.py b/tests/v1/kv_offload/test_cpu_offloading.py index 1ac5e5b8cdc5..239f3b5d41e4 100644 --- a/tests/v1/kv_offload/test_cpu_offloading.py +++ b/tests/v1/kv_offload/test_cpu_offloading.py @@ -15,12 +15,10 @@ from vllm.platforms import current_platform CPU_BLOCK_SIZES = [48] -ATTN_BACKENDS = ["FLASH_ATTN"] +ATTN_BACKENDS = ["FLASH_ATTN", "TRITON_ATTN"] if current_platform.is_cuda(): ATTN_BACKENDS.append("FLASHINFER") -elif current_platform.is_rocm(): - ATTN_BACKENDS = ["TRITON_ATTN"] class MockSubscriber: diff --git a/tests/v1/metrics/test_perf_metrics.py b/tests/v1/metrics/test_perf_metrics.py index b6cda7bef3d4..e3846a7a3ef1 100644 --- a/tests/v1/metrics/test_perf_metrics.py +++ b/tests/v1/metrics/test_perf_metrics.py @@ -16,6 +16,10 @@ 
from transformers.models.qwen3_moe.configuration_qwen3_moe import Qwen3MoeConfig from vllm.config.model import ModelConfig, get_hf_text_config +from vllm.transformers_utils.model_arch_config_convertor import ( + MODEL_ARCH_CONFIG_CONVERTORS, + ModelArchConfigConvertorBase, +) from vllm.v1.metrics.perf import ( AttentionMetrics, BaseConfigParser, @@ -33,6 +37,12 @@ class MockModelConfig: def __init__(self, hf_config, dtype): self.hf_config = hf_config self.hf_text_config = get_hf_text_config(hf_config) + convertor_cls = MODEL_ARCH_CONFIG_CONVERTORS.get( + self.hf_config.model_type, ModelArchConfigConvertorBase + ) + self.model_arch_config = convertor_cls( + self.hf_config, self.hf_text_config + ).convert() self.dtype = dtype self.is_attention_free = False diff --git a/tests/v1/sample/test_logprobs.py b/tests/v1/sample/test_logprobs.py index 1e2cc2241ba9..abb3ce2ef4ab 100644 --- a/tests/v1/sample/test_logprobs.py +++ b/tests/v1/sample/test_logprobs.py @@ -514,6 +514,424 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode): del llm +class TestCorrectDecodedToken: + """Unit tests for _correct_decoded_token method in LogprobsProcessor. + + This method handles UTF-8 decoding issues where incomplete byte sequences + result in the Unicode replacement character "�" (U+FFFD). This commonly + happens with byte-fallback tokenization when multi-byte UTF-8 characters + are split across tokens. + """ + + @pytest.fixture + def mock_tokenizer(self): + """Create a mock tokenizer for testing.""" + from unittest.mock import Mock + + tokenizer = Mock() + return tokenizer + + @pytest.fixture + def processor_with_empty_logprobs(self, mock_tokenizer): + """Create a LogprobsProcessor with empty logprobs.""" + from vllm.v1.engine.logprobs import LogprobsProcessor + + processor = LogprobsProcessor( + tokenizer=mock_tokenizer, + logprobs=[], + prompt_logprobs=None, + cumulative_logprob=0.0, + num_logprobs=1, + num_prompt_logprobs=None, + ) + return processor + + @pytest.fixture + def processor_with_previous_logprobs(self, mock_tokenizer): + """Create a LogprobsProcessor with previous logprobs.""" + from vllm.v1.engine.logprobs import LogprobsProcessor + + processor = LogprobsProcessor( + tokenizer=mock_tokenizer, + logprobs=[{123: None}], # Previous token ID is 123 + prompt_logprobs=None, + cumulative_logprob=0.0, + num_logprobs=1, + num_prompt_logprobs=None, + ) + return processor + + def test_correction_with_previous_token_in_list( + self, processor_with_empty_logprobs + ): + """Test correction using previous token in the same list. + + Scenario: Token at idx=1 ends with "�", but when decoded with + the previous token (idx=0), it forms a valid UTF-8 sequence. + Example: token[0]="�", token[1]="�" -> together form "polarized" + """ + processor = processor_with_empty_logprobs + tokens = [100, 101, 102] # token IDs + + # Mock tokenizer behavior: + # - decode([102]) returns "�" (ends with replacement char) + # - decode([101, 102]) returns "valid" (no replacement char) + processor.tokenizer.decode.side_effect = lambda ids: ( + "valid" if ids == [101, 102] else "�" + ) + + result = processor._correct_decoded_token(2, tokens) + assert result == "valid" + processor.tokenizer.decode.assert_called_with([101, 102]) + + def test_correction_with_previous_logprob_token( + self, processor_with_previous_logprobs + ): + """Test correction using previous logprob token. + + Scenario: Cannot correct with previous token in list (idx=0), + but can correct with previous logprob token. 
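+
+        Illustrative sketch of the underlying issue (editor's note: the byte
+        split below is hypothetical and independent of any real tokenizer):
+
+            first, second = "é".encode()[:1], "é".encode()[1:]  # 2 UTF-8 bytes
+            first.decode("utf-8", errors="replace")    # -> "�" (incomplete)
+            (first + second).decode("utf-8")           # -> "é"
+
+        Decoding the fragment together with its predecessor resolves the
+        replacement character, which is what this test exercises via the
+        previous logprob token.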
+ """ + processor = processor_with_previous_logprobs + tokens = [100] # single token + + # Mock tokenizer behavior: + # - decode([100]) returns "�" (ends with replacement char) + # - decode([123, 100]) returns " "polarized" (no replacement char) + # Token 123 is from previous logprobs + def mock_decode(ids): + if ids == [123, 100]: + return ' "polarized"' + return "�" + + processor.tokenizer.decode.side_effect = mock_decode + + result = processor._correct_decoded_token(0, tokens) + assert result == ' "polarized"' + + def test_correction_at_idx_zero_no_previous_logprobs( + self, processor_with_empty_logprobs + ): + """Test correction at idx=0 with no previous logprobs. + + Scenario: First token in list, no previous logprobs available. + Should return empty string as fallback. + """ + processor = processor_with_empty_logprobs + tokens = [100] + + # Mock tokenizer always returns "�" + processor.tokenizer.decode.return_value = "�" + + result = processor._correct_decoded_token(0, tokens) + assert result == "" + + def test_correction_at_idx_zero_with_previous_logprobs( + self, processor_with_previous_logprobs + ): + """Test correction at idx=0 with previous logprobs available. + + Scenario: First token in list, but previous logprobs exist. + Should try correction with previous logprob token. + """ + processor = processor_with_previous_logprobs + tokens = [200] + + # Mock tokenizer behavior + def mock_decode(ids): + if ids == [123, 200]: + return "corrected" + return "�" + + processor.tokenizer.decode.side_effect = mock_decode + + result = processor._correct_decoded_token(0, tokens) + assert result == "corrected" + + def test_no_correction_needed_returns_fallback( + self, processor_with_previous_logprobs + ): + """Test fallback to empty string when no correction works. + + Scenario: All correction attempts still end with "�". + Should return empty string as final fallback. + """ + processor = processor_with_previous_logprobs + tokens = [100, 101, 102] + + # Mock tokenizer always returns text ending with "�" + processor.tokenizer.decode.return_value = "still�" + + result = processor._correct_decoded_token(2, tokens) + assert result == "" + + def test_middle_token_correction(self, processor_with_previous_logprobs): + """Test correction for a token in the middle of the list. + + Scenario: Token at idx=5 in a longer list needs correction. + """ + processor = processor_with_previous_logprobs + tokens = [10, 20, 30, 40, 50, 60, 70, 80] + + # Mock tokenizer behavior for middle token + def mock_decode(ids): + if ids == [50, 60]: + return "olar" + return "�" + + processor.tokenizer.decode.side_effect = mock_decode + + result = processor._correct_decoded_token(5, tokens) + assert result == "olar" + + def test_multiple_consecutive_replacement_chars( + self, processor_with_previous_logprobs + ): + """Test handling of multiple consecutive replacement characters. + + Scenario: Sequence like ["�", "�", "p"] where first two should + become empty strings. + """ + processor = processor_with_previous_logprobs + + # Test first replacement char + tokens = [100, 101, 102] + processor.tokenizer.decode.return_value = "still�" + result1 = processor._correct_decoded_token(0, tokens) + assert result1 == "" + + # Test second replacement char + result2 = processor._correct_decoded_token(1, tokens) + assert result2 == "" + + def test_correction_with_multibyte_utf8(self, processor_with_previous_logprobs): + """Test correction involving multi-byte UTF-8 characters. 
+ + Scenario: Byte-fallback tokenization splits multi-byte UTF-8 + characters (e.g., curly quotes, Chinese characters, emojis). + Example from user: "�", "�" -> "", "\"" + """ + processor = processor_with_previous_logprobs + tokens = [200, 201] + + # Mock tokenizer behavior for multi-byte UTF-8 correction + def mock_decode(ids): + # When decoding first token (idx=0) with previous logprob token + if ids == [123, 200]: + return ' "' # Space + left curly quote + # When decoding second token (idx=1) with previous token in list + elif ids == [200, 201]: + return '"' # Right curly quote + # When decoding second token (idx=1) with previous logprob + prev token + elif ids == [123, 200, 201]: + return ' ""' # Full sequence + return "�" + + processor.tokenizer.decode.side_effect = mock_decode + + # First token correction (idx=0) + # Will call decode([123, 200]) since idx=0 uses previous logprob token + result1 = processor._correct_decoded_token(0, tokens) + assert result1 == ' "' + + # Second token correction (idx=1) + # Will call decode([200, 201]) since idx>0 uses previous token in list + result2 = processor._correct_decoded_token(1, tokens) + assert result2 == '"' + + def test_real_world_opt125m_scenario(self, mock_tokenizer): + """Test the real-world scenario from user's example. + + User's example with facebook/opt-125m: + Before: [" the", " term", " �", "�", "p", "olar", "ized", "�", "�", ...] + After: [" the", " term", "", " "", "p", "olar", "ized", "", "\"", ...] + """ + from vllm.v1.engine.logprobs import LogprobsProcessor + + # Simulate the sequence of tokens + processor = LogprobsProcessor( + tokenizer=mock_tokenizer, + logprobs=[], + prompt_logprobs=None, + cumulative_logprob=0.0, + num_logprobs=1, + num_prompt_logprobs=None, + ) + + # Token IDs representing the problematic sequence + tokens = [1, 2, 3, 4, 5, 6, 7, 8, 9] # placeholder IDs + + # Mock decode behavior simulating the real scenario + def mock_decode(ids): + # Simulate cases where individual tokens decode to "�" + # but combinations decode correctly + if len(ids) == 1: + if ids[0] == 3 or ids[0] == 4 or ids[0] == 8 or ids[0] == 9: + return "�" + elif len(ids) == 2: + if ids == [2, 3]: + return " term�" # Still ends with �, need more context + elif ids == [3, 4]: + return ' "' # Corrected to space + left curly quote + elif ids == [7, 8]: + return "ized�" # Still ends with � + elif ids == [8, 9]: + return '"' # Corrected to right curly quote + elif len(ids) == 3: + if ids == [1, 2, 3]: + return " the term�" # Still ends with issue + elif ids == [2, 3, 4]: + return ' term "' # With all context + return "normal_text" + + mock_tokenizer.decode.side_effect = mock_decode + + # Test token at index 2 (should fail to correct, return "") + # Token 3 individually is "�" + # decode([2, 3]) = " term�" (still ends with �) + # No previous logprobs, so fallback to "" + result = processor._correct_decoded_token(2, tokens) + assert result == "" + + # Test token at index 3 (should correct to " "") + # Token 4 individually is "�" + # decode([3, 4]) = " "" (corrected!) + processor.logprobs = [{2: None}] # Add previous logprob + result = processor._correct_decoded_token(3, tokens) + assert result == ' "' + + +def test_verify_tokens_integration(): + """Integration test for _verify_tokens with real model. + + This test validates that _verify_tokens correctly identifies and + corrects tokens ending with the replacement character "�". + Uses facebook/opt-125m which is known to produce these issues. 
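+
+        The invariant asserted below, written as a standalone predicate
+        (editor's note: contains_incomplete_utf8 is a hypothetical helper
+        for illustration, not part of vLLM):
+
+            def contains_incomplete_utf8(decoded: list[str]) -> bool:
+                return any(t.endswith("�") for t in decoded)
+
+        After correction, this predicate should be False for the decoded
+        logprob tokens of every request.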
+ """ + runner = VllmRunner( + "facebook/opt-125m", + max_logprobs=0, + enable_prefix_caching=False, + gpu_memory_utilization=0.15, + max_model_len=256, + ) + + # Use a prompt that triggers multi-byte UTF-8 issues + # Based on user's example: "In this example," + test_prompts = ["In this example,"] + + sampling_params = SamplingParams( + max_tokens=16, + temperature=0, + logprobs=0, + ) + + results = runner.llm.generate(test_prompts, sampling_params=sampling_params) + + # Verify that decoded tokens don't contain replacement characters + for result in results: + assert result.outputs[0].logprobs is not None + for logprob_dict in result.outputs[0].logprobs: + for token_id, logprob_info in logprob_dict.items(): + decoded_token = logprob_info.decoded_token + # Decoded tokens should not end with replacement character + # They should either be corrected or empty string + assert not decoded_token.endswith("�"), ( + f"Token {token_id} decoded to '{decoded_token}' which " + f"ends with replacement character" + ) + # Decoded tokens should not contain lone replacement characters + assert decoded_token != "�", ( + f"Token {token_id} is a lone replacement character" + ) + + +def test_utf8_edge_cases_with_real_model(): + """Test various UTF-8 edge cases with a real model. + + Tests prompts that are likely to trigger byte-fallback tokenization + and multi-byte UTF-8 splitting. + """ + runner = VllmRunner( + "facebook/opt-125m", + max_logprobs=1, + enable_prefix_caching=False, + gpu_memory_utilization=0.15, + max_model_len=256, + ) + + # Prompts with various multi-byte UTF-8 characters + test_prompts = [ + 'Smart quotes: "Hello"', # Curly quotes + "Em dash — test", # Em dash + "Ellipsis… continues", # Ellipsis + "Chinese: 你好", # Chinese characters + "Emoji: 😀 🎉", # Emojis + 'Mixed: "quoted" — with symbols', # Mixed + ] + + sampling_params = SamplingParams( + max_tokens=10, + temperature=0, + logprobs=1, + ) + + results = runner.llm.generate(test_prompts, sampling_params=sampling_params) + + for i, result in enumerate(results): + prompt = test_prompts[i] + assert result.outputs[0].logprobs is not None + + # Check that no decoded tokens end with replacement character + for logprob_dict in result.outputs[0].logprobs: + for token_id, logprob_info in logprob_dict.items(): + decoded_token = logprob_info.decoded_token + assert not decoded_token.endswith("�"), ( + f"Prompt: '{prompt}'\n" + f"Token {token_id} decoded to '{decoded_token}' which " + f"ends with replacement character" + ) + + +def test_correct_decoded_token_preserves_valid_tokens(): + """Test that valid tokens (not ending with �) are not modified. + + The _correct_decoded_token method should only be called for tokens + ending with "�", but this test verifies the broader _verify_tokens + logic doesn't affect valid tokens. 
+ """ + runner = VllmRunner( + "facebook/opt-125m", + max_logprobs=2, + enable_prefix_caching=False, + gpu_memory_utilization=0.15, + max_model_len=256, + ) + + # Simple prompt with standard ASCII characters + test_prompts = ["Hello world, this is a test."] + + sampling_params = SamplingParams( + max_tokens=10, + temperature=0, + logprobs=2, + ) + + results = runner.llm.generate(test_prompts, sampling_params=sampling_params) + + for result in results: + assert result.outputs[0].logprobs is not None + + # All decoded tokens should be valid strings + for logprob_dict in result.outputs[0].logprobs: + for token_id, logprob_info in logprob_dict.items(): + decoded_token = logprob_info.decoded_token + # Valid tokens should be non-empty strings (or empty if corrected) + assert isinstance(decoded_token, str) + # Should not contain replacement character + assert "�" not in decoded_token + + @pytest.mark.parametrize("logprobs_mode", get_args(LogprobsMode)) @pytest.mark.parametrize( "model_setup", diff --git a/tests/v1/tpu/test_mha_attn.py b/tests/v1/tpu/test_mha_attn.py index 84968dee6b60..b7ebf5919607 100644 --- a/tests/v1/tpu/test_mha_attn.py +++ b/tests/v1/tpu/test_mha_attn.py @@ -15,6 +15,7 @@ from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention from vllm.attention.selector import _cached_get_attn_backend from vllm.platforms import current_platform +from vllm.utils.torch_utils import set_random_seed @pytest.fixture(autouse=True) @@ -63,7 +64,7 @@ def test_mha_attn_forward( head_size: int, device: str, ): - current_platform.seed_everything(0) + set_random_seed(0) # These are expected to be f32 q = torch.randn(batch_size, seq_len, num_heads * head_size, device=device) k = torch.randn(batch_size, seq_len, num_kv_heads * head_size, device=device) diff --git a/tests/v1/worker/test_gpu_model_runner.py b/tests/v1/worker/test_gpu_model_runner.py index 59f1ac705829..4ab5aa66c315 100644 --- a/tests/v1/worker/test_gpu_model_runner.py +++ b/tests/v1/worker/test_gpu_model_runner.py @@ -26,6 +26,7 @@ from vllm.sampling_params import SamplingParams from vllm.utils.mem_constants import GiB_bytes from vllm.utils.system_utils import update_environment_variables +from vllm.utils.torch_utils import set_random_seed from vllm.v1.core.kv_cache_utils import estimate_max_model_len, get_kv_cache_configs from vllm.v1.core.sched.output import CachedRequestData, NewRequestData, SchedulerOutput from vllm.v1.kv_cache_interface import ( @@ -776,7 +777,7 @@ def test_hybrid_attention_mamba_tensor_shapes(): will not corrupt an attention block and vice versa """ - current_platform.seed_everything(42) + set_random_seed(42) update_environment_variables( { diff --git a/tools/ep_kernels/install_python_libraries.sh b/tools/ep_kernels/install_python_libraries.sh index 1bb7fd834523..89da24f95dac 100755 --- a/tools/ep_kernels/install_python_libraries.sh +++ b/tools/ep_kernels/install_python_libraries.sh @@ -6,11 +6,12 @@ set -ex # --mode "install" (default) or "wheel" # --pplx-ref pplx-kernels commit hash # --deepep-ref DeepEP commit hash +# --nvshmem-ver NVSHMEM version CUDA_HOME=${CUDA_HOME:-/usr/local/cuda} PPLX_COMMIT_HASH=${PPLX_COMMIT_HASH:-"12cecfd"} DEEPEP_COMMIT_HASH=${DEEPEP_COMMIT_HASH:-"73b6ea4"} -NVSHMEM_VER=3.3.24 # Suppports both CUDA 12 and 13 +NVSHMEM_VER=${NVSHMEM_VER:-"3.3.24"} # Default supports both CUDA 12 and 13 WORKSPACE=${WORKSPACE:-$(pwd)/ep_kernels_workspace} MODE=${MODE:-install} CUDA_VERSION_MAJOR=$(${CUDA_HOME}/bin/nvcc --version | egrep -o "release [0-9]+" | cut -d ' ' -f 2) @@ -50,6 
+51,18 @@ while [[ $# -gt 0 ]]; do DEEPEP_COMMIT_HASH="$2" shift 2 ;; + --nvshmem-ver) + if [[ -z "$2" || "$2" =~ ^- ]]; then + echo "Error: --nvshmem-ver requires an argument." >&2 + exit 1 + fi + if [[ "$2" =~ / ]]; then + echo "Error: NVSHMEM version should not contain slashes." >&2 + exit 1 + fi + NVSHMEM_VER="$2" + shift 2 + ;; *) echo "Error: Unknown argument '$1'" >&2 exit 1 @@ -57,6 +70,13 @@ while [[ $# -gt 0 ]]; do esac done +# Validate NVSHMEM_VER to prevent path traversal attacks +# Only allow alphanumeric characters, dots, and hyphens (typical version string chars) +if [[ ! "$NVSHMEM_VER" =~ ^[a-zA-Z0-9.-]+$ ]]; then + echo "Error: NVSHMEM_VER contains invalid characters. Only alphanumeric, dots, and hyphens are allowed." >&2 + exit 1 +fi + mkdir -p "$WORKSPACE" WHEEL_DIR="$WORKSPACE/dist" diff --git a/tools/flashinfer-build.sh b/tools/flashinfer-build.sh index 6c14d87348c3..b3cc6c308710 100755 --- a/tools/flashinfer-build.sh +++ b/tools/flashinfer-build.sh @@ -32,9 +32,12 @@ if [[ "${CUDA_VERSION}" == 11.* ]]; then FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9" elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a" +elif [[ "${CUDA_VERSION}" == 12.[8-9]* ]]; then + # CUDA 12.8–12.9 + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 10.3a 12.0" else - # CUDA 12.8+ supports 10.0a and 12.0 - FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0" + # CUDA 13.0+ + FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0f 12.0" fi echo "🏗️ Building FlashInfer AOT for arches: ${FI_TORCH_CUDA_ARCH_LIST}" diff --git a/vllm/_aiter_ops.py b/vllm/_aiter_ops.py index 5820832ed486..b443f773525a 100644 --- a/vllm/_aiter_ops.py +++ b/vllm/_aiter_ops.py @@ -288,7 +288,17 @@ def _check_aiter_mla_fp8_support() -> bool: _AITER_MLA_SUPPORTS_FP8 = ( "q_scale" in sig.parameters and "kv_scale" in sig.parameters ) - except Exception: + except ( + ImportError, + ModuleNotFoundError, + AttributeError, + ValueError, + TypeError, + ): + # ImportError/ModuleNotFoundError: aiter.mla module not available + # AttributeError: mla_decode_fwd doesn't exist + # ValueError: mla_decode_fwd has no signature (e.g., built-in) + # TypeError: mla_decode_fwd is not a callable _AITER_MLA_SUPPORTS_FP8 = False return _AITER_MLA_SUPPORTS_FP8 @@ -1374,14 +1384,14 @@ def triton_rotary_embed( key_ = key[..., :rotary_dim] positions = positions.view(*query.shape[:1]) rope_cached_thd_positions_2c_fwd_inplace( - positions, - sin, - cos, query_, key_, + cos, + sin, + positions, rotate_style, reuse_freqs_front_part=True, - is_nope_first=False, + nope_first=False, ) query = query.view(query_shape) key = key.view(key_shape) diff --git a/vllm/assets/image.py b/vllm/assets/image.py index c1a0f2b9cc29..a91eb7d4b67d 100644 --- a/vllm/assets/image.py +++ b/vllm/assets/image.py @@ -42,8 +42,11 @@ def get_path(self, ext: str) -> Path: ) @property - def pil_image(self, ext="jpg") -> Image.Image: - image_path = self.get_path(ext) + def pil_image(self) -> Image.Image: + return self.pil_image_ext(ext="jpg") + + def pil_image_ext(self, ext: str) -> Image.Image: + image_path = self.get_path(ext=ext) return Image.open(image_path) @property diff --git a/vllm/attention/layer.py b/vllm/attention/layer.py index a88544c1c0f9..a09666b65a99 100644 --- a/vllm/attention/layer.py +++ b/vllm/attention/layer.py @@ -357,8 +357,11 @@ def forward( if self.use_output: if output_shape is None: + # Handle both 2D [num_tokens, hidden] and + # 3D [num_tokens, heads, head_dim] query + num_tokens = query.shape[0] output_shape = torch.Size( - 
(*query.shape[:-1], self.num_heads * self.head_size_v) + (num_tokens, self.num_heads * self.head_size_v) ) output_shape = output_shape if output_shape is not None else query.shape output = torch.empty(output_shape, dtype=output_dtype, device=query.device) diff --git a/vllm/attention/layers/chunked_local_attention.py b/vllm/attention/layers/chunked_local_attention.py index 7e3794d40833..1bd50de80003 100644 --- a/vllm/attention/layers/chunked_local_attention.py +++ b/vllm/attention/layers/chunked_local_attention.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import functools +from dataclasses import replace import torch @@ -10,11 +11,12 @@ from vllm.config import CacheConfig from vllm.config.vllm import VllmConfig from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.triton_utils import tl, triton from vllm.v1.attention.backends.utils import ( AttentionCGSupport, AttentionMetadataBuilder, CommonAttentionMetadata, - make_local_attention_virtual_batches, + make_lazy_sync_tensor_property, subclass_attention_backend, ) from vllm.v1.kv_cache_interface import ( @@ -23,6 +25,164 @@ KVCacheSpec, ) +""" +Chunked Local Attention Implementation + +Implements chunked local attention by splitting sequences into "virtual batches" +- one per attention chunk. Each virtual batch attends only to tokens within its +chunk, enabling chunked local attention without modifying the underlying backend. + +For example, if performing a chunked prefill on a batch of 3 sequences: + q_seqlens = [4, 10, 5] + kv_seqlens = [6, 17, 9] + +For regular attention, batch idx 0 (q_seqlens=4, kv_seqlens=6) would use mask: + k_toks > 0 1 2 3 4 5 + q_toks v _____________ + 0 | 1 1 1 + 1 | 1 1 1 1 + 2 | 1 1 1 1 1 + 3 | 1 1 1 1 1 1 + +For chunked local attention (attn_chunk_size=4), the mask becomes: + k_toks > 0 1 2 3 4 5 + q_toks v _____________ + 0 | 1 1 1 + 1 | 1 1 1 1 + 2 | 1 + 3 | 1 1 + +We simulate this by breaking sequences into virtual batches, each covering one +attention chunk. 
Batch idx 0 becomes: + + virtual batch 0 (q_seqlens=2, kv_seqlens=4): + k_toks > 0 1 2 3 + q_toks v _____________ + 0 | 1 1 1 + 1 | 1 1 1 1 + + virtual batch 1 (q_seqlens=2, kv_seqlens=2): + k_toks > 4 5 + q_toks v _____________ + 2 | 1 + 3 | 1 1 +""" + + +@torch.compile(dynamic=True) +def _compute_cu_num_vb( + query_start_loc: torch.Tensor, + seq_lens: torch.Tensor, + chunk: int, +) -> torch.Tensor: + """Compute cumulative virtual batches per request (fused via torch.compile).""" + q_seqlens = query_start_loc[1:] - query_start_loc[:-1] + context_lens = seq_lens - q_seqlens + space_in_first_chunk = chunk - context_lens % chunk + q_in_first_chunk = torch.minimum(space_in_first_chunk, q_seqlens) + q_in_remaining_chunks = q_seqlens - q_in_first_chunk + num_vb_per_req = 1 + (q_in_remaining_chunks + chunk - 1) // chunk + # Prepend 0 and compute cumsum for output offsets + cu_num_vb = torch.zeros( + seq_lens.shape[0] + 1, dtype=torch.int32, device=seq_lens.device + ) + cu_num_vb[1:] = torch.cumsum(num_vb_per_req, dim=0) + return cu_num_vb + + +@triton.jit +def _compute_virtual_batches_attn_metadata_kernel( + # Inputs + query_start_loc_ptr, # [batch_size + 1] + seq_lens_ptr, # [batch_size] + cu_num_vb_ptr, # [batch_size + 1] - cumsum of virtual batches per request + block_table_ptr, # [batch_size, max_blocks_per_seq] + # Outputs + seqlens_k_ptr, # [num_vb_ub] - virtual batch kv seqlens + cu_seqlens_q_ptr, # [num_vb_ub + 1] - cumulative query seqlens + virtual_batches_block_table_ptr, # [num_vb_ub * pages_per_virtual_batch] + batch_mapping_ptr, # [num_vb_ub] - maps vb -> original batch + block_indices_ptr, # [num_vb_ub * pages_per_virtual_batch] - block indices + # Sizes + batch_size, + max_blocks_per_seq, + # Constants + ATTN_CHUNK_SIZE: tl.constexpr, + BLOCK_SIZE: tl.constexpr, + PAGES_PER_VIRTUAL_BATCH: tl.constexpr, + MAX_VIRTUAL_BATCHES: tl.constexpr, # Max virtual batches per request +): + batch_idx = tl.program_id(0) + if batch_idx >= batch_size: + return + + # Load batch boundaries and sequence data + output_start = tl.load(cu_num_vb_ptr + batch_idx) + output_end = tl.load(cu_num_vb_ptr + batch_idx + 1) + num_vb = output_end - output_start + + q_start = tl.load(query_start_loc_ptr + batch_idx) + q_end = tl.load(query_start_loc_ptr + batch_idx + 1) + q_seqlen = q_end - q_start + kv_seqlen = tl.load(seq_lens_ptr + batch_idx) + + # Compute q_tokens_in_first_block + context_len = kv_seqlen - q_seqlen + remainder = context_len % ATTN_CHUNK_SIZE + space_in_first = tl.where( + remainder == 0, ATTN_CHUNK_SIZE, ATTN_CHUNK_SIZE - remainder + ) + q_first = tl.minimum(space_in_first, q_seqlen) + + # Compute tokens_in_last_block + last_remainder = kv_seqlen % ATTN_CHUNK_SIZE + tokens_last = tl.where(last_remainder == 0, ATTN_CHUNK_SIZE, last_remainder) + + # Running sum for cu_seqlens_q (base offset is q_start from query_start_loc) + cu_q_running = q_start + + # Loop over virtual batches for this request (use mask instead of break) + for vb_local_idx in range(MAX_VIRTUAL_BATCHES): + valid = vb_local_idx < num_vb + vb_idx = output_start + vb_local_idx + + # Compute seqlen_q + is_first = vb_local_idx == 0 + consumed = tl.where(vb_local_idx > 0, (vb_local_idx - 1) * ATTN_CHUNK_SIZE, 0) + remaining = q_seqlen - q_first - consumed + seqlen_q = tl.where( + is_first, q_first, tl.minimum(tl.maximum(remaining, 0), ATTN_CHUNK_SIZE) + ) + + # Compute seqlen_k (0 for padding entries where kv_seqlen=0) + is_last = vb_local_idx == num_vb - 1 + seqlen_k = tl.where( + kv_seqlen > 0, tl.where(is_last, tokens_last, 
ATTN_CHUNK_SIZE), 0 + ) + + # Compute block_start_idx for block table + rarange = num_vb - vb_local_idx - 1 + k_seqstart = kv_seqlen - (rarange * ATTN_CHUNK_SIZE + tokens_last) + k_seqstart = tl.maximum(k_seqstart, 0) + block_start_idx = k_seqstart // BLOCK_SIZE + + # Store outputs (masked) + tl.store(seqlens_k_ptr + vb_idx, seqlen_k, mask=valid) + tl.store(batch_mapping_ptr + vb_idx, batch_idx, mask=valid) + + # Update and store cu_seqlens_q + cu_q_running = tl.where(valid, cu_q_running + seqlen_q, cu_q_running) + tl.store(cu_seqlens_q_ptr + vb_idx + 1, cu_q_running, mask=valid) + + # Store block table entries and indices (masked) + for page_idx in range(PAGES_PER_VIRTUAL_BATCH): + flat_idx = vb_idx * PAGES_PER_VIRTUAL_BATCH + page_idx + block_idx = tl.minimum(block_start_idx + page_idx, max_blocks_per_seq - 1) + src_idx = batch_idx * max_blocks_per_seq + block_idx + block_val = tl.load(block_table_ptr + src_idx, mask=valid, other=0) + tl.store(virtual_batches_block_table_ptr + flat_idx, block_val, mask=valid) + tl.store(block_indices_ptr + flat_idx, block_idx, mask=valid) + @functools.lru_cache def create_chunked_local_attention_backend( @@ -36,15 +196,87 @@ def create_chunked_local_attention_backend( assert issubclass(underlying_builder, AttentionMetadataBuilder) class ChunkedLocalAttentionBuilder(underlying_builder): # type: ignore + supports_update_block_table: bool = True + + def __init__( + self, + kv_cache_spec: AttentionSpec, + layer_names: list[str], + vllm_config: VllmConfig, + device: torch.device, + ): + # Compute loose, upper bound on number of virtual batches + # for persistent buffer allocation + max_num_seqs = vllm_config.scheduler_config.max_num_seqs + sched = vllm_config.scheduler_config + max_num_batched_tokens = sched.max_num_batched_tokens + max_vb_per_req = ( + 1 + + (max_num_batched_tokens + attention_chunk_size - 2) + // attention_chunk_size + ) + num_vb_ub = max_num_seqs * max_vb_per_req + pages_per_virtual_batch = attention_chunk_size // block_size + + # Create modified config with num_vb_ub as max_num_seqs so the + # underlying builder allocates buffers large enough for virtual + # batches (required for CUDA graph support so the underlying builder + # can allocate persistent buffers large enough for virtual batches) + # Also bump max_num_batched_tokens if needed to satisfy the + # max_num_batched_tokens >= max_num_seqs validation + modified_max_batched = max(max_num_batched_tokens, num_vb_ub) + modified_scheduler_config = replace( + vllm_config.scheduler_config, + max_model_len=vllm_config.model_config.max_model_len, + is_encoder_decoder=vllm_config.model_config.is_encoder_decoder, + max_num_seqs=num_vb_ub, + max_num_batched_tokens=modified_max_batched, + ) + modified_vllm_config = replace( + vllm_config, scheduler_config=modified_scheduler_config + ) + + # Call parent __init__ with modified config + super().__init__(kv_cache_spec, layer_names, modified_vllm_config, device) + + # Store for use in build() + self._pages_per_virtual_batch = pages_per_virtual_batch + + # Pre-allocate persistent buffers for virtual batch metadata + self._virtual_seqlens = torch.zeros( + num_vb_ub, dtype=torch.int32, device=device + ) + self._cu_virtual_seqlens_q = torch.zeros( + num_vb_ub + 1, dtype=torch.int32, device=device + ) + self._virtual_batch_to_batch_mapping = torch.zeros( + num_vb_ub, dtype=torch.int32, device=device + ) + self._virtual_batches_block_table = torch.zeros( + (num_vb_ub, pages_per_virtual_batch), + dtype=torch.int32, + device=device, + ) + 
self._virtual_batch_block_indices = torch.zeros( + (num_vb_ub, pages_per_virtual_batch), + dtype=torch.int32, + device=device, + ) + + # Pinned memory buffer for async GPU->CPU copy of cu_virtual_seqlens_q + self._cu_virtual_seqlens_q_cpu = torch.zeros( + num_vb_ub + 1, dtype=torch.int32, pin_memory=True + ) + @classmethod def get_cudagraph_support( cls: type["AttentionMetadataBuilder"], vllm_config: VllmConfig, kv_cache_spec: AttentionSpec, ) -> AttentionCGSupport: - # Explicit override in case the underlying builder specialized this getter. - # @override omitted only because of mypy limitation due to type variable. - return AttentionCGSupport.NEVER + # Support UNIFORM_BATCH for FULL CGs (decode with uniform q_len=1) + # Each request produces exactly 1 virtual batch, so num_vb = batch_size + return AttentionCGSupport.UNIFORM_BATCH def build( self, @@ -52,18 +284,126 @@ def build( common_attn_metadata: CommonAttentionMetadata, fast_build: bool = False, ): - cm, make_virtual_batches_block_table = make_local_attention_virtual_batches( - attention_chunk_size, common_attn_metadata, block_size + batch_size = common_attn_metadata.num_reqs + block_table = common_attn_metadata.block_table_tensor + query_start_loc = common_attn_metadata.query_start_loc + seq_lens = common_attn_metadata.seq_lens + query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu + chunk = attention_chunk_size + + # Compute num_vb_ub from CPU data (no GPU sync) + # N query tokens can span at most: 1 + (N + chunk - 2) // chunk + q_seqlens_cpu = (query_start_loc_cpu[1:] - query_start_loc_cpu[:-1]).numpy() + num_vb_per_req_ub = 1 + (q_seqlens_cpu + chunk - 2) // chunk + num_vb_ub = int(num_vb_per_req_ub.sum()) + max_vb_per_req = max(1, int(num_vb_per_req_ub.max())) + + # Compute cumulative virtual batches per request on GPU + cu_num_vb = _compute_cu_num_vb(query_start_loc, seq_lens, chunk) + + # Get max_blocks_per_seq from actual block_table shape + max_blocks_per_seq = block_table.shape[1] + pages_per_vb = self._pages_per_virtual_batch + + # Zero buffers before kernel to clear stale data from previous + # calls (kernel uses masked writes, stale data may remain) + self._virtual_batch_to_batch_mapping[:num_vb_ub].zero_() + self._virtual_batch_block_indices[:num_vb_ub, :pages_per_vb].zero_() + + _compute_virtual_batches_attn_metadata_kernel[(batch_size,)]( + query_start_loc, + seq_lens, + cu_num_vb, + block_table, + self._virtual_seqlens, + self._cu_virtual_seqlens_q, + self._virtual_batches_block_table, + self._virtual_batch_to_batch_mapping, + self._virtual_batch_block_indices, + batch_size, + max_blocks_per_seq, + ATTN_CHUNK_SIZE=chunk, + BLOCK_SIZE=block_size, + PAGES_PER_VIRTUAL_BATCH=pages_per_vb, + MAX_VIRTUAL_BATCHES=max_vb_per_req, ) + + # Pad cu_virtual_seqlens_q for FULL CG (must be monotonic) + total_tokens = int(query_start_loc_cpu[-1]) + self._cu_virtual_seqlens_q[num_vb_ub + 1 :].fill_(total_tokens) + + # Compute query_start_loc_cpu for virtual batches. + # We handle two cases differently to avoid CPU<>GPU sync: + # + # 1. Uniform single token decode case (max_q_len == 1): + # Each request has exactly 1 query token, so each produces exactly + # 1 virtual batch. Therefore cu_virtual_seqlens = [0, 1, 2, ..., N] + # which is identical to input query_start_loc_cpu. Reuse it. + # + # 2. Spec-decode / Prefill case (max_q_len > 1): + # Requests may span multiple chunks, so we need GPU-computed values. 
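+            #    (Editor's note, worked example with hypothetical numbers:
+            #    with chunk = 4, a request with context_len = 2 and
+            #    q_len = 10 puts min(4 - 2, 10) = 2 tokens in its partially
+            #    filled first chunk and the remaining 8 tokens in
+            #    ceil(8 / 4) = 2 further chunks, i.e. 3 virtual batches,
+            #    so the virtual cu_seqlens no longer match the per-request
+            #    query_start_loc_cpu.)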
+ # We use non-blocking D2H copy + lazy sync: the CPU tensor is wrapped + # so accessing it synchronizes exactly once on first access. This way + # backends like FlashAttn (which don't use query_start_loc_cpu) never + # block, while backends that do block only when actually accessing it. + max_q_len = common_attn_metadata.max_query_len + use_lazy_sync = max_q_len > 1 + + if use_lazy_sync: + # Async copy to pinned memory - will sync lazily on access + cpu_buf = self._cu_virtual_seqlens_q_cpu[: num_vb_ub + 1] + cpu_buf.copy_( + self._cu_virtual_seqlens_q[: num_vb_ub + 1], non_blocking=True + ) + sync_event = torch.cuda.Event() + sync_event.record() + cu_virtual_seqlens_q_cpu = cpu_buf + else: + # Uniform decode: reuse input directly (no copy needed) + cu_virtual_seqlens_q_cpu = query_start_loc_cpu + + # Build metadata with virtual batch tensors + cm = CommonAttentionMetadata( + query_start_loc=self._cu_virtual_seqlens_q[: num_vb_ub + 1], + query_start_loc_cpu=cu_virtual_seqlens_q_cpu, + seq_lens=self._virtual_seqlens[:num_vb_ub], + num_reqs=num_vb_ub, + num_actual_tokens=common_attn_metadata.num_actual_tokens, + max_query_len=chunk, + max_seq_len=chunk, + block_table_tensor=self._virtual_batches_block_table[:num_vb_ub], + slot_mapping=common_attn_metadata.slot_mapping, + causal=True, + ) + + # Wrap CPU tensor with lazy sync if needed + if use_lazy_sync: + make_lazy_sync_tensor_property( + cm, "query_start_loc_cpu", cpu_buf, sync_event + ) + metadata = super().build(common_prefix_len, cm, fast_build) - metadata.make_virtual_batches_block_table = make_virtual_batches_block_table + + # Clone indices onto metadata so they're stable for + # update_block_table (different layers have different builders) + metadata._virtual_batch_to_batch_mapping = ( + self._virtual_batch_to_batch_mapping[:num_vb_ub].clone() + ) + # Only keep the columns we actually use (clamped to max_blocks_per_seq) + metadata._virtual_batch_block_indices = self._virtual_batch_block_indices[ + :num_vb_ub, :pages_per_vb + ].clone() return metadata def update_block_table( self, metadata, blk_table: torch.Tensor, slot_mapping: torch.Tensor ): - blk_table = metadata.make_virtual_batches_block_table(blk_table) - return super().update_block_table(metadata, blk_table, slot_mapping) + # Use cloned indices stored on metadata (stable across builders) + new_block_table = blk_table[ + metadata._virtual_batch_to_batch_mapping.unsqueeze(1), + metadata._virtual_batch_block_indices, + ] + return super().update_block_table(metadata, new_block_table, slot_mapping) attn_backend = subclass_attention_backend( name_prefix=prefix, diff --git a/vllm/attention/layers/cross_attention.py b/vllm/attention/layers/cross_attention.py index cfd203bdd37b..f58c9d541775 100644 --- a/vllm/attention/layers/cross_attention.py +++ b/vllm/attention/layers/cross_attention.py @@ -149,16 +149,20 @@ def __init__( kv_cache_dtype = "auto" block_size = 16 - underlying_attn_backend = get_attn_backend( - head_size, dtype, kv_cache_dtype, block_size - ) - attn_backend = create_cross_attention_backend(underlying_attn_backend) - if attn_type is not None: assert attn_type == AttentionType.ENCODER_DECODER, ( "CrossAttention only supports AttentionType.ENCODER_DECODER" ) + underlying_attn_backend = get_attn_backend( + head_size, + dtype, + kv_cache_dtype, + block_size, + attn_type=AttentionType.ENCODER_DECODER, + ) + attn_backend = create_cross_attention_backend(underlying_attn_backend) + super().__init__( num_heads=num_heads, head_size=head_size, diff --git 
a/vllm/attention/ops/triton_prefill_attention.py b/vllm/attention/ops/triton_prefill_attention.py new file mode 100644 index 000000000000..ae7332830c44 --- /dev/null +++ b/vllm/attention/ops/triton_prefill_attention.py @@ -0,0 +1,271 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Adapted from +# https://github.com/sgl-project/sglang/blob/97cb762bb65ebf05025eb342de03c184660427a3/python/sglang/srt/layers/attention/triton_ops/prefill_attention.py +# Changes: +# - Add support for sliding window attention + +# Copyright 2023-2024 SGLang Team +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +""" +Memory-efficient attention for prefill. +It supports page size = 1. +""" + +# Adapted from +# https://github.com/ModelTC/lightllm/blob/f2a54f0912293f683bf1d1695fd12c4098a5bf82/lightllm/models/llama/triton_kernel/context_flashattention_nopad.py#L1 +import torch + +from vllm.platforms import current_platform +from vllm.triton_utils import tl, triton + + +@triton.jit +def _fwd_kernel( + Q, + K, + V, + sm_scale, + B_Start_Loc, + B_Seqlen, + Out, + stride_qbs, + stride_qh, + stride_kbs, + stride_kh, + stride_vbs, + stride_vh, + stride_obs, + stride_oh, + kv_group_num: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_DMODEL: tl.constexpr, + BLOCK_N: tl.constexpr, + IS_CAUSAL: tl.constexpr, + SLIDING_WINDOW_Q: tl.constexpr, + SLIDING_WINDOW_K: tl.constexpr, + Lk: tl.constexpr, +): + cur_batch = tl.program_id(0) + cur_head = tl.program_id(1) + start_m = tl.program_id(2) + + cur_kv_head = cur_head // kv_group_num + + cur_batch_seq_len = tl.load(B_Seqlen + cur_batch) + cur_batch_in_all_start_index = tl.load(B_Start_Loc + cur_batch) + + block_start_loc = BLOCK_M * start_m + + # initialize offsets + offs_n = tl.arange(0, BLOCK_N) + offs_d = tl.arange(0, BLOCK_DMODEL) + offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M) + off_q = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_qbs + + cur_head * stride_qh + + offs_d[None, :] + ) + off_k = offs_n[None, :] * stride_kbs + cur_kv_head * stride_kh + offs_d[:, None] + off_v = offs_n[:, None] * stride_vbs + cur_kv_head * stride_vh + offs_d[None, :] + + mask_d = offs_d < Lk + + q = tl.load( + Q + off_q, + mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :]), + other=0.0, + ) + + k_ptrs = K + off_k + v_ptrs = V + off_v + + # initialize pointer to m and l + m_i = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + l_i = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_DMODEL], dtype=tl.float32) + + block_mask = tl.where(block_start_loc < cur_batch_seq_len, 1, 0) + + # Calculate the end position for attention computation + end_n = cur_batch_seq_len + + # Apply causal attention pruning and sliding window attention pruning + end_n = tl.minimum(end_n, (start_m + 1) * BLOCK_M) if IS_CAUSAL else end_n + + # Calculate the start position for backward sliding window + start_n_limit = 0 + 
end_n_limit = block_mask * end_n + + for start_n in range(start_n_limit, end_n_limit, BLOCK_N): + start_n = tl.multiple_of(start_n, BLOCK_N) + # -- compute qk ---- + k = tl.load( + k_ptrs + (cur_batch_in_all_start_index + start_n) * stride_kbs, + mask=((start_n + offs_n[None, :]) < cur_batch_seq_len) & (mask_d[:, None]), + other=0.0, + ) + + # Apply attention mask (causal + bidirectional sliding window) + # Position indices in the sequence + pos_q = offs_m[:, None] # Query positions [BLOCK_M, 1] + pos_k = start_n + offs_n[None, :] # Key positions [1, BLOCK_N] + + # Valid sequence mask + mask = pos_k < cur_batch_seq_len + # Causal mask + if IS_CAUSAL: + mask &= pos_q >= pos_k + + # Bidirectional sliding window masks + sliding_mask_q = ( + pos_q - pos_k <= SLIDING_WINDOW_Q if SLIDING_WINDOW_Q > 0 else None + ) + sliding_mask_k = ( + pos_k - pos_q <= SLIDING_WINDOW_K if SLIDING_WINDOW_K > 0 else None + ) + if sliding_mask_q is not None: + mask &= sliding_mask_q + if sliding_mask_k is not None: + mask &= sliding_mask_k + + qk = tl.zeros([BLOCK_M, BLOCK_N], dtype=tl.float32) + qk += tl.where(mask, 0, float("-inf")) + qk += tl.dot(q, k) + qk *= sm_scale + + # -- compute m_ij, p, l_ij + m_ij = tl.max(qk, 1) + # For sliding window there's a chance the max is -inf due to masking of + # the entire row. In this case we need to set m_j 0 to avoid NaN + m_ij_valid_mask = m_ij > float("-inf") + m_ij_masked = tl.where(m_ij_valid_mask, m_ij, 0.0) + # -- compute p and l_ij -- + p = tl.exp(qk - m_ij_masked[:, None]) + l_ij = tl.sum(p, 1) + # -- update m_i and l_i + m_i_new = tl.maximum(m_i, m_ij) + m_i_new_mask = m_i_new > float("-inf") + alpha = tl.exp(m_i - m_i_new) + beta = tl.exp(m_ij - m_i_new) + # mask alpha and beta for sliding window + alpha = tl.where(m_i_new_mask, alpha, 1.0) + beta = tl.where(m_i_new_mask, beta, 0.0) + l_i_new = alpha * l_i + beta * l_ij + # -- update output accumulator -- + # scale p + # For sliding window there's a chance the l_i_new is 0 due to masking + # the entire row. 
We need to set l_i_new 1 to avoid zero division + l_i_new_mask = (l_i_new != 0.0) & (m_i_new_mask > float("-inf")) + l_i_new_safe = tl.where(l_i_new_mask, l_i_new, 1.0) + p_scale = beta / l_i_new_safe + p = p * p_scale[:, None] + # scale acc + acc_scale = l_i / l_i_new_safe * alpha + acc = acc * acc_scale[:, None] + # update acc + v = tl.load( + v_ptrs + (cur_batch_in_all_start_index + start_n) * stride_vbs, + mask=((start_n + offs_n[:, None]) < cur_batch_seq_len) & (mask_d[None, :]), + other=0.0, + ) + + p = p.to(v.dtype) + acc += tl.dot(p, v) + # update m_i and l_i + l_i = l_i_new + m_i = m_i_new + # initialize pointers to output + off_o = ( + (cur_batch_in_all_start_index + offs_m[:, None]) * stride_obs + + cur_head * stride_oh + + offs_d[None, :] + ) + out_ptrs = Out + off_o + tl.store( + out_ptrs, acc, mask=(offs_m[:, None] < cur_batch_seq_len) & (mask_d[None, :]) + ) + + +def get_block_size(dtype: torch.dtype) -> int: + if dtype == torch.float32: + return 32 + elif ( + current_platform.is_cuda_alike() + ) and current_platform.get_device_capability().major > 8: + return 128 + else: + return 64 + + +def context_attention_fwd( + q, + k, + v, + o, + b_start_loc, + b_seq_len, + max_input_len, + is_causal=True, + sliding_window_q=None, + sliding_window_k=None, +): + """ + q, k, v: [b * s, head, head_dim] + b_start_loc: [b] + b_seq_len: [b] + out: [b * s, head, head_dim] + """ + BLOCK = get_block_size(q.dtype) + + Lq, Lk, _ = q.shape[-1], k.shape[-1], v.shape[-1] + + sm_scale = 1.0 / (Lq**0.5) + batch, head = b_seq_len.shape[0], q.shape[1] + kv_group_num = q.shape[1] // k.shape[1] + + grid = (batch, head, triton.cdiv(max_input_len, BLOCK)) + num_warps = 4 if Lk <= 64 else 8 + + sliding_window_q = sliding_window_q if sliding_window_q is not None else 0 + sliding_window_k = sliding_window_k if sliding_window_k is not None else 0 + + _fwd_kernel[grid]( + q, + k, + v, + sm_scale, + b_start_loc, + b_seq_len, + o, + q.stride(0), + q.stride(1), + k.stride(0), + k.stride(1), + v.stride(0), + v.stride(1), + o.stride(0), + o.stride(1), + kv_group_num=kv_group_num, + BLOCK_M=BLOCK, + BLOCK_DMODEL=triton.next_power_of_2(Lk), + BLOCK_N=BLOCK, + IS_CAUSAL=is_causal, + SLIDING_WINDOW_Q=sliding_window_q, + SLIDING_WINDOW_K=sliding_window_k, + num_warps=num_warps, + num_stages=1, + Lk=Lk, + ) diff --git a/vllm/benchmarks/datasets.py b/vllm/benchmarks/datasets.py index 067e31f4303b..a90ad0c23bc6 100644 --- a/vllm/benchmarks/datasets.py +++ b/vllm/benchmarks/datasets.py @@ -1437,19 +1437,97 @@ def add_dataset_parser(parser: FlexibleArgumentParser): ) random_group = parser.add_argument_group("random dataset options") - random_group.add_argument( + add_random_dataset_base_args(random_group) + + random_mm_group = parser.add_argument_group( + "random multimodal dataset options extended from random dataset" + ) + add_random_multimodal_dataset_args(random_mm_group) + + hf_group = parser.add_argument_group("hf dataset options") + hf_group.add_argument( + "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + ) + hf_group.add_argument( + "--hf-split", type=str, default=None, help="Split of the HF dataset." + ) + hf_group.add_argument( + "--hf-name", + type=str, + default=None, + help=( + "Name of the dataset on HuggingFace " + "(e.g., 'lmarena-ai/VisionArena-Chat'). " + "Specify this if your dataset-path is a local path." + ), + ) + hf_group.add_argument( + "--hf-output-len", + type=int, + default=None, + help="Output length for each request. 
Overrides the output lengths " + "from the sampled HF dataset.", + ) + + prefix_repetition_group = parser.add_argument_group( + "prefix repetition dataset options" + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-prefix-len", + type=int, + default=256, + help="Number of prefix tokens per request, used only for prefix " + "repetition dataset.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-suffix-len", + type=int, + default=256, + help="Number of suffix tokens per request, used only for prefix " + "repetition dataset. Total input length is prefix_len + suffix_len.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-num-prefixes", + type=int, + default=10, + help="Number of prefixes to generate, used only for prefix repetition " + "dataset. Prompts per prefix is num_requests // num_prefixes.", + ) + prefix_repetition_group.add_argument( + "--prefix-repetition-output-len", + type=int, + default=128, + help="Number of output tokens per request, used only for prefix " + "repetition dataset.", + ) + + +def add_random_dataset_base_args( + parser_or_group: FlexibleArgumentParser | argparse._ArgumentGroup, +) -> None: + """Add CLI arguments for base random dataset options. + + This function adds arguments needed for: + - random (random dataset) + - random-mm (random multimodal dataset) + - random-rerank (random dataset for reranking) + + Args: + parser_or_group: Either a parser or an argument group to add arguments to. + """ + parser_or_group.add_argument( "--random-input-len", type=int, default=1024, help="Number of input tokens per request, used only for random sampling.", ) - random_group.add_argument( + parser_or_group.add_argument( "--random-output-len", type=int, default=128, help="Number of output tokens per request, used only for random sampling.", ) - random_group.add_argument( + parser_or_group.add_argument( "--random-range-ratio", type=float, default=0.0, @@ -1458,7 +1536,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "a symmetric sampling range" "[length * (1 - range_ratio), length * (1 + range_ratio)].", ) - random_group.add_argument( + parser_or_group.add_argument( "--random-prefix-len", type=int, default=0, @@ -1471,13 +1549,13 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "input_len * (1 + range_ratio)]." ), ) - random_group.add_argument( + parser_or_group.add_argument( "--random-batch-size", type=int, default=1, help=("Batch size for random sampling. Only used for embeddings benchmark."), ) - random_group.add_argument( + parser_or_group.add_argument( "--no-reranker", action="store_true", help=( @@ -1486,11 +1564,19 @@ def add_dataset_parser(parser: FlexibleArgumentParser): ), ) - # random multimodal dataset options - random_mm_group = parser.add_argument_group( - "random multimodal dataset options extended from random dataset" - ) - random_mm_group.add_argument( + +def add_random_multimodal_dataset_args( + parser_or_group: FlexibleArgumentParser | argparse._ArgumentGroup, +) -> None: + """Add CLI arguments for random multimodal dataset options. + + This function adds arguments needed for: + - random-mm (random multimodal dataset) + + Args: + parser_or_group: Either a parser or an argument group to add arguments to. + """ + parser_or_group.add_argument( "--random-mm-base-items-per-request", type=int, default=RandomMultiModalDataset.DEFAULT_BASE_ITEMS_PER_REQUEST, @@ -1500,7 +1586,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "--random-mm-num-mm-items-range-ratio." 
), ) - random_mm_group.add_argument( + parser_or_group.add_argument( "--random-mm-num-mm-items-range-ratio", type=float, default=RandomMultiModalDataset.DEFAULT_NUM_MM_ITEMS_RANGE_RATIO, @@ -1515,7 +1601,7 @@ def add_dataset_parser(parser: FlexibleArgumentParser): "An error is raised if the computed min exceeds the max." ), ) - random_mm_group.add_argument( + parser_or_group.add_argument( "--random-mm-limit-mm-per-prompt", type=json.loads, default=RandomMultiModalDataset.DEFAULT_LIMIT_MM_PER_PROMPT, @@ -1559,7 +1645,7 @@ def normalize(d: dict) -> dict[tuple[int, int, int], float]: return normalize(parsed) raise ValueError("Unsupported value for --random-mm-bucket-config.") - random_mm_group.add_argument( + parser_or_group.add_argument( "--random-mm-bucket-config", type=_parse_mm_bucket_config, default=RandomMultiModalDataset.DEFAULT_MM_ITEM_BUCKET_CONFIG, @@ -1580,63 +1666,6 @@ def normalize(d: dict) -> dict[tuple[int, int, int], float]: ), ) - hf_group = parser.add_argument_group("hf dataset options") - hf_group.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." - ) - hf_group.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." - ) - hf_group.add_argument( - "--hf-name", - type=str, - default=None, - help=( - "Name of the dataset on HuggingFace " - "(e.g., 'lmarena-ai/VisionArena-Chat'). " - "Specify this if your dataset-path is a local path." - ), - ) - hf_group.add_argument( - "--hf-output-len", - type=int, - default=None, - help="Output length for each request. Overrides the output lengths " - "from the sampled HF dataset.", - ) - - prefix_repetition_group = parser.add_argument_group( - "prefix repetition dataset options" - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-prefix-len", - type=int, - default=256, - help="Number of prefix tokens per request, used only for prefix " - "repetition dataset.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-suffix-len", - type=int, - default=256, - help="Number of suffix tokens per request, used only for prefix " - "repetition dataset. Total input length is prefix_len + suffix_len.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-num-prefixes", - type=int, - default=10, - help="Number of prefixes to generate, used only for prefix repetition " - "dataset. Prompts per prefix is num_requests // num_prefixes.", - ) - prefix_repetition_group.add_argument( - "--prefix-repetition-output-len", - type=int, - default=128, - help="Number of output tokens per request, used only for prefix " - "repetition dataset.", - ) - def get_samples(args, tokenizer: TokenizerLike) -> list[SampleRequest]: if not hasattr(args, "request_id_prefix"): diff --git a/vllm/benchmarks/mm_processor.py b/vllm/benchmarks/mm_processor.py new file mode 100644 index 000000000000..1e65a2553935 --- /dev/null +++ b/vllm/benchmarks/mm_processor.py @@ -0,0 +1,363 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +r"""Benchmark multimodal processor latency. + +This benchmark measures the latency of the mm processor module +using multimodal prompts from datasets. +MM processor stats are automatically enabled. 
+ +Run: + vllm bench mm-processor \ + --model \ + --dataset-name random-mm \ + --num-prompts 10 \ +""" + +import argparse +import dataclasses +import json +import time +from datetime import datetime +from typing import Any + +import numpy as np + +from vllm.benchmarks.throughput import get_requests +from vllm.engine.arg_utils import EngineArgs +from vllm.multimodal.processing import ( + get_timing_stats_from_engine_client, +) +from vllm.utils.gc_utils import freeze_gc_heap +from vllm.utils.import_utils import PlaceholderModule + +try: + import pandas as pd +except ImportError: + pd = PlaceholderModule("pandas") + + +def collect_mm_processor_stats( + llm_engine: Any, +) -> dict[str, list[float]]: + """ + Collect multimodal processor timing stats. + Returns a dictionary mapping stage names to lists of timing values (in seconds). + """ + all_stats = get_timing_stats_from_engine_client(llm_engine) + + stats_by_stage = { + "hf_processor_time": [], + "hashing_time": [], + "cache_lookup_time": [], + "prompt_update_time": [], + "total_time": [], + } + + for stats_dict in all_stats.values(): + stats_by_stage["hf_processor_time"].append( + stats_dict.get("hf_processor_time", 0.0) + ) + stats_by_stage["hashing_time"].append(stats_dict.get("hashing_time", 0.0)) + stats_by_stage["cache_lookup_time"].append( + stats_dict.get("cache_lookup_time", 0.0) + ) + stats_by_stage["prompt_update_time"].append( + stats_dict.get("prompt_update_time", 0.0) + ) + stats_by_stage["total_time"].append(stats_dict.get("total_time", 0.0)) + + return stats_by_stage + + +def calculate_mm_processor_metrics( + stats_by_stage: dict[str, list[float]], + selected_percentiles: list[float], +) -> dict[str, dict[str, float]]: + """ + Calculate aggregate metrics from stats by stage. + """ + metrics = {} + + for stage_name, times in stats_by_stage.items(): + if not times: + metrics[stage_name] = { + "mean": 0.0, + "median": 0.0, + "std": 0.0, + **{f"p{p}": 0.0 for p in selected_percentiles}, + } + continue + + times_ms = [t * 1000 for t in times] + metrics[stage_name] = { + "mean": float(np.mean(times_ms)), + "median": float(np.median(times_ms)), + "std": float(np.std(times_ms)), + **{ + f"p{p}": float(np.percentile(times_ms, p)) for p in selected_percentiles + }, + } + + return metrics + + +def validate_args(args): + """ + Validate command-line arguments for mm_processor benchmark. + """ + if not getattr(args, "tokenizer", None): + args.tokenizer = args.model + if not hasattr(args, "dataset_path"): + args.dataset_path = None + if not hasattr(args, "lora_path"): + args.lora_path = None + if not hasattr(args, "max_loras"): + args.max_loras = None + + +def benchmark_multimodal_processor( + args: argparse.Namespace, +) -> dict[str, Any]: + """ + Run the multimodal processor benchmark. + """ + from vllm import LLM, SamplingParams + + validate_args(args) + + if args.seed is None: + args.seed = 0 + + engine_args = EngineArgs.from_cli_args(args) + llm = LLM(**dataclasses.asdict(engine_args)) + + tokenizer = llm.get_tokenizer() + requests = get_requests(args, tokenizer) + + assert all( + llm.llm_engine.model_config.max_model_len + >= (request.prompt_len + request.expected_output_len) + for request in requests + ), ( + "Please ensure that max_model_len is greater than the sum of " + "prompt_len and expected_output_len for all requests." 
+ ) + + prompts = [request.prompt for request in requests] + expected_output_lens = [request.expected_output_len for request in requests] + + sampling_params = [ + SamplingParams( + n=1, + temperature=0.0, + max_tokens=output_len, + detokenize=True, + ) + for output_len in expected_output_lens + ] + + selected_percentiles = [ + float(p) for p in getattr(args, "metric_percentiles", "99").split(",") + ] + + freeze_gc_heap() + + print(f"Processing {len(prompts)} requests...") + start_time = time.perf_counter() + + outputs = llm.chat( + prompts, sampling_params, use_tqdm=not getattr(args, "disable_tqdm", False) + ) + + end_time = time.perf_counter() + total_time = end_time - start_time + + mm_stats_by_stage = collect_mm_processor_stats( + llm.llm_engine, + ) + + if not any(mm_stats_by_stage.values()): + print( + "\n⚠️ Warning: No MM processor stats found in registry.\n" + " This may indicate that:\n" + " - No multimodal requests were processed\n" + " - Stats were already retrieved (registry is cleared after retrieval)\n" + ) + + mm_processor_metrics = calculate_mm_processor_metrics( + mm_stats_by_stage, selected_percentiles + ) + + completed = len([o for o in outputs if o.finished]) + failed = len(outputs) - completed + + e2el_times = [] + for output in outputs: + if not output.finished or output.metrics is None: + continue + metrics = output.metrics + for attr in ("finished_time", "last_token_time"): + if ( + getattr(metrics, attr, None) is not None + and getattr(metrics, "arrival_time", None) is not None + ): + e2el_times.append( + (getattr(metrics, attr) - metrics.arrival_time) * 1000 + ) + break + + if not e2el_times and completed > 0: + avg_time_per_request = total_time / completed + e2el_times = [avg_time_per_request * 1000] * completed + + if e2el_times: + mean_e2el_ms = float(np.mean(e2el_times)) + median_e2el_ms = float(np.median(e2el_times)) + std_e2el_ms = float(np.std(e2el_times)) + percentiles_e2el_ms = [ + (p, float(np.percentile(e2el_times, p))) for p in selected_percentiles + ] + else: + mean_e2el_ms = 0.0 + median_e2el_ms = 0.0 + std_e2el_ms = 0.0 + percentiles_e2el_ms = [(p, 0.0) for p in selected_percentiles] + + benchmark_result = { + "completed": completed, + "failed": failed, + "mean_e2el_ms": mean_e2el_ms, + "median_e2el_ms": median_e2el_ms, + "std_e2el_ms": std_e2el_ms, + "percentiles_e2el_ms": percentiles_e2el_ms, + "mm_processor_stats": mm_processor_metrics, + } + + return benchmark_result + + +def add_cli_args(parser: argparse.ArgumentParser) -> None: + """Add CLI arguments for the multimodal processor benchmark.""" + from vllm.engine.arg_utils import EngineArgs + + EngineArgs.add_cli_args(parser) + + parser.set_defaults(enable_mm_processor_stats=True) + + parser.add_argument( + "--dataset-name", + type=str, + default="random-mm", + choices=["random-mm", "random-rerank"], + help="Name of the dataset to benchmark on. 
Defaults to 'random-mm'.", + ) + parser.add_argument( + "--num-prompts", + type=int, + default=10, + help="Number of prompts to process.", + ) + + from vllm.benchmarks.datasets import ( + add_random_dataset_base_args, + add_random_multimodal_dataset_args, + ) + + add_random_dataset_base_args(parser) + add_random_multimodal_dataset_args(parser) + + parser.add_argument( + "--output-json", + type=str, + default=None, + help="Path to save the benchmark results in JSON format.", + ) + parser.add_argument( + "--metric-percentiles", + type=str, + default="99", + help="Comma-separated list of percentiles to calculate (e.g., '50,90,99').", + ) + parser.add_argument( + "--disable-tqdm", + action="store_true", + help="Disable tqdm progress bar.", + ) + + +def main(args: argparse.Namespace) -> None: + """Main entry point for the multimodal processor benchmark.""" + + print("Starting multimodal processor benchmark...") + result = benchmark_multimodal_processor(args) + + print("\n" + "=" * 80) + print("Multimodal Processor Benchmark Results") + print("=" * 80) + + if "mm_processor_stats" in result: + print("\nMM Processor Timing (ms):") + selected_percentiles = [ + float(p) for p in getattr(args, "metric_percentiles", "99").split(",") + ] + mm_data = [] + for stage, metrics in result["mm_processor_stats"].items(): + row = { + "Stage": stage, + "Mean": f"{metrics['mean']:.2f}", + "Median": f"{metrics['median']:.2f}", + "Std": f"{metrics['std']:.2f}", + } + for p in selected_percentiles: + row[f"P{p}"] = f"{metrics.get(f'p{p}', 0.0):.2f}" + mm_data.append(row) + + mm_df = pd.DataFrame(mm_data) + print(mm_df.to_string(index=False)) + + if "mean_e2el_ms" in result: + print("\nEnd-to-End Latency (ms):") + selected_percentiles = [ + float(p) for p in getattr(args, "metric_percentiles", "99").split(",") + ] + + e2el_data = [ + {"Metric": "Mean", "Value (ms)": f"{result['mean_e2el_ms']:.2f}"}, + {"Metric": "Median", "Value (ms)": f"{result['median_e2el_ms']:.2f}"}, + {"Metric": "Std", "Value (ms)": f"{result['std_e2el_ms']:.2f}"}, + ] + + for p in selected_percentiles: + percentile_value = next( + (val for pct, val in result["percentiles_e2el_ms"] if pct == p), + 0.0, + ) + e2el_data.append( + { + "Metric": f"P{p}", + "Value (ms)": f"{percentile_value:.2f}", + } + ) + + e2el_df = pd.DataFrame(e2el_data) + print(e2el_df.to_string(index=False)) + + if args.output_json: + result["config"] = { + "model": args.model, + "num_prompts": args.num_prompts, + "input_len": getattr(args, "random_input_len", None), + "output_len": getattr(args, "random_output_len", None), + } + result["timestamp"] = datetime.now().isoformat() + + with open(args.output_json, "w") as f: + json.dump(result, f, indent=2) + print(f"\nResults saved to {args.output_json}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Benchmark mm processor latency") + add_cli_args(parser) + args = parser.parse_args() + main(args) diff --git a/vllm/benchmarks/throughput.py b/vllm/benchmarks/throughput.py index 37b8952a350b..3c0fea8e0111 100644 --- a/vllm/benchmarks/throughput.py +++ b/vllm/benchmarks/throughput.py @@ -24,10 +24,14 @@ MultiModalConversationDataset, PrefixRepetitionRandomDataset, RandomDataset, + RandomDatasetForReranking, + RandomMultiModalDataset, SampleRequest, ShareGPTDataset, SonnetDataset, VisionArenaDataset, + add_random_dataset_base_args, + add_random_multimodal_dataset_args, ) from vllm.benchmarks.lib.utils import convert_to_pytorch_benchmark_format, write_to_json from vllm.engine.arg_utils import 
AsyncEngineArgs, EngineArgs @@ -342,8 +346,6 @@ def get_requests(args, tokenizer): "lora_path": args.lora_path, "max_loras": args.max_loras, "num_requests": args.num_prompts, - "input_len": args.input_len, - "output_len": args.output_len, } if args.dataset_name == "random" or ( @@ -351,12 +353,26 @@ def get_requests(args, tokenizer): and args.dataset_name not in {"prefix_repetition", "random-mm", "random-rerank"} ): sample_kwargs["range_ratio"] = args.random_range_ratio - sample_kwargs["prefix_len"] = args.prefix_len + # prefer random_* arguments, fall back to regular arguments + random_prefix_len = getattr(args, "random_prefix_len", None) + sample_kwargs["prefix_len"] = ( + random_prefix_len if random_prefix_len is not None else args.prefix_len + ) + random_input_len = getattr(args, "random_input_len", None) + sample_kwargs["input_len"] = ( + random_input_len if random_input_len is not None else args.input_len + ) + random_output_len = getattr(args, "random_output_len", None) + sample_kwargs["output_len"] = ( + random_output_len if random_output_len is not None else args.output_len + ) dataset_cls = RandomDataset elif args.dataset_name == "sharegpt": dataset_cls = ShareGPTDataset if args.backend == "vllm-chat": sample_kwargs["enable_multimodal_chat"] = True + if args.output_len is not None: + sample_kwargs["output_len"] = args.output_len elif args.dataset_name == "sonnet": assert tokenizer.chat_template or tokenizer.default_chat_template, ( "Tokenizer/model must have chat template for sonnet dataset." @@ -364,9 +380,15 @@ def get_requests(args, tokenizer): dataset_cls = SonnetDataset sample_kwargs["prefix_len"] = args.prefix_len sample_kwargs["return_prompt_formatted"] = True + if args.input_len is not None: + sample_kwargs["input_len"] = args.input_len + if args.output_len is not None: + sample_kwargs["output_len"] = args.output_len elif args.dataset_name == "burstgpt": dataset_cls = BurstGPTDataset elif args.dataset_name == "hf": + if args.output_len is not None: + sample_kwargs["output_len"] = args.output_len if args.dataset_path in VisionArenaDataset.SUPPORTED_DATASET_PATHS: dataset_cls = VisionArenaDataset common_kwargs["dataset_subset"] = None @@ -395,6 +417,56 @@ def get_requests(args, tokenizer): sample_kwargs["suffix_len"] = args.prefix_repetition_suffix_len sample_kwargs["num_prefixes"] = args.prefix_repetition_num_prefixes sample_kwargs["output_len"] = args.prefix_repetition_output_len + elif args.dataset_name == "random-mm": + dataset_cls = RandomMultiModalDataset + # prefer random_* arguments, fall back to regular arguments + random_input_len = getattr(args, "random_input_len", None) + sample_kwargs["input_len"] = ( + random_input_len + if random_input_len is not None + else getattr(args, "input_len", None) + ) + random_output_len = getattr(args, "random_output_len", None) + sample_kwargs["output_len"] = ( + random_output_len + if random_output_len is not None + else getattr(args, "output_len", None) + ) + sample_kwargs["base_items_per_request"] = getattr( + args, "random_mm_base_items_per_request", None + ) + sample_kwargs["num_mm_items_range_ratio"] = getattr( + args, "random_mm_num_mm_items_range_ratio", None + ) + sample_kwargs["limit_mm_per_prompt"] = getattr( + args, "random_mm_limit_mm_per_prompt", None + ) + sample_kwargs["bucket_config"] = getattr(args, "random_mm_bucket_config", None) + sample_kwargs["enable_multimodal_chat"] = True + random_prefix_len = getattr(args, "random_prefix_len", None) + prefix_len = getattr(args, "prefix_len", None) + 
sample_kwargs["prefix_len"] = ( + random_prefix_len if random_prefix_len is not None else prefix_len + ) + sample_kwargs["range_ratio"] = args.random_range_ratio + elif args.dataset_name == "random-rerank": + dataset_cls = RandomDatasetForReranking + # prefer random_* arguments, fall back to regular arguments + random_input_len = getattr(args, "random_input_len", None) + sample_kwargs["input_len"] = ( + random_input_len + if random_input_len is not None + else getattr(args, "input_len", None) + ) + random_output_len = getattr(args, "random_output_len", None) + sample_kwargs["output_len"] = ( + random_output_len + if random_output_len is not None + else getattr(args, "output_len", None) + ) + sample_kwargs["batchsize"] = getattr(args, "random_batch_size", 1) + sample_kwargs["is_reranker"] = not getattr(args, "no_reranker", False) + sample_kwargs["range_ratio"] = args.random_range_ratio else: raise ValueError(f"Unknown dataset name: {args.dataset_name}") # Remove None values @@ -451,8 +523,12 @@ def validate_args(args): ): print("When dataset path is not set, it will default to random dataset") args.dataset_name = "random" - if args.input_len is None: - raise ValueError("input_len must be provided for a random dataset") + random_input_len = getattr(args, "random_input_len", None) + if args.input_len is None and random_input_len is None: + raise ValueError( + "Either --input-len or --random-input-len must be provided " + "for a random dataset" + ) # === Dataset Name Specific Checks === # --hf-subset and --hf-split: only used @@ -485,26 +561,79 @@ def validate_args(args): else: raise ValueError(f"{args.dataset_path} is not supported by hf dataset.") - # --random-range-ratio: only used when dataset_name is 'random' - if args.dataset_name != "random" and args.random_range_ratio is not None: + # --random-range-ratio: only used when dataset_name is 'random', + # 'random-mm', or 'random-rerank' + if ( + args.dataset_name not in {"random", "random-mm", "random-rerank"} + and args.random_range_ratio is not None + ): warnings.warn( "--random-range-ratio will be ignored since \ - --dataset-name is not 'random'.", + --dataset-name is not 'random', 'random-mm', or 'random-rerank'.", + stacklevel=2, + ) + + # --random-batch-size: only used when dataset_name is 'random-rerank' + if ( + args.dataset_name != "random-rerank" + and getattr(args, "random_batch_size", None) is not None + ) and args.random_batch_size != 1: + warnings.warn( + "--random-batch-size will be ignored since \ + --dataset-name is not 'random-rerank'.", + stacklevel=2, + ) + + # --no-reranker: only used when dataset_name is 'random-rerank' + if args.dataset_name != "random-rerank" and getattr(args, "no_reranker", False): + warnings.warn( + "--no-reranker will be ignored since \ + --dataset-name is not 'random-rerank'.", stacklevel=2, ) - # --prefix-len: only used when dataset_name is 'random', 'sonnet', or not - # set. + # --prefix-len: only used when dataset_name is 'random', 'random-mm', + # 'sonnet', or not set. 
if ( - args.dataset_name not in {"random", "sonnet", None} + args.dataset_name not in {"random", "random-mm", "sonnet", None} and args.prefix_len is not None ): warnings.warn( "--prefix-len will be ignored since --dataset-name\ - is not 'random', 'sonnet', or not set.", + is not 'random', 'random-mm', 'sonnet', or not set.", stacklevel=2, ) + # === Random Dataset Argument Conflict Detection === + # Check for conflicts between regular and random arguments when using + # random datasets + if args.dataset_name in {"random", "random-mm", "random-rerank"}: + random_input_len = getattr(args, "random_input_len", None) + random_output_len = getattr(args, "random_output_len", None) + random_prefix_len = getattr(args, "random_prefix_len", None) + + if args.input_len is not None and random_input_len is not None: + warnings.warn( + "Both --input-len and --random-input-len are specified. " + "The random version (--random-input-len) will be preferred " + "in this run.", + stacklevel=2, + ) + if args.output_len is not None and random_output_len is not None: + warnings.warn( + "Both --output-len and --random-output-len are specified. " + "The random version (--random-output-len) will be preferred " + "in this run.", + stacklevel=2, + ) + if args.prefix_len is not None and random_prefix_len is not None: + warnings.warn( + "Both --prefix-len and --random-prefix-len are specified. " + "The random version (--random-prefix-len) will be preferred " + "in this run.", + stacklevel=2, + ) + # === LoRA Settings === if getattr(args, "enable_lora", False) and args.backend != "vllm": raise ValueError("LoRA benchmarking is only supported for vLLM backend") @@ -554,7 +683,16 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--dataset-name", type=str, - choices=["sharegpt", "random", "sonnet", "burstgpt", "hf", "prefix_repetition"], + choices=[ + "sharegpt", + "random", + "sonnet", + "burstgpt", + "hf", + "prefix_repetition", + "random-mm", + "random-rerank", + ], help="Name of the dataset to benchmark on.", default="sharegpt", ) @@ -636,23 +774,19 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Number of fixed prefix tokens before the random " "context in a request (default: 0).", ) - # random dataset - parser.add_argument( - "--random-range-ratio", - type=float, - default=0.0, - help="Range ratio for sampling input/output length, " - "used only for RandomDataset. Must be in the range [0, 1) to define " - "a symmetric sampling range " - "[length * (1 - range_ratio), length * (1 + range_ratio)].", - ) # hf dtaset parser.add_argument( - "--hf-subset", type=str, default=None, help="Subset of the HF dataset." + "--hf-subset", + type=str, + default=None, + help="Subset of the HF dataset.", ) parser.add_argument( - "--hf-split", type=str, default=None, help="Split of the HF dataset." 
+ "--hf-split", + type=str, + default=None, + help="Split of the HF dataset.", ) parser.add_argument( "--profile", @@ -662,31 +796,28 @@ def add_cli_args(parser: argparse.ArgumentParser): ) # prefix repetition dataset - prefix_repetition_group = parser.add_argument_group( - "prefix repetition dataset options" - ) - prefix_repetition_group.add_argument( + parser.add_argument( "--prefix-repetition-prefix-len", type=int, default=None, help="Number of prefix tokens per request, used only for prefix " "repetition dataset.", ) - prefix_repetition_group.add_argument( + parser.add_argument( "--prefix-repetition-suffix-len", type=int, default=None, help="Number of suffix tokens per request, used only for prefix " "repetition dataset. Total input length is prefix_len + suffix_len.", ) - prefix_repetition_group.add_argument( + parser.add_argument( "--prefix-repetition-num-prefixes", type=int, default=None, help="Number of prefixes to generate, used only for prefix repetition " "dataset. Prompts per prefix is num_requests // num_prefixes.", ) - prefix_repetition_group.add_argument( + parser.add_argument( "--prefix-repetition-output-len", type=int, default=None, @@ -694,6 +825,10 @@ def add_cli_args(parser: argparse.ArgumentParser): "repetition dataset.", ) + # (random, random-mm, random-rerank) + add_random_dataset_base_args(parser) + add_random_multimodal_dataset_args(parser) + parser = AsyncEngineArgs.add_cli_args(parser) diff --git a/vllm/compilation/backends.py b/vllm/compilation/backends.py index 2fb6265560b1..4f855fc1d7c2 100644 --- a/vllm/compilation/backends.py +++ b/vllm/compilation/backends.py @@ -630,7 +630,7 @@ def __call__( os.makedirs(cache_dir, exist_ok=True) self.compilation_config.cache_dir = cache_dir rank = vllm_config.parallel_config.rank - dp_rank = vllm_config.parallel_config.data_parallel_rank + dp_rank = vllm_config.parallel_config.data_parallel_index local_cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}", self.prefix) os.makedirs(local_cache_dir, exist_ok=True) self.compilation_config.local_cache_dir = local_cache_dir diff --git a/vllm/compilation/decorators.py b/vllm/compilation/decorators.py index 40bde97ac61d..5334a57040eb 100644 --- a/vllm/compilation/decorators.py +++ b/vllm/compilation/decorators.py @@ -403,7 +403,7 @@ def __call__(self, *args, **kwargs): ) rank = self.vllm_config.parallel_config.rank - dp_rank = self.vllm_config.parallel_config.data_parallel_rank + dp_rank = self.vllm_config.parallel_config.data_parallel_index cache_dir = os.path.join(cache_dir, f"rank_{rank}_{dp_rank}") aot_compilation_path = os.path.join(cache_dir, "model") try: diff --git a/vllm/config/compilation.py b/vllm/config/compilation.py index cd527e419855..7ac6ec0068bb 100644 --- a/vllm/config/compilation.py +++ b/vllm/config/compilation.py @@ -839,9 +839,9 @@ def init_backend(self, vllm_config: "VllmConfig") -> str | Callable: """ if self.mode is None: raise ValueError( - "No compilation mode is set. This method should only be \ - called via vllm config where the level is set if none is \ - provided." + "No compilation mode is set. This method should only be " + "called via vllm config where the level is set if none is " + "provided." 
) if self.mode == CompilationMode.NONE: raise ValueError("No compilation mode is set.") diff --git a/vllm/config/model.py b/vllm/config/model.py index f080803f4973..83705e9482bf 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -10,10 +10,12 @@ import torch from pydantic import ConfigDict, Field, field_validator, model_validator from pydantic.dataclasses import dataclass -from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE import vllm.envs as envs from vllm.attention.backends.registry import AttentionBackendEnum +from vllm.config.model_arch import ( + ModelArchitectureConfig, +) from vllm.config.multimodal import MMCacheType, MMEncoderTPMode, MultiModalConfig from vllm.config.pooler import PoolerConfig from vllm.config.scheduler import RunnerType @@ -31,7 +33,6 @@ is_rope_parameters_nested, try_get_dense_modules, try_get_generation_config, - try_get_safetensors_metadata, try_get_tokenizer_config, uses_mrope, uses_xdrope_dim, @@ -42,10 +43,13 @@ maybe_patch_hf_config_from_gguf, split_remote_gguf, ) +from vllm.transformers_utils.model_arch_config_convertor import ( + MODEL_ARCH_CONFIG_CONVERTORS, + ModelArchConfigConvertorBase, +) from vllm.transformers_utils.runai_utils import ObjectStorageModel, is_runai_obj_uri from vllm.transformers_utils.utils import maybe_model_redirect from vllm.utils.import_utils import LazyLoader -from vllm.utils.torch_utils import common_broadcastable_dtype if TYPE_CHECKING: from transformers import PretrainedConfig @@ -483,6 +487,7 @@ def __post_init__( self.hf_image_processor_config = get_hf_image_processor_config( self.model, hf_token=self.hf_token, revision=self.revision ) + self.model_arch_config = self.get_model_arch_config() architectures = self.architectures registry = self.registry @@ -600,6 +605,15 @@ def __post_init__( self._verify_cuda_graph() self._verify_bnb_config() + def get_model_arch_config( + self, + ) -> ModelArchitectureConfig: + convertor_cls = MODEL_ARCH_CONFIG_CONVERTORS.get( + self.hf_config.model_type, ModelArchConfigConvertorBase + ) + convertor = convertor_cls(self.hf_config, self.hf_text_config) + return convertor.convert() + @field_validator("tokenizer", "max_model_len", mode="wrap") @classmethod def _skip_none_validation(cls, value: Any, handler: Callable) -> Any: @@ -642,7 +656,7 @@ def _get_transformers_backend_cls(self) -> str: cls = "Transformers" # If 'hf_config != hf_text_config' it's a nested config, i.e. multimodal cls += "MultiModal" if self.hf_config != self.hf_text_config else "" - cls += "MoE" if self.get_num_experts() > 1 else "" + cls += "MoE" if self.is_moe else "" # Check if the architecture we're wrapping has defaults runner = None task = None @@ -675,7 +689,7 @@ def registry(self): @property def architectures(self) -> list[str]: - return getattr(self.hf_config, "architectures", []) + return self.model_arch_config.architectures @property def architecture(self) -> str: @@ -835,56 +849,16 @@ def _get_convert_type( return convert_type - def _parse_quant_hf_config(self, hf_config: PretrainedConfig): - quant_cfg = getattr(hf_config, "quantization_config", None) - if quant_cfg is None: - # compressed-tensors uses a "compression_config" key - quant_cfg = getattr(hf_config, "compression_config", None) - - else: - # Set quant_method for ModelOpt models. 
- producer_name = quant_cfg.get("producer", {}).get("name") - if producer_name == "modelopt": - quant_algo = quant_cfg.get("quantization", {}).get("quant_algo") - if quant_algo is not None: - quant_algo_upper = str(quant_algo).upper() - if quant_algo_upper in { - "FP8", - "FP8_PER_CHANNEL_PER_TOKEN", - "FP8_PB_WO", - }: - quant_cfg["quant_method"] = "modelopt" - elif quant_algo_upper == "NVFP4": - quant_cfg["quant_method"] = "modelopt_fp4" - else: - raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") - - return quant_cfg - def _verify_quantization(self) -> None: supported_quantization = me_quant.QUANTIZATION_METHODS if self.quantization is not None: self.quantization = cast(me_quant.QuantizationMethods, self.quantization) # Parse quantization method from the HF model config, if available. - quant_cfg = self._parse_quant_hf_config(self.hf_config) - if quant_cfg is None and ( - text_config := getattr(self.hf_config, "text_config", None) - ): - # Check the text config as well for multi-modal models. - quant_cfg = self._parse_quant_hf_config(text_config) + quant_cfg = self.model_arch_config.quantization_config if quant_cfg is not None: - # Use the community standard 'quant_method' - quant_method = quant_cfg.get("quant_method", "").lower() - - # Normalize library names - quant_method = quant_method.replace( - "compressed_tensors", "compressed-tensors" - ) - - quant_cfg["quant_method"] = quant_method - + quant_method = quant_cfg["quant_method"] # Quantization methods which are overrides (i.e. they have a # `override_quantization_method` method) must be checked in order # of preference (this is particularly important for GPTQ). @@ -966,7 +940,7 @@ def _verify_cuda_graph(self) -> None: logger.warning( "CUDA graph is not supported for %s on ROCm yet, fallback " "to eager mode.", - self.hf_config.model_type, + self.model_arch_config.model_type, ) self.enforce_eager = True @@ -977,11 +951,9 @@ def _verify_bnb_config(self) -> None: # TODO Remove this when bitsandbytes supports. """ is_bitsandbytes = self.quantization == "bitsandbytes" - has_quantization_config = ( - getattr(self.hf_config, "quantization_config", None) is not None - ) + has_quantization_config = self.model_arch_config.quantization_config is not None is_8bit = ( - self.hf_config.quantization_config.get("load_in_8bit", False) + self.model_arch_config.quantization_config.get("load_in_8bit", False) if has_quantization_config else False ) @@ -1001,8 +973,7 @@ def _verify_bnb_config(self) -> None: self.enforce_eager = True def _verify_with_expert_parallelism(self) -> None: - num_experts = self.get_num_experts() - if num_experts < 1: + if not self.is_moe: raise ValueError( "Number of experts in the model must be greater than 0 " "when expert parallelism is enabled." 
@@ -1052,9 +1023,7 @@ def verify_with_parallel_config( self, parallel_config: ParallelConfig, ) -> None: - total_num_attention_heads = getattr( - self.hf_text_config, "num_attention_heads", 0 - ) + total_num_attention_heads = self.model_arch_config.total_num_attention_heads tensor_parallel_size = parallel_config.tensor_parallel_size if total_num_attention_heads % tensor_parallel_size != 0: raise ValueError( @@ -1105,10 +1074,10 @@ def get_sliding_window(self) -> int | None: return getattr(self.hf_text_config, "sliding_window", None) def get_vocab_size(self) -> int: - return getattr(self.hf_text_config, "vocab_size", 0) + return self.model_arch_config.vocab_size def get_hidden_size(self) -> int: - return getattr(self.hf_text_config, "hidden_size", 0) + return self.model_arch_config.hidden_size def get_inputs_embeds_size(self) -> int: # The size of inputs_embeds is usually identical to the size @@ -1121,141 +1090,25 @@ def get_inputs_embeds_size(self) -> int: @property def is_deepseek_mla(self) -> bool: - if not hasattr(self.hf_text_config, "model_type"): - return False - elif self.hf_text_config.model_type in ( - "deepseek_v2", - "deepseek_v3", - "deepseek_v32", - "deepseek_mtp", - "kimi_k2", - "kimi_linear", - "longcat_flash", - "pangu_ultra_moe", - "pangu_ultra_moe_mtp", - ): - return self.hf_text_config.kv_lora_rank is not None - elif self.hf_text_config.model_type == "eagle": - # if the model is an EAGLE module, check for the - # underlying architecture - return ( - self.hf_text_config.model.model_type - in ("deepseek_v2", "deepseek_v3", "deepseek_v32") - and self.hf_text_config.kv_lora_rank is not None - ) - return False + return self.model_arch_config.is_deepseek_mla @cached_property def is_mm_prefix_lm(self) -> bool: """Whether to use bidirectional attention for mm positions.""" MM_PREFIX_LM_MODELS = ( "gemma3", - # TODO(Isotr0py): Disable paligemma for now before - # we supports soft cap attention for FlexAttention - # "paligemma", + "paligemma", ) if not hasattr(self.hf_config, "model_type"): return False return self.hf_config.model_type in MM_PREFIX_LM_MODELS def get_head_size(self) -> int: - # TODO remove hard code - if self.is_deepseek_mla: - qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0) - if self.use_mla: - return self.hf_text_config.kv_lora_rank + qk_rope_head_dim - else: - qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0) - if qk_rope_head_dim and qk_nope_head_dim: - return qk_rope_head_dim + qk_nope_head_dim - - if hasattr(self.hf_text_config, "model_type") and ( - self.hf_text_config.model_type == "zamba2" - ): - return self.hf_text_config.attention_head_dim - - if self.is_attention_free: - return 0 - - # NOTE: Some configs may set head_dim=None in the config - if getattr(self.hf_text_config, "head_dim", None) is not None: - return self.hf_text_config.head_dim - - # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head` - if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None: - return self.hf_text_config.hidden_size_per_head - - # FIXME(woosuk): This may not be true for all models. - return ( - self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads - ) + return self.model_arch_config.head_size def get_total_num_kv_heads(self) -> int: """Returns the total number of KV heads.""" - # For GPTBigCode & Falcon: - # NOTE: for falcon, when new_decoder_architecture is True, the - # multi_query flag is ignored and we use n_head_kv for the number of - # KV heads. 
- falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"] - new_decoder_arch_falcon = ( - self.hf_config.model_type in falcon_model_types - and getattr(self.hf_config, "new_decoder_architecture", False) - ) - if not new_decoder_arch_falcon and getattr( - self.hf_text_config, "multi_query", False - ): - # Multi-query attention, only one KV head. - # Currently, tensor parallelism is not supported in this case. - return 1 - - # For DBRX and MPT - if self.hf_config.model_type == "mpt": - if "kv_n_heads" in self.hf_config.attn_config: - return self.hf_config.attn_config["kv_n_heads"] - return self.hf_config.num_attention_heads - if self.hf_config.model_type == "dbrx": - return getattr( - self.hf_config.attn_config, - "kv_n_heads", - self.hf_config.num_attention_heads, - ) - - if self.hf_config.model_type == "nemotron-nas": - for block in self.hf_config.block_configs: - if not block.attention.no_op: - return ( - self.hf_config.num_attention_heads - // block.attention.n_heads_in_group - ) - - raise RuntimeError( - "Could not determine the number of key-value attention heads " - "from model configuration. " - f"Model: {self.model}, Architecture: {self.architectures}. " - "This usually indicates an unsupported model architecture or " - "missing configuration. " - "Please check if your model is supported at: " - "https://docs.vllm.ai/en/latest/models/supported_models.html" - ) - - if self.is_attention_free: - return 0 - - attributes = [ - # For Falcon: - "n_head_kv", - "num_kv_heads", - # For LLaMA-2: - "num_key_value_heads", - # For ChatGLM: - "multi_query_group_num", - ] - # For non-grouped-query attention models, the number of KV heads is - # equal to the number of attention heads. - default_factory = lambda: self.hf_text_config.num_attention_heads - return getattr_iter( - self.hf_text_config, attributes, default_factory=default_factory - ) + return self.model_arch_config.total_num_kv_heads def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: """Returns the number of KV heads per GPU.""" @@ -1271,46 +1124,14 @@ def get_num_kv_heads(self, parallel_config: ParallelConfig) -> int: return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size) def get_num_attention_heads(self, parallel_config: ParallelConfig) -> int: - num_heads = getattr(self.hf_text_config, "num_attention_heads", 0) + num_heads = self.model_arch_config.total_num_attention_heads return num_heads // parallel_config.tensor_parallel_size def get_num_experts(self) -> int: - """Returns the number of experts in the model.""" - num_expert_names = [ - "num_experts", # Jamba - "moe_num_experts", # Dbrx - "n_routed_experts", # DeepSeek - "num_local_experts", # Mixtral - ] - num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0) - if isinstance(num_experts, list): - # Ernie VL's remote code uses list[int]... - # The values are always the same so we just take the first one. 
- return num_experts[0] - # Coerce to 0 if explicitly set to None - return num_experts or 0 + return self.model_arch_config.num_experts def get_total_num_hidden_layers(self) -> int: - if ( - self.hf_text_config.model_type == "deepseek_mtp" - or self.hf_config.model_type == "mimo_mtp" - or self.hf_config.model_type == "glm4_moe_mtp" - or self.hf_config.model_type == "ernie_mtp" - or self.hf_config.model_type == "qwen3_next_mtp" - or self.hf_config.model_type == "pangu_ultra_moe_mtp" - ): - total_num_hidden_layers = getattr( - self.hf_text_config, "num_nextn_predict_layers", 0 - ) - elif self.hf_config.model_type == "longcat_flash_mtp": - total_num_hidden_layers = getattr( - self.hf_text_config, "num_nextn_predict_layers", 1 - ) - else: - total_num_hidden_layers = getattr( - self.hf_text_config, "num_hidden_layers", 0 - ) - return total_num_hidden_layers + return self.model_arch_config.total_num_hidden_layers def get_layers_start_end_indices( self, parallel_config: ParallelConfig @@ -1361,9 +1182,7 @@ def get_num_layers_by_block_type( self.hf_text_config, "layers_block_type", None ) if layers_block_type_value is not None: - if hasattr(self.hf_text_config, "model_type") and ( - self.hf_text_config.model_type == "zamba2" - ): + if self.model_arch_config.text_model_type == "zamba2": if attn_block_type: return sum( t == "hybrid" for t in layers_block_type_value[start:end] @@ -1615,10 +1434,18 @@ def matryoshka_dimensions(self): return getattr(self.hf_config, "matryoshka_dimensions", None) @property - def use_pad_token(self) -> bool: - # cross_encoder models defaults to using pad_token. - # `llm as reranker` models defaults to not using pad_token. - return getattr(self.hf_config, "use_pad_token", True) + def use_sep_token(self) -> bool: + # cross_encoder models defaults to using separating token. + # `llm as reranker` defaults to not using separating token. + + use_pad_token = getattr(self.hf_config, "use_pad_token", None) + if use_pad_token is not None: + logger.warning_once( + "use_pad_token has been deprecated; please use use_sep_token instead." + ) + return use_pad_token + + return getattr(self.hf_config, "use_sep_token", True) @property def head_dtype(self) -> torch.dtype: @@ -1678,6 +1505,7 @@ def get_and_verify_max_len(self, max_model_len: int): ) max_model_len = _get_and_verify_max_len( hf_config=self.hf_text_config, + model_arch_config=self.model_arch_config, tokenizer_config=tokenizer_config, max_model_len=max_model_len, disable_sliding_window=self.disable_sliding_window, @@ -1797,11 +1625,11 @@ def is_prefix_caching_supported(self) -> bool: logger.debug("Generative models support prefix caching.") return True - def is_model_moe( - self, - ) -> bool: - return self.get_num_experts() > 1 + @property + def is_moe(self) -> bool: + return self.get_num_experts() > 0 + @property def is_quantized(self) -> bool: return getattr(self.hf_config, "quantization_config", None) is not None @@ -1908,46 +1736,6 @@ def _check_valid_dtype(model_type: str, dtype: torch.dtype): return True -def _find_dtype( - model_id: str, - config: PretrainedConfig, - *, - revision: str | None, -): - # NOTE: getattr(config, "dtype", torch.float32) is not correct - # because config.dtype can be None. 
- config_dtype = getattr(config, "dtype", None) - - # Fallbacks for multi-modal models if the root config - # does not define dtype - if config_dtype is None: - config_dtype = getattr(config.get_text_config(), "dtype", None) - if config_dtype is None and hasattr(config, "vision_config"): - config_dtype = getattr(config.vision_config, "dtype", None) - if config_dtype is None and hasattr(config, "encoder_config"): - config_dtype = getattr(config.encoder_config, "dtype", None) - - # Try to read the dtype of the weights if they are in safetensors format - if config_dtype is None: - repo_mt = try_get_safetensors_metadata(model_id, revision=revision) - - if repo_mt and (files_mt := repo_mt.files_metadata): - param_dtypes: set[torch.dtype] = { - _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] - for file_mt in files_mt.values() - for dtype_str in file_mt.parameter_count - if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE - } - - if param_dtypes: - return common_broadcastable_dtype(param_dtypes) - - if config_dtype is None: - config_dtype = torch.float32 - - return config_dtype - - def _resolve_auto_dtype( model_type: str, config_dtype: torch.dtype, @@ -2002,7 +1790,9 @@ def _get_and_verify_dtype( is_pooling_model: bool, revision: str | None = None, ) -> torch.dtype: - config_dtype = _find_dtype(model_id, config, revision=revision) + config_dtype = ModelArchConfigConvertorBase.get_torch_dtype( + config, model_id, revision=revision + ) model_type = config.model_type if isinstance(dtype, str): @@ -2065,6 +1855,7 @@ def _get_head_dtype( def _get_and_verify_max_len( hf_config: PretrainedConfig, + model_arch_config: ModelArchitectureConfig, tokenizer_config: dict | None, max_model_len: int | None, disable_sliding_window: bool, @@ -2073,36 +1864,9 @@ def _get_and_verify_max_len( encoder_config: Any | None = None, ) -> int: """Get and verify the model's maximum length.""" - derived_max_model_len = float("inf") - possible_keys = [ - # OPT - "max_position_embeddings", - # GPT-2 - "n_positions", - # MPT - "max_seq_len", - # ChatGLM2 - "seq_length", - # Command-R - "model_max_length", - # Whisper - "max_target_positions", - # Others - "max_sequence_length", - "max_seq_length", - "seq_len", - ] - # Choose the smallest "max_length" from the possible keys - max_len_key = None - for key in possible_keys: - max_len = getattr(hf_config, key, None) - if max_len is not None: - max_len_key = key if max_len < derived_max_model_len else max_len_key - derived_max_model_len = min(derived_max_model_len, max_len) - # For Command-R / Cohere, Cohere2 / Aya Vision models - if tmp_max_len := getattr(hf_config, "model_max_length", None): - max_len_key = "model_max_length" - derived_max_model_len = tmp_max_len + (derived_max_model_len, max_len_key) = ( + model_arch_config.derived_max_model_len_and_key + ) # If sliding window is manually disabled, max_length should be less # than the sliding window length in the model config. @@ -2135,10 +1899,9 @@ def _get_and_verify_max_len( default_max_len = 2048 logger.warning( - "The model's config.json does not contain any of the following " - "keys to determine the original maximum length of the model: " - "%s. Assuming the model's maximum length is %d.", - possible_keys, + "The model's config.json does not contain any of the keys " + "to determine the original maximum length of the model. 
" + "Assuming the model's maximum length is %d.", default_max_len, ) derived_max_model_len = default_max_len diff --git a/vllm/config/model_arch.py b/vllm/config/model_arch.py new file mode 100644 index 000000000000..d55e2a3399b3 --- /dev/null +++ b/vllm/config/model_arch.py @@ -0,0 +1,57 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +from typing import Any + +from pydantic import ConfigDict +from pydantic.dataclasses import dataclass + +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +@dataclass(config=ConfigDict(arbitrary_types_allowed=True)) +class ModelArchitectureConfig: + """ + Configuration for model architecture that required by vLLM runtime + """ + + architectures: list[str] | None + """List of model architecture class names (e.g., ['LlamaForCausalLM']). + It can be None upon calling `vllm_config.with_hf_config(config.text_config)`""" + + model_type: str + """Model type identifier (e.g., 'llama', 'gpt_oss').""" + + text_model_type: str | None + """Text model type identifier (e.g., 'llama4_text').""" + + hidden_size: int + """Hidden size of the model.""" + + total_num_hidden_layers: int + """Number of hidden layers in the model.""" + + total_num_attention_heads: int + """Number of attention heads in the model.""" + + head_size: int + """Head dimension of the model.""" + + vocab_size: int + """Vocabulary size of the model.""" + + total_num_kv_heads: int + """Number of key value heads in the model.""" + + num_experts: int + """Number of experts in the model.""" + + quantization_config: dict[str, Any] | None + """Quantization configuration dictionary containing quantization parameters.""" + + is_deepseek_mla: bool + """Whether the model is a DeepSeek MLA model.""" + + derived_max_model_len_and_key: tuple[float, str | None] + """Derived maximum model length and key from the hf config.""" diff --git a/vllm/config/observability.py b/vllm/config/observability.py index 4aca6b15684a..ca347a30f7ce 100644 --- a/vllm/config/observability.py +++ b/vllm/config/observability.py @@ -67,6 +67,14 @@ def show_hidden_metrics(self) -> bool: enable_mfu_metrics: bool = False """Enable Model FLOPs Utilization (MFU) metrics.""" + enable_mm_processor_stats: bool = False + """Enable collection of timing statistics for multimodal processor operations. + This is for internal use only (e.g., benchmarks) and is not exposed as a CLI + argument.""" + + enable_mfu_metrics: bool = False + """Enable Model FLOPs Utilization (MFU) metrics.""" + @cached_property def collect_model_forward_time(self) -> bool: """Whether to collect model forward time for the request.""" diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py index 11504fb08355..9273ca66e830 100644 --- a/vllm/config/parallel.py +++ b/vllm/config/parallel.py @@ -119,6 +119,8 @@ class ParallelConfig: between local data parallel ranks, but an external LB balances between vLLM nodes/replicas. Set explicitly in conjunction with --data-parallel-start-rank.""" + is_moe_model: bool | None = None + """Whether the deployed model is MoE (if known).""" enable_expert_parallel: bool = False """Use expert parallelism instead of tensor parallelism for MoE layers.""" enable_eplb: bool = False @@ -255,6 +257,10 @@ class is dynamically inherited by the worker class. This is used to inject Block_size should be divisible by cp_kv_cache_interleave_size. 
""" + data_parallel_index: int = Field(init=False) + """Equal to the data parallel rank but not used for torch process groups + and not overridden for dense models.""" + _api_process_count: int = Field(default=1, gt=0) """ The number of API processes initialized. @@ -466,6 +472,7 @@ def compute_hash(self): "data_parallel_rank", "data_parallel_rank_local", "data_parallel_size_local", + "data_parallel_index", "data_parallel_backend", "data_parallel_external_lb", "data_parallel_hybrid_lb", @@ -546,6 +553,14 @@ def __post_init__(self) -> None: self.data_parallel_master_ip = envs.VLLM_DP_MASTER_IP self.data_parallel_master_port = envs.VLLM_DP_MASTER_PORT + if self.data_parallel_size > 1 and self.is_moe_model is False: + raise ValueError( + "Offline data parallel mode is not supported/useful" + " for dense models." + ) + + self.data_parallel_index = self.data_parallel_rank + if self.distributed_executor_backend == "external_launcher": os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0" logger.info("Disabling V1 multiprocessing for external launcher.") diff --git a/vllm/config/speculative.py b/vllm/config/speculative.py index bf533bf14e55..ad4057de834f 100644 --- a/vllm/config/speculative.py +++ b/vllm/config/speculative.py @@ -401,6 +401,9 @@ def __post_init__(self): model_type="eagle", ) self.draft_model_config.hf_config = eagle_config + self.draft_model_config.model_arch_config = ( + self.draft_model_config.get_model_arch_config() + ) if self.num_speculative_tokens is not None and hasattr( self.draft_model_config.hf_config, "num_lookahead_tokens" diff --git a/vllm/config/vllm.py b/vllm/config/vllm.py index 8e6cfb826397..30a24233575f 100644 --- a/vllm/config/vllm.py +++ b/vllm/config/vllm.py @@ -343,6 +343,29 @@ def pad_for_cudagraph(self, batch_size: int) -> int: # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size return self.compilation_config.bs_to_padded_graph_size[batch_size] + @property + def needs_dp_coordinator(self) -> bool: + """ + Determine if the DPCoordinator process is needed. + + The DPCoordinator is needed in two cases: + 1. For MoE models with DP > 1: to handle wave coordination + (even in external LB mode, since wave coordination runs in the coordinator) + 2. For non-MoE models in internal/hybrid LB mode: to collect and publish + queue stats for load balancing across DP ranks + + Returns: + True if DPCoordinator process is needed, False otherwise. + """ + + # For non-MoE models, only need coordinator in internal/hybrid LB mode + # (for stats collection). 
+ return self.parallel_config.data_parallel_size > 1 and ( + self.model_config is None + or self.model_config.is_moe + or not self.parallel_config.data_parallel_external_lb + ) + def enable_trace_function_call_for_thread(self) -> None: """ Set up function tracing for the current thread, @@ -421,6 +444,7 @@ def with_hf_config( model_config = copy.deepcopy(self.model_config) model_config.hf_config = hf_config + model_config.model_arch_config = model_config.get_model_arch_config() return replace(self, model_config=model_config) @@ -522,6 +546,8 @@ def __post_init__(self): self.model_config.verify_with_parallel_config(self.parallel_config) self.model_config.verify_dual_chunk_attention_config(self.load_config) + self.parallel_config.is_moe_model = self.model_config.is_moe + self.cache_config.verify_with_parallel_config(self.parallel_config) if self.lora_config is not None: @@ -827,9 +853,14 @@ def has_blocked_weights(): ) # Do this after all the updates to compilation_config.mode + effective_dp_size = ( + self.parallel_config.data_parallel_size + if self.model_config is None or self.model_config.is_moe + else 1 + ) self.compilation_config.set_splitting_ops_for_v1( all2all_backend=self.parallel_config.all2all_backend, - data_parallel_size=self.parallel_config.data_parallel_size, + data_parallel_size=effective_dp_size, ) if self.compilation_config.pass_config.enable_sp: @@ -1233,12 +1264,6 @@ def _set_compile_ranges(self): computed_compile_ranges_split_points ) - def recalculate_max_model_len(self, max_model_len: int): - # Can only be called in try_verify_and_update_config - model_config = self.model_config - max_model_len = model_config.get_and_verify_max_len(max_model_len) - self.model_config.max_model_len = max_model_len - def try_verify_and_update_config(self): if self.model_config is None: return @@ -1297,13 +1322,8 @@ def compile_debug_dump_path(self) -> Path | None: if self.compilation_config.debug_dump_path is None: return None tp_rank = self.parallel_config.rank - dp_rank = self.parallel_config.data_parallel_rank - data_parallel_size = self.parallel_config.data_parallel_size - append_path = ( - f"rank_{tp_rank}" - if data_parallel_size == 1 - else f"rank_{tp_rank}_dp_{dp_rank}" - ) + dp_rank = self.parallel_config.data_parallel_index + append_path = f"rank_{tp_rank}_dp_{dp_rank}" path = self.compilation_config.debug_dump_path / append_path return path diff --git a/vllm/distributed/device_communicators/base_device_communicator.py b/vllm/distributed/device_communicators/base_device_communicator.py index caeff54406b5..4a2a7ec5b728 100644 --- a/vllm/distributed/device_communicators/base_device_communicator.py +++ b/vllm/distributed/device_communicators/base_device_communicator.py @@ -285,6 +285,7 @@ def dispatch( hidden_states: torch.Tensor, router_logits: torch.Tensor, is_sequence_parallel: bool = False, + extra_tensors: list[torch.Tensor] | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: """ Dispatch the hidden states and router logits to the appropriate device. 
diff --git a/vllm/distributed/device_communicators/xpu_communicator.py b/vllm/distributed/device_communicators/xpu_communicator.py index ad61fdfb8ea5..f3d9262d20cf 100644 --- a/vllm/distributed/device_communicators/xpu_communicator.py +++ b/vllm/distributed/device_communicators/xpu_communicator.py @@ -23,11 +23,11 @@ def __init__( ): super().__init__(cpu_group, device, device_group, unique_name) if self.use_all2all: - if self.all2all_backend != "naive": + if self.all2all_backend != "naive": # type: ignore[has-type] logger.warning( "`%s` all2all manager is not supported on XPU. " "Falling back to `naive` all2all manager for XPU.", - self.all2all_backend, + self.all2all_backend, # type: ignore[has-type] ) self.all2all_backend = "naive" if self.all2all_backend == "naive": @@ -78,12 +78,15 @@ def dispatch( hidden_states: torch.Tensor, router_logits: torch.Tensor, is_sequence_parallel: bool = False, + extra_tensors: list[torch.Tensor] | None = None, ) -> tuple[torch.Tensor, torch.Tensor]: assert self.all2all_manager is not None - hidden_states, router_logits = self.all2all_manager.dispatch( - hidden_states, router_logits, is_sequence_parallel + return self.all2all_manager.dispatch( + hidden_states, + router_logits, + is_sequence_parallel, + extra_tensors, # type: ignore[call-arg] ) - return hidden_states, router_logits def combine( self, hidden_states: torch.Tensor, is_sequence_parallel: bool = False diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index c5654659b79d..7826b1286716 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -423,7 +423,7 @@ def add_model( # Set the policy based on the selected eplb algorithm type. policy_type = self.parallel_config.eplb_config.policy self.policy = EPLB_POLICIES[policy_type] - logger.debug("Selected EPLB policy: %d", policy_type) + logger.debug("Selected EPLB policy: %s", policy_type) if global_expert_load is not None: ep_group = get_ep_group().device_group assert global_expert_load.shape == ( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py index 0e87dea59d23..1383fc09eb0a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/lmcache_integration/utils.py @@ -6,7 +6,6 @@ from typing import TYPE_CHECKING, Union import torch -from lmcache.config import LMCacheEngineConfig as Config from lmcache.logging import init_logger from lmcache.v1.config import LMCacheEngineConfig as V1Config @@ -20,7 +19,7 @@ ENGINE_NAME = "vllm-instance" # Thread-safe singleton storage -_config_instance: Config | V1Config | None = None +_config_instance: V1Config | None = None _config_lock = threading.Lock() @@ -29,7 +28,7 @@ def is_false(value: str) -> bool: return value.lower() in ("false", "0", "no", "n", "off") -def lmcache_get_or_create_config() -> Config | V1Config: +def lmcache_get_or_create_config() -> V1Config: """Get the LMCache configuration from the environment variable `LMCACHE_CONFIG_FILE`. If the environment variable is not set, this function will return the default configuration. @@ -43,16 +42,7 @@ def lmcache_get_or_create_config() -> Config | V1Config: if _config_instance is None: with _config_lock: if _config_instance is None: # Check again within lock - if is_false(os.getenv("LMCACHE_USE_EXPERIMENTAL", "True")): - logger.warning( - "Detected LMCACHE_USE_EXPERIMENTAL is set to False. 
" - "Using legacy configuration is deprecated and will " - "be remove soon! Please set LMCACHE_USE_EXPERIMENTAL " - "to True." - ) - LMCacheEngineConfig = Config # type: ignore[assignment] - else: - LMCacheEngineConfig = V1Config # type: ignore[assignment] + LMCacheEngineConfig = V1Config # type: ignore[assignment] if "LMCACHE_CONFIG_FILE" not in os.environ: logger.warning( diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py index 9a15d3fa6ed0..af7792286f6a 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/mooncake_connector.py @@ -915,6 +915,6 @@ def get_mooncake_side_channel_port(vllm_config: VllmConfig) -> int: # This logic is now centralized return ( envs.VLLM_MOONCAKE_BOOTSTRAP_PORT - + vllm_config.parallel_config.data_parallel_rank + + vllm_config.parallel_config.data_parallel_index * vllm_config.parallel_config.tensor_parallel_size ) diff --git a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py index baeafc08dd3f..2be3f2f84ff3 100644 --- a/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py +++ b/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py @@ -471,7 +471,7 @@ def __init__(self, vllm_config: VllmConfig, engine_id: str): self.side_channel_host = envs.VLLM_NIXL_SIDE_CHANNEL_HOST self.side_channel_port = ( envs.VLLM_NIXL_SIDE_CHANNEL_PORT - + vllm_config.parallel_config.data_parallel_rank + + vllm_config.parallel_config.data_parallel_index ) assert vllm_config.kv_transfer_config is not None if current_platform.device_type == "cpu": diff --git a/vllm/distributed/parallel_state.py b/vllm/distributed/parallel_state.py index f5ada5a009ec..4611d42a5874 100644 --- a/vllm/distributed/parallel_state.py +++ b/vllm/distributed/parallel_state.py @@ -1115,7 +1115,11 @@ def get_dp_group() -> GroupCoordinator: def get_ep_group() -> GroupCoordinator: - assert _EP is not None, "expert parallel group is not initialized" + assert _EP is not None, ( + "expert parallel group is not initialized. " + "EP group is only created for MoE models with num_experts > 0. " + "This function should only be called for MoE models." + ) return _EP @@ -1400,20 +1404,23 @@ def initialize_model_parallel( global _EP assert _EP is None, "expert parallel group is already initialized" - group_ranks = ( - all_ranks.transpose(1, 2) - .reshape( - -1, - data_parallel_size - * prefill_context_model_parallel_size - * tensor_model_parallel_size, + # Don't create EP group for dense models. 
+ if config is None or config.model_config is None or config.model_config.is_moe: + group_ranks = ( + all_ranks.transpose(1, 2) + .reshape( + -1, + data_parallel_size + * prefill_context_model_parallel_size + * tensor_model_parallel_size, + ) + .unbind(0) ) - .unbind(0) - ) - group_ranks = [x.tolist() for x in group_ranks] - _EP = init_model_parallel_group( - group_ranks, get_world_group().local_rank, backend, group_name="ep" - ) + group_ranks = [x.tolist() for x in group_ranks] + _EP = init_model_parallel_group( + group_ranks, get_world_group().local_rank, backend, group_name="ep" + ) + # If no EP group needed, _EP remains None logger.info_once( "rank %s in world size %s is assigned as " @@ -1425,7 +1432,7 @@ def initialize_model_parallel( _PP.rank_in_group, _PCP.rank_in_group, _TP.rank_in_group, - _EP.rank_in_group, + _EP.rank_in_group if _EP is not None else "N/A", ) diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index dd12b032c78c..98f1cfbd5922 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -523,6 +523,7 @@ class EngineArgs: ObservabilityConfig.enable_layerwise_nvtx_tracing ) enable_mfu_metrics: bool = ObservabilityConfig.enable_mfu_metrics + enable_mm_processor_stats: bool = ObservabilityConfig.enable_mm_processor_stats scheduling_policy: SchedulerPolicy = SchedulerConfig.policy scheduler_cls: str | type[object] | None = SchedulerConfig.scheduler_cls @@ -838,7 +839,9 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: **parallel_kwargs["data_parallel_external_lb"], ) parallel_group.add_argument( - "--enable-expert-parallel", **parallel_kwargs["enable_expert_parallel"] + "--enable-expert-parallel", + "-ep", + **parallel_kwargs["enable_expert_parallel"], ) parallel_group.add_argument( "--all2all-backend", **parallel_kwargs["all2all_backend"] @@ -1574,6 +1577,7 @@ def create_engine_config( data_parallel_rpc_port=data_parallel_rpc_port, data_parallel_backend=self.data_parallel_backend, data_parallel_hybrid_lb=self.data_parallel_hybrid_lb, + is_moe_model=model_config.is_moe, enable_expert_parallel=self.enable_expert_parallel, all2all_backend=self.all2all_backend, enable_dbo=self.enable_dbo, @@ -1712,6 +1716,7 @@ def create_engine_config( cudagraph_metrics=self.cudagraph_metrics, enable_layerwise_nvtx_tracing=self.enable_layerwise_nvtx_tracing, enable_mfu_metrics=self.enable_mfu_metrics, + enable_mm_processor_stats=self.enable_mm_processor_stats, ) # Compilation config overrides diff --git a/vllm/entrypoints/cli/__init__.py b/vllm/entrypoints/cli/__init__.py index dc02ac563406..704d94d36f70 100644 --- a/vllm/entrypoints/cli/__init__.py +++ b/vllm/entrypoints/cli/__init__.py @@ -1,6 +1,9 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.entrypoints.cli.benchmark.latency import BenchmarkLatencySubcommand +from vllm.entrypoints.cli.benchmark.mm_processor import ( + BenchmarkMMProcessorSubcommand, +) from vllm.entrypoints.cli.benchmark.serve import BenchmarkServingSubcommand from vllm.entrypoints.cli.benchmark.startup import BenchmarkStartupSubcommand from vllm.entrypoints.cli.benchmark.sweep import BenchmarkSweepSubcommand @@ -8,6 +11,7 @@ __all__: list[str] = [ "BenchmarkLatencySubcommand", + "BenchmarkMMProcessorSubcommand", "BenchmarkServingSubcommand", "BenchmarkStartupSubcommand", "BenchmarkSweepSubcommand", diff --git a/vllm/entrypoints/cli/benchmark/mm_processor.py b/vllm/entrypoints/cli/benchmark/mm_processor.py new file mode 100644 index 
000000000000..8f1799af12e5 --- /dev/null +++ b/vllm/entrypoints/cli/benchmark/mm_processor.py @@ -0,0 +1,21 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import argparse + +from vllm.benchmarks.mm_processor import add_cli_args, main +from vllm.entrypoints.cli.benchmark.base import BenchmarkSubcommandBase + + +class BenchmarkMMProcessorSubcommand(BenchmarkSubcommandBase): + """The `mm-processor` subcommand for `vllm bench`.""" + + name = "mm-processor" + help = "Benchmark multimodal processor latency across different configurations." + + @classmethod + def add_cli_args(cls, parser: argparse.ArgumentParser) -> None: + add_cli_args(parser) + + @staticmethod + def cmd(args: argparse.Namespace) -> None: + main(args) diff --git a/vllm/entrypoints/logger.py b/vllm/entrypoints/logger.py index 678a7b3a60b5..c9e809353b59 100644 --- a/vllm/entrypoints/logger.py +++ b/vllm/entrypoints/logger.py @@ -1,6 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import logging from collections.abc import Sequence import torch @@ -26,23 +27,24 @@ def log_inputs( params: SamplingParams | PoolingParams | BeamSearchParams | None, lora_request: LoRARequest | None, ) -> None: - max_log_len = self.max_log_len - if max_log_len is not None: - if prompt is not None: - prompt = prompt[:max_log_len] + if logger.isEnabledFor(logging.DEBUG): + max_log_len = self.max_log_len + if max_log_len is not None: + if prompt is not None: + prompt = prompt[:max_log_len] - if prompt_token_ids is not None: - prompt_token_ids = prompt_token_ids[:max_log_len] + if prompt_token_ids is not None: + prompt_token_ids = prompt_token_ids[:max_log_len] - logger.debug( - "Request %s details: prompt: %r, " - "prompt_token_ids: %s, " - "prompt_embeds shape: %s.", - request_id, - prompt, - prompt_token_ids, - prompt_embeds.shape if prompt_embeds is not None else None, - ) + logger.debug( + "Request %s details: prompt: %r, " + "prompt_token_ids: %s, " + "prompt_embeds shape: %s.", + request_id, + prompt, + prompt_token_ids, + prompt_embeds.shape if prompt_embeds is not None else None, + ) logger.info( "Received request %s: params: %s, lora_request: %s.", diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 56004a1939b7..b4c46bb66e7c 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -1091,6 +1091,7 @@ async def init_app_state( enable_prompt_tokens_details=args.enable_prompt_tokens_details, enable_force_include_usage=args.enable_force_include_usage, enable_log_outputs=args.enable_log_outputs, + exclude_log_deltas=args.exclude_log_deltas, log_error_stack=args.log_error_stack, ) if "generate" in supported_tasks diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 799afca12a6e..413e71ec2e46 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -80,7 +80,7 @@ class FrontendArgs: uds: str | None = None """Unix domain socket path. If set, host and port arguments are ignored.""" uvicorn_log_level: Literal[ - "debug", "info", "warning", "error", "critical", "trace" + "critical", "error", "warning", "info", "debug", "trace" ] = "info" """Log level for uvicorn.""" disable_uvicorn_access_log: bool = False @@ -187,6 +187,9 @@ class FrontendArgs: enable_log_outputs: bool = False """If True, log model outputs (generations). 
Requires --enable-log-requests.""" + exclude_log_deltas: bool = False + """If True, model outputs will be logged once streaming is complete. Deltas + will not be logged. Requires --enable-log-outputs.""" h11_max_incomplete_event_size: int = H11_MAX_INCOMPLETE_EVENT_SIZE_DEFAULT """Maximum size (bytes) of an incomplete HTTP event (header or body) for h11 parser. Helps mitigate header abuse. Default: 4194304 (4 MB).""" @@ -305,6 +308,8 @@ def validate_parsed_serve_args(args: argparse.Namespace): # Enable auto tool needs a tool call parser to be valid if args.enable_auto_tool_choice and not args.tool_call_parser: raise TypeError("Error: --enable-auto-tool-choice requires --tool-call-parser") + if args.exclude_log_deltas and not args.enable_log_outputs: + raise TypeError("Error: --exclude-log-deltas requires --enable-log-outputs") if args.enable_log_outputs and not args.enable_log_requests: raise TypeError("Error: --enable-log-outputs requires --enable-log-requests") diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index 32a3cf04951e..f0d78dace390 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -101,6 +101,7 @@ def __init__( enable_prompt_tokens_details: bool = False, enable_force_include_usage: bool = False, enable_log_outputs: bool = False, + exclude_log_deltas: bool = False, log_error_stack: bool = False, default_chat_template_kwargs: dict[str, Any] | None = None, ) -> None: @@ -118,6 +119,7 @@ def __init__( self.trust_request_chat_template = trust_request_chat_template self.default_chat_template_kwargs = default_chat_template_kwargs or {} self.enable_log_outputs = enable_log_outputs + self.exclude_log_deltas = exclude_log_deltas # set up logits processors self.logits_processors = self.model_config.logits_processors @@ -659,9 +661,14 @@ async def chat_completion_stream_generator( "Tokenizer not available when `skip_tokenizer_init=True`" ) + # Pass the same chat template kwargs as used in tokenization + chat_template_kwargs = self._prepare_extra_chat_template_kwargs( + request.chat_template_kwargs, + self.default_chat_template_kwargs, + ) reasoning_parser = self.reasoning_parser( tokenizer, - chat_template_kwargs=request.chat_template_kwargs, # type: ignore + chat_template_kwargs=chat_template_kwargs, # type: ignore[call-arg] ) except RuntimeError as e: logger.exception("Error in reasoning parser creation.") @@ -1130,7 +1137,7 @@ async def chat_completion_stream_generator( if tc.function and tc.function.arguments ) - if delta_content: + if delta_content and not self.exclude_log_deltas: self.request_logger.log_outputs( request_id=request_id, outputs=delta_content, @@ -1437,9 +1444,14 @@ async def chat_completion_full_generator( "Tokenizer not available when `skip_tokenizer_init=True`" ) + # Pass the same chat template kwargs as used in tokenization + chat_template_kwargs = self._prepare_extra_chat_template_kwargs( + request.chat_template_kwargs, + self.default_chat_template_kwargs, + ) reasoning_parser = self.reasoning_parser( tokenizer, - chat_template_kwargs=request.chat_template_kwargs, # type: ignore + chat_template_kwargs=chat_template_kwargs, # type: ignore[call-arg] ) except RuntimeError as e: logger.exception("Error in reasoning parser creation.") diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py index f1c4ab63f05b..e65141edd144 100644 --- a/vllm/entrypoints/openai/serving_engine.py +++ b/vllm/entrypoints/openai/serving_engine.py @@ 
-1146,6 +1146,18 @@ def _validate_chat_template( ) return None + @staticmethod + def _prepare_extra_chat_template_kwargs( + request_chat_template_kwargs: dict[str, Any] | None = None, + default_chat_template_kwargs: dict[str, Any] | None = None, + ) -> dict[str, Any]: + """Helper to merge server-default and request-specific chat template kwargs.""" + request_chat_template_kwargs = request_chat_template_kwargs or {} + if default_chat_template_kwargs is None: + return request_chat_template_kwargs + # Apply server defaults first, then request kwargs override. + return default_chat_template_kwargs | request_chat_template_kwargs + async def _preprocess_chat( self, request: ChatLikeRequest | ResponsesRequest, @@ -1184,9 +1196,10 @@ async def _preprocess_chat( tools=tool_dicts, documents=documents, ) - if default_chat_template_kwargs: - _chat_template_kwargs.update(default_chat_template_kwargs) - _chat_template_kwargs.update(chat_template_kwargs or {}) + _chat_template_kwargs |= self._prepare_extra_chat_template_kwargs( + chat_template_kwargs, + default_chat_template_kwargs, + ) request_prompt: str | list[int] diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index ba10a72fe7e0..a3837d9d32ab 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -199,14 +199,14 @@ def default_tokenizer_encode(): full_prompt = _apply_model_score_template(model_config, prompt_1, prompt_2) prompt_inputs = tokenizer(full_prompt, **tokenization_kwargs) else: - if model_config.use_pad_token: - # cross_encoder models defaults to using pad_token. + if model_config.use_sep_token: + # cross_encoder models defaults to using separating token. prompt_inputs = tokenizer( text=prompt_1, text_pair=prompt_2, **tokenization_kwargs ) full_prompt = tokenizer.decode(prompt_inputs["input_ids"]) else: - # `llm as reranker` models defaults to not using pad_token. + # `llm as reranker` defaults to not using separating token. full_prompt = prompt_1 + prompt_2 prompt_inputs = tokenizer(text=full_prompt, **tokenization_kwargs) return full_prompt, prompt_inputs diff --git a/vllm/envs.py b/vllm/envs.py index 1d4128d74b95..dadb8c8a231c 100755 --- a/vllm/envs.py +++ b/vllm/envs.py @@ -75,7 +75,7 @@ VLLM_MEDIA_CONNECTOR: str = "http" VLLM_TARGET_DEVICE: str = "cuda" VLLM_MAIN_CUDA_VERSION: str = "12.9" - VLLM_FLOAT32_MATMUL_PRECISION: Literal["ieee", "tf32"] = "ieee" + VLLM_FLOAT32_MATMUL_PRECISION: Literal["highest", "high", "medium"] = "highest" MAX_JOBS: str | None = None NVCC_THREADS: str | None = None VLLM_USE_PRECOMPILED: bool = False @@ -459,13 +459,11 @@ def get_vllm_port() -> int | None: "VLLM_MAIN_CUDA_VERSION": lambda: os.getenv("VLLM_MAIN_CUDA_VERSION", "").lower() or "12.9", # Controls PyTorch float32 matmul precision mode within vLLM workers. - # Accepted values: - # - "ieee" (default): force full IEEE FP32 matmul precision. - # - "tf32": enable TensorFloat32-based fast matmul. + # Valid options mirror torch.set_float32_matmul_precision "VLLM_FLOAT32_MATMUL_PRECISION": env_with_choices( "VLLM_FLOAT32_MATMUL_PRECISION", - "ieee", - ["ieee", "tf32"], + "highest", + ["highest", "high", "medium"], case_sensitive=False, ), # Maximum number of compilation jobs to run in parallel. 
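A rough sketch (not part of the patch) of how the new VLLM_FLOAT32_MATMUL_PRECISION values are expected to map onto PyTorch, since the hunk above says they now mirror torch.set_float32_matmul_precision; the snippet assumes only the standard torch API and the env var name shown above.
import os

import torch

# "highest" keeps full IEEE FP32 matmuls; "high" and "medium" allow
# TF32/bfloat16-based fast paths on hardware that supports them.
precision = os.getenv("VLLM_FLOAT32_MATMUL_PRECISION", "highest").lower()
if precision not in ("highest", "high", "medium"):
    raise ValueError(f"Unsupported VLLM_FLOAT32_MATMUL_PRECISION: {precision}")
torch.set_float32_matmul_precision(precision)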
diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 7a569ec32eac..d1223ad83fbc 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -4,7 +4,7 @@ import time from collections import defaultdict from contextlib import contextmanager -from dataclasses import dataclass +from dataclasses import dataclass, field from typing import Any, NamedTuple import torch @@ -13,6 +13,7 @@ from vllm.attention.backends.abstract import AttentionMetadata from vllm.config import CUDAGraphMode, ParallelConfig, VllmConfig from vllm.logger import init_logger +from vllm.platforms import current_platform from vllm.v1.worker.dp_utils import coordinate_batch_across_dp from vllm.v1.worker.ubatch_utils import UBatchSlices @@ -102,6 +103,7 @@ def make( ) -> "DPMetadata": assert num_tokens_across_dp_cpu is not None assert parallel_config.data_parallel_size > 1 + assert parallel_config.is_moe_model is not False dp_rank = parallel_config.data_parallel_rank batchsize = num_tokens @@ -205,6 +207,8 @@ class ForwardContext: ubatch_slices: UBatchSlices | None = None + additional_kwargs: dict[str, Any] = field(default_factory=dict) + def __post_init__(self): assert self.cudagraph_runtime_mode.valid_runtime_modes(), ( f"Invalid cudagraph runtime mode: {self.cudagraph_runtime_mode}" @@ -235,6 +239,7 @@ def create_forward_context( cudagraph_runtime_mode: CUDAGraphMode = CUDAGraphMode.NONE, batch_descriptor: BatchDescriptor | None = None, ubatch_slices: UBatchSlices | None = None, + additional_kwargs: dict[str, Any] | None = None, ): return ForwardContext( no_compile_layers=vllm_config.compilation_config.static_forward_context, @@ -244,6 +249,7 @@ def create_forward_context( cudagraph_runtime_mode=cudagraph_runtime_mode, batch_descriptor=batch_descriptor, ubatch_slices=ubatch_slices, + additional_kwargs=additional_kwargs or {}, ) @@ -309,6 +315,17 @@ def set_forward_context( if cudagraph_runtime_mode != CUDAGraphMode.NONE and num_tokens is not None: batch_descriptor = batch_descriptor or BatchDescriptor(num_tokens=num_tokens) + additional_kwargs = current_platform.set_additional_forward_context( + attn_metadata=attn_metadata, + vllm_config=vllm_config, + virtual_engine=virtual_engine, + num_tokens=num_tokens, + num_tokens_across_dp=num_tokens_across_dp, + cudagraph_runtime_mode=cudagraph_runtime_mode, + batch_descriptor=batch_descriptor, + ubatch_slices=ubatch_slices, + ) + forward_context = create_forward_context( attn_metadata, vllm_config, @@ -317,6 +334,7 @@ def set_forward_context( cudagraph_runtime_mode, batch_descriptor, ubatch_slices, + additional_kwargs, ) try: @@ -329,8 +347,6 @@ def set_forward_context( # we use synchronous scheduling right now, # adding a sync point here should not affect # scheduling of the next batch - from vllm.platforms import current_platform - synchronize = current_platform.synchronize if synchronize is not None: synchronize() diff --git a/vllm/inputs/preprocess.py b/vllm/inputs/preprocess.py index 43b5fa5ad477..6723809b51e0 100644 --- a/vllm/inputs/preprocess.py +++ b/vllm/inputs/preprocess.py @@ -6,7 +6,7 @@ from typing_extensions import assert_never -from vllm.config import ModelConfig +from vllm.config import ModelConfig, ObservabilityConfig from vllm.logger import init_logger from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalRegistry from vllm.multimodal.cache import BaseMultiModalProcessorCache @@ -47,6 +47,7 @@ def __init__( self, model_config: ModelConfig, tokenizer: TokenizerLike | None, + observability_config: ObservabilityConfig | None = None, 
mm_registry: MultiModalRegistry = MULTIMODAL_REGISTRY, mm_processor_cache: BaseMultiModalProcessorCache | None = None, ) -> None: @@ -54,6 +55,7 @@ def __init__( self.model_config = model_config self.tokenizer = tokenizer + self.observability_config = observability_config self.mm_registry = mm_registry self.mm_processor_cache = mm_processor_cache @@ -232,6 +234,7 @@ def _get_mm_processor(self) -> BaseMultiModalProcessor: if not hasattr(self, "_mm_processor"): self._mm_processor = self.mm_registry.create_processor( self.model_config, + self.observability_config, tokenizer=self.tokenizer, cache=self.mm_processor_cache, ) diff --git a/vllm/lora/layers/fused_moe.py b/vllm/lora/layers/fused_moe.py index 7c6d86b3602f..093737f1123c 100644 --- a/vllm/lora/layers/fused_moe.py +++ b/vllm/lora/layers/fused_moe.py @@ -130,7 +130,6 @@ def _inject_lora_into_fused_moe(self): prepare_finalize, self.base_layer ), self.base_layer.shared_experts, - getattr(self.base_layer, "shared_experts_stream", None), ) if quant_config.use_mxfp4_w4a16: assert isinstance( @@ -672,20 +671,9 @@ def set_lora( self.reset_lora(index) self.adapter_enabled[index] = 1 - num_experts = self.w13_lora_a_stacked[0].shape[1] w13_lora_a, w2_lora_a = lora_a w13_lora_b, w2_lora_b = lora_b - # (num_experts,rank,input_size) - w13_lora_a = w13_lora_a.reshape(num_experts, -1, w13_lora_a.shape[-1]) - w2_lora_a = w2_lora_a.reshape(num_experts, -1, w2_lora_a.shape[-1]) - # (output_size,rank,num_experts) - w13_lora_b = w13_lora_b.reshape(w13_lora_b.shape[0], -1, num_experts) - w2_lora_b = w2_lora_b.reshape(w2_lora_b.shape[0], -1, num_experts) - # (num_experts,output_size,rank) - w13_lora_b = w13_lora_b.permute(2, 0, 1) - w2_lora_b = w2_lora_b.permute(2, 0, 1) - sliced_w13_lora_a = self._slice_w13_a(w13_lora_a) sliced_w13_lora_b = self._slice_w13_b(w13_lora_b) diff --git a/vllm/lora/model_manager.py b/vllm/lora/model_manager.py index 69787f8f012b..5ef1b823c00a 100644 --- a/vllm/lora/model_manager.py +++ b/vllm/lora/model_manager.py @@ -256,61 +256,7 @@ def activate_adapter( if not module_lora: module.reset_lora(index) continue - # Note (gnovack) - If MOE lora weights are not split into - # num_experts chunks, we split them here - if isinstance(module, FusedMoE3DWithLoRA) and torch.is_tensor( - module_lora.lora_a - ): - # Handle PEFT file format where experts.base_layer is the - # gate_up_proj and experts is the down_proj - gate_up_proj_lora = self._get_lora_layer_weights( - lora_model, module_name + ".base_layer" - ) - down_proj_lora = module_lora - # FIXME Edge case where LoRA is not added to gate_up_proj - # or down_proj - assert gate_up_proj_lora is not None - assert down_proj_lora is not None - if self._is_3d_moe_model: - module_lora.lora_a = [ - gate_up_proj_lora.lora_a, - down_proj_lora.lora_a, - ] - module_lora.lora_b = [ - gate_up_proj_lora.lora_b, - down_proj_lora.lora_b, - ] - else: - # Some 3D MoE models haven't added the `is_3d_moe_weight` - # attribute yet, so fallback here - num_experts = module_lora.lora_a.shape[0] // module_lora.rank - - gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0) - up_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0) - - gate_proj_b = gate_up_proj_lora.lora_b[::2, ...].chunk( - num_experts, dim=-1 - ) - up_proj_b = gate_up_proj_lora.lora_b[1::2, ...].chunk( - num_experts, dim=-1 - ) - - down_proj_a = down_proj_lora.lora_a.chunk(num_experts, dim=0) - down_proj_b = down_proj_lora.lora_b.chunk(num_experts, dim=-1) - lora_a = [] - lora_b = [] - for i in range(num_experts): - 
lora_a.append(gate_proj_a[i]) - lora_a.append(down_proj_a[i]) - lora_a.append(up_proj_a[i]) - - lora_b.append(gate_proj_b[i]) - lora_b.append(down_proj_b[i]) - lora_b.append(up_proj_b[i]) - - module_lora.lora_a = lora_a - module_lora.lora_b = lora_b module.set_lora( index, module_lora.lora_a, @@ -627,6 +573,10 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: for lora in lora_model.loras.values(): lora.optimize() + for module_name, module in self.modules.items(): + if isinstance(module, FusedMoE3DWithLoRA): + self._stack_moe_lora_weights(lora_model, module, module_name) + first_lora: LoRALayerWeights = next(iter(lora_model.loras.values())) assert first_lora.lora_a is not None if isinstance(first_lora.lora_a, list): @@ -653,6 +603,91 @@ def _create_merged_loras_inplace(self, lora_model: LoRAModel) -> None: lora.lora_a = lora.lora_a.pin_memory() lora.lora_b = lora.lora_b.pin_memory() + def _stack_moe_lora_weights( + self, lora_model: LoRAModel, module: FusedMoE3DWithLoRA, module_name: str + ): + module_lora = self._get_lora_layer_weights(lora_model, module_name) + + # Note (gnovack) - If MOE lora weights are not split into + # num_experts chunks, we split them here + if module_lora and torch.is_tensor(module_lora.lora_a): + # Handle PEFT file format where experts.base_layer is the + # gate_up_proj and experts is the down_proj + gate_up_proj_lora = self._get_lora_layer_weights( + lora_model, module_name + ".base_layer" + ) + down_proj_lora = module_lora + # FIXME Edge case where LoRA is not added to gate_up_proj + # or down_proj + assert gate_up_proj_lora is not None + assert down_proj_lora is not None + if self._is_3d_moe_model: + num_experts = module.w13_lora_a_stacked[0].shape[1] + + # (num_experts,rank,input_size) + gate_up_proj_lora.lora_a = gate_up_proj_lora.lora_a.reshape( + num_experts, -1, gate_up_proj_lora.lora_a.shape[-1] + ) + down_proj_lora.lora_a = down_proj_lora.lora_a.reshape( + num_experts, -1, down_proj_lora.lora_a.shape[-1] + ) + + # (output_size,num_experts,rank) + gate_up_proj_lora.lora_b = gate_up_proj_lora.lora_b.reshape( + gate_up_proj_lora.lora_b.shape[0], -1, num_experts + ) + down_proj_lora.lora_b = down_proj_lora.lora_b.reshape( + down_proj_lora.lora_b.shape[0], -1, num_experts + ) + + # (num_experts,output_size,rank) + gate_up_proj_lora.lora_b = gate_up_proj_lora.lora_b.permute( + 2, 0, 1 + ).contiguous() + down_proj_lora.lora_b = down_proj_lora.lora_b.permute( + 2, 0, 1 + ).contiguous() + + module_lora.lora_a = [ + gate_up_proj_lora.lora_a, + down_proj_lora.lora_a, + ] + module_lora.lora_b = [ + gate_up_proj_lora.lora_b, + down_proj_lora.lora_b, + ] + else: + # Some 3D MoE models haven't added the `is_3d_moe_weight` + # attribute yet, so fallback here + num_experts = module_lora.lora_a.shape[0] // module_lora.rank + + gate_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0) + up_proj_a = gate_up_proj_lora.lora_a.chunk(num_experts, dim=0) + + gate_proj_b = gate_up_proj_lora.lora_b[::2, ...].chunk( + num_experts, dim=-1 + ) + up_proj_b = gate_up_proj_lora.lora_b[1::2, ...].chunk( + num_experts, dim=-1 + ) + + down_proj_a = down_proj_lora.lora_a.chunk(num_experts, dim=0) + down_proj_b = down_proj_lora.lora_b.chunk(num_experts, dim=-1) + + lora_a = [] + lora_b = [] + for i in range(num_experts): + lora_a.append(gate_proj_a[i]) + lora_a.append(down_proj_a[i]) + lora_a.append(up_proj_a[i]) + + lora_b.append(gate_proj_b[i]) + lora_b.append(down_proj_b[i]) + lora_b.append(up_proj_b[i]) + + module_lora.lora_a = lora_a + module_lora.lora_b = 
lora_b + def _get_lora_layer_weights( self, lora_model: LoRAModel, module_name: str ) -> LoRALayerWeights | None: diff --git a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py index f04936221eea..771035691090 100644 --- a/vllm/lora/ops/triton_ops/fused_moe_lora_op.py +++ b/vllm/lora/ops/triton_ops/fused_moe_lora_op.py @@ -163,15 +163,17 @@ def _fused_moe_lora_kernel( # accumulator accumulator = tl.zeros((BLOCK_SIZE_M, BLOCK_SIZE_N), dtype=tl.float32) - # GDC wait waits for ALL programs in the prior kernel to complete - # before continuing. if USE_GDC and not IS_PRIMARY: tl.extra.cuda.gdc_wait() for k in range(0, grid_k): k_remaining = K - k * (BLOCK_SIZE_K * SPLIT_K) + # GDC wait waits for ALL programs in the prior kernel to complete + # before continuing. # pre-fetch lora weight b = tl.load(b_ptrs, mask=offs_k[:, None] < k_remaining, other=0.0) + if USE_GDC and not IS_PRIMARY: + tl.extra.cuda.gdc_wait() a = tl.load( a_ptrs, mask=token_mask[:, None] & (offs_k[None, :] < k_remaining), @@ -229,9 +231,9 @@ def _fused_moe_lora_shrink( num_stages: int, split_k: int, mul_routed_weight: bool = False, + use_gdc: bool = False, ) -> None: w1_lora_a_stacked = lora_a_stacked[0] - use_gdc = supports_pdl(qcurr_hidden_states.device) shrink_config = { "BLOCK_SIZE_M": block_size_m, "BLOCK_SIZE_N": block_size_n, @@ -324,6 +326,7 @@ def _fused_moe_lora_expand( split_k: int, mul_routed_weight: bool = False, offset: int = 0, + use_gdc: bool = False, ) -> None: b_ptr = _get_ptr(lora_b_stacked, device) K = max_lora_rank @@ -335,7 +338,6 @@ def _fused_moe_lora_expand( -1, a_intermediate_cache1.shape[3] ) - use_gdc = supports_pdl(a_intermediate_cache1.device) expand_config = { "BLOCK_SIZE_M": block_size_m, "BLOCK_SIZE_N": block_size_n, @@ -464,7 +466,7 @@ def _fused_moe_lora( dtype=output.dtype, device=device, ) - + use_gdc = supports_pdl(device) and not fully_sharded _fused_moe_lora_shrink( a_intermediate_cache1, qcurr_hidden_states, @@ -493,6 +495,7 @@ def _fused_moe_lora( shrink_num_stages, shrink_split_k, mul_routed_weight, + use_gdc=use_gdc, ) if fully_sharded: @@ -540,6 +543,7 @@ def _fused_moe_lora( expand_split_k, mul_routed_weight, offset, + use_gdc=use_gdc, ) @@ -602,6 +606,7 @@ def _fused_moe_lora_shrink_fake( num_stages: int, split_k: int, mul_routed_weight: bool = False, + use_gdc: bool = False, ) -> None: return @@ -635,6 +640,7 @@ def _fused_moe_lora_expand_fake( num_stages: int, split_k: int, mul_routed_weight: bool = False, + use_gdc: bool = False, ) -> None: return diff --git a/vllm/lora/ops/triton_ops/lora_expand_op.py b/vllm/lora/ops/triton_ops/lora_expand_op.py index 311c4b191859..862f5f6b2431 100644 --- a/vllm/lora/ops/triton_ops/lora_expand_op.py +++ b/vllm/lora/ops/triton_ops/lora_expand_op.py @@ -14,8 +14,6 @@ from vllm.triton_utils import tl, triton from vllm.utils.torch_utils import direct_register_custom_op -from .utils import supports_pdl - @triton.jit def _lora_expand_kernel( @@ -241,7 +239,9 @@ def _lora_expand( # thread blocks simply exit. MAX_LORAS, ) - use_gdc = supports_pdl(inputs.device) + # We disable PDL temporarily because LoRA kernels are not launching back-to-back, + # making PDL invalid and affecting the kernel performance. 
+ use_gdc = False # supports_pdl(inputs.device) _lora_expand_kernel[grid]( inputs, lora_ptr_tensor, diff --git a/vllm/lora/ops/triton_ops/lora_shrink_op.py b/vllm/lora/ops/triton_ops/lora_shrink_op.py index 71bd5e361466..9ba82b396a48 100644 --- a/vllm/lora/ops/triton_ops/lora_shrink_op.py +++ b/vllm/lora/ops/triton_ops/lora_shrink_op.py @@ -14,8 +14,6 @@ from vllm.triton_utils import tl, triton from vllm.utils.torch_utils import direct_register_custom_op -from .utils import supports_pdl - @triton.jit def _lora_shrink_kernel( @@ -221,7 +219,9 @@ def _lora_shrink( # thread blocks exit early. MAX_LORAS, ) - use_gdc = supports_pdl(inputs.device) + # We disable PDL temporarily because LoRA kernels are not launching back-to-back, + # making PDL invalid and affecting the kernel performance. + use_gdc = False # supports_pdl(inputs.device) _lora_shrink_kernel[grid]( inputs, lora_ptr_tensor, diff --git a/vllm/model_executor/__init__.py b/vllm/model_executor/__init__.py index b50f0cb3a61a..8d79940b858f 100644 --- a/vllm/model_executor/__init__.py +++ b/vllm/model_executor/__init__.py @@ -2,10 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from vllm.model_executor.parameter import BasevLLMParameter, PackedvLLMParameter -from vllm.model_executor.utils import set_random_seed __all__ = [ - "set_random_seed", "BasevLLMParameter", "PackedvLLMParameter", ] diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py index 66250f816f45..a80768c33a51 100644 --- a/vllm/model_executor/custom_op.py +++ b/vllm/model_executor/custom_op.py @@ -67,8 +67,9 @@ def forward_xpu(self, *args, **kwargs): return self.forward_native(*args, **kwargs) def forward_cpu(self, *args, **kwargs): - # By default, we assume that CPU ops are compatible with CUDA ops. - return self.forward_cuda(*args, **kwargs) + # By default, we assume that CPU ops are compatible with the + # PyTorch-native implementation. + return self.forward_native(*args, **kwargs) def forward_tpu(self, *args, **kwargs): # By default, we assume that TPU ops are compatible with the diff --git a/vllm/model_executor/layers/fla/ops/utils.py b/vllm/model_executor/layers/fla/ops/utils.py index 5a48e56a5fbb..18e17a5110c1 100644 --- a/vllm/model_executor/layers/fla/ops/utils.py +++ b/vllm/model_executor/layers/fla/ops/utils.py @@ -119,7 +119,7 @@ def wrapper(*args, **kwargs): def get_available_device() -> str: try: return triton.runtime.driver.active.get_current_target().backend - except BaseException: + except (RuntimeError, AttributeError): return "cpu" diff --git a/vllm/model_executor/layers/fused_moe/all2all_utils.py b/vllm/model_executor/layers/fused_moe/all2all_utils.py index 86c50f39f007..036b3cac4cb3 100644 --- a/vllm/model_executor/layers/fused_moe/all2all_utils.py +++ b/vllm/model_executor/layers/fused_moe/all2all_utils.py @@ -77,8 +77,10 @@ def maybe_make_prepare_finalize( prepare_finalize: FusedMoEPrepareAndFinalize | None = None - # TODO: could allow this now - assert not moe.use_flashinfer_cutlass_kernels, "Must be created in modelopt.py" + # TODO(rob): update this as part of the MoE refactor. 
+ assert not moe.use_flashinfer_cutlass_kernels, ( + "Must be created in modelopt.py or fp8.py" + ) if moe.use_pplx_kernels: assert quant_config is not None diff --git a/vllm/model_executor/layers/fused_moe/config.py b/vllm/model_executor/layers/fused_moe/config.py index 4266514bc94e..3f298f7a5ca2 100644 --- a/vllm/model_executor/layers/fused_moe/config.py +++ b/vllm/model_executor/layers/fused_moe/config.py @@ -452,11 +452,14 @@ def make( - a1_scale: Optional scale to be used for a1. - a2_scale: Optional scale to be used for a2. - g1_alphas: Optional global quantization scales for w1 (for nvfp4). - per-channel scales for w1 (for W4A8 FP8). + Optional per-channel scales for w1 (for W4A8 FP8). + Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8). - g2_alphas: Optional global quantization scales for w2 (for nvfp4). - per-channel scales for w2 (for W4A8 FP8). - - a1_gscale: Optional global quantization scales for a1 (for nvfp4). - - a2_gscale: Optional global quantization scales for a2 (for nvfp4). + Optional per-channel scales for w2 (for W4A8 FP8). + Optional dq scale i.e. w_scale * a_scale (for W8A8 fp8). + - a1_gscale: Optional global quantization scales for a1 (1.0 / a1_scale). + - a2_gscale: Optional global quantization scales for a2 (1.0 / a2_scale). + - w1_bias: Optional biases for w1 (GPT OSS Triton). - w2_bias: Optional biases for w1 (GPT OSS Triton). - w1_zp: Optional w1 zero points for int4/int8 quantization. diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/cutlass_moe.py index 9281780fca47..c585cbc1ab5d 100644 --- a/vllm/model_executor/layers/fused_moe/cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/cutlass_moe.py @@ -173,7 +173,7 @@ def run_cutlass_moe_fp8( num_expert = global_num_experts if expert_map is None else expert_map.size(0) # permuted a1q reuses workspace2 - a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, a1q_scale, expert_first_token_offset, inv_perm, _ = moe_permute( a1q, a1q_scale, topk_ids, @@ -182,7 +182,7 @@ def run_cutlass_moe_fp8( expert_map, permuted_hidden_states=a1q_perm, ) - expert_offsets = expert_offsets[:-1] + expert_offsets = expert_first_token_offset[:-1] ops.get_cutlass_moe_mm_problem_sizes( local_topk_ids, problem_sizes1, problem_sizes2, global_num_experts, N, K @@ -215,9 +215,6 @@ def run_cutlass_moe_fp8( act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out ) - if expert_map is not None: - mm2_out.fill_(0) - ops.cutlass_moe_mm( mm2_out, a2q, @@ -243,6 +240,9 @@ def run_cutlass_moe_fp8( permuted_hidden_states=mm2_out, topk_weights=topk_weights, inv_permuted_idx=inv_perm, + expert_first_token_offset=( + expert_first_token_offset if expert_map is not None else None + ), ) @@ -988,7 +988,7 @@ def run_cutlass_moe_w4a8_fp8( num_expert = global_num_experts if expert_map is None else expert_map.size(0) # permuted a1q reuses workspace2 - a1q, a1q_scale, expert_offsets, inv_perm, _ = moe_permute( + a1q, a1q_scale, expert_first_token_offset, inv_perm, _ = moe_permute( a1q, a1q_scale, topk_ids, @@ -997,7 +997,7 @@ def run_cutlass_moe_w4a8_fp8( expert_map, permuted_hidden_states=a1q_perm, ) - expert_offsets = expert_offsets[:-1] + expert_offsets = expert_first_token_offset[:-1] # For RS gemm SwapAB is always enabled (swap logical M, N in the problem shape) ops.get_cutlass_moe_mm_problem_sizes( local_topk_ids, problem_sizes1, problem_sizes2, global_num_experts, N, K @@ -1032,9 +1032,6 @@ def run_cutlass_moe_w4a8_fp8( act_out, a2_scale, use_per_token_if_dynamic=per_act_token, output=quant_out ) - if expert_map is not None: -
mm2_out.fill_(0) - ops.cutlass_w4a8_moe_mm( mm2_out, a2q, @@ -1058,6 +1055,9 @@ def run_cutlass_moe_w4a8_fp8( permuted_hidden_states=mm2_out, topk_weights=topk_weights, inv_permuted_idx=inv_perm, + expert_first_token_offset=( + expert_first_token_offset if expert_map is not None else None + ), ) diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index f864634c6617..09c3d9b2190f 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -165,10 +165,10 @@ def apply( ): # FP8 per-tensor path: use global alphas/scales; do not pass input_sf quant_scales = [ - self.g1_alphas, - self.a2_gscale, - self.g2_alphas, - self.a1_gscale, + self.g1_alphas, # w13_weight_scale * w13_input_scale + self.a2_gscale, # 1.0 / w2_input_scale + self.g2_alphas, # w2_weight_scale * w2_input_scale + self.a1_scale, ] a1q_scale = None # not passing input_sf in fp8 @@ -241,7 +241,9 @@ def flashinfer_cutlass_moe_fp4( apply_router_weight_on_input: bool = False, ) -> torch.Tensor: fused_experts = mk.FusedMoEModularKernel( - create_flashinfer_prepare_finalize(use_dp=False), + create_flashinfer_prepare_finalize( + use_dp=False, use_nvfp4=True, enable_alltoallv=False + ), FlashInferExperts( out_dtype=hidden_states.dtype, quant_config=quant_config, diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py index 762890867e60..0b0efdafbd4d 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_prepare_finalize.py @@ -10,6 +10,9 @@ ) from vllm.forward_context import get_forward_context from vllm.model_executor.layers.fused_moe.config import FusedMoEQuantConfig +from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, +) from vllm.model_executor.layers.fused_moe.topk_weight_and_reduce import ( TopKWeightAndReduceNoOP, ) @@ -181,13 +184,14 @@ def prepare( self._apply_router_weight_on_input( a1, topk_weights, topk_ids, apply_router_weight_on_input ) - if not self.use_dp and quant_config.quant_dtype == "nvfp4": + is_nvfp4 = quant_config.quant_dtype == "nvfp4" + if not self.use_dp and is_nvfp4: return a1, None, None, topk_ids, topk_weights if not self.use_deepseek_fp8_block_scale: a1q, a1q_scale = moe_kernel_quantize_input( a1, - quant_config.a1_gscale, + quant_config.a1_gscale if is_nvfp4 else quant_config.a1_scale, quant_config.quant_dtype, quant_config.per_act_token_quant, quant_config.block_shape, @@ -219,7 +223,7 @@ def prepare( topk_weights, topk_ids, a1q = gathered a1q_scale = None - if quant_config.quant_dtype == "nvfp4" and a1q_scale is not None: + if is_nvfp4 and a1q_scale is not None: a1q_scale = nvfp4_block_scale_interleave(a1q_scale) return a1q, a1q_scale, None, topk_ids, topk_weights @@ -349,14 +353,23 @@ def create_flashinfer_prepare_finalize( use_nvfp4: bool = False, enable_alltoallv: bool = False, use_deepseek_fp8_block_scale: bool = False, -) -> FlashInferCutlassMoEPrepareAndFinalize: +) -> FlashInferCutlassMoEPrepareAndFinalize | MoEPrepareAndFinalizeNoEP: """Factory function to create the appropriate FlashInfer implementation.""" + # TODO(rob): migrate non-DP cases to MoEPrepareAndFinalizeNoEP + # once we complete the FP8 refactor. 
if use_nvfp4: if enable_alltoallv: return FlashInferAllToAllMoEPrepareAndFinalize(use_dp) else: return FlashInferAllGatherMoEPrepareAndFinalize(use_dp) - # FP8 path currently supported via AllGather; optionally enable block-scale - return FlashInferAllGatherMoEPrepareAndFinalize( - use_dp=use_dp, use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale - ) + + # FP8 DP path currently supported via AllGather. + if use_dp: + return FlashInferAllGatherMoEPrepareAndFinalize( + use_dp=True, + use_deepseek_fp8_block_scale=use_deepseek_fp8_block_scale, + ) + else: + # NOTE(rob): CUTLASS FP8 block quant executes the input + # quantization and grouped gemm in a single kernel. + return MoEPrepareAndFinalizeNoEP(defer_input_quant=use_deepseek_fp8_block_scale) diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py index 7a0a3718cb80..b434780e19a2 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe.py @@ -541,11 +541,70 @@ def fused_moe_kernel( tl.store(c_ptrs, accumulator, mask=c_mask) -def invoke_fused_moe_kernel( +# NOTE(zyongye): we can remove all the wna16 kernel +# once we drop off sm75 support +def invoke_fused_moe_wna16_cuda_kernel( + A: torch.Tensor, + B: torch.Tensor, + C: torch.Tensor, + B_scale: torch.Tensor | None, + B_zp: torch.Tensor | None, + topk_weights: torch.Tensor | None, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, + top_k: int, + config: dict[str, Any], + block_shape: list[int], +): + assert B_scale is not None and B_scale.ndim == 3 + assert B_zp is None or B_zp.ndim == 3 + assert block_shape is None or block_shape[0] == 0 + + M = A.size(0) + num_tokens = M * top_k + bit = 4 + + config = config.copy() + config.update( + get_moe_wna16_block_config( + config=config, + use_moe_wna16_cuda=True, + num_valid_tokens=num_tokens, + size_k=A.size(1), + size_n=B.size(1), + num_experts=B.size(1), + group_size=block_shape[1], + real_top_k=top_k, + block_size_m=config["BLOCK_SIZE_M"], + ) + ) + + ops.moe_wna16_gemm( + A, + C, + B, + B_scale, + B_zp, + topk_weights if mul_routed_weight else None, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + top_k, + config["BLOCK_SIZE_M"], + config["BLOCK_SIZE_N"], + config["BLOCK_SIZE_K"], + bit, + ) + + +# NOTE(zyongye): we can remove all the wna16 kernel +# once we drop off sm75 support +def invoke_fused_moe_wna16_triton_kernel( A: torch.Tensor, B: torch.Tensor, C: torch.Tensor, - A_scale: torch.Tensor | None, B_scale: torch.Tensor | None, B_zp: torch.Tensor | None, topk_weights: torch.Tensor | None, @@ -556,6 +615,96 @@ def invoke_fused_moe_kernel( top_k: int, config: dict[str, Any], compute_type: tl.dtype, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + block_shape: list[int], +): + assert B_scale is not None and B_scale.ndim == 3 + assert B_zp is None or B_zp.ndim == 3 + assert block_shape is None or block_shape[0] == 0 + + M = A.size(0) + num_tokens = M * top_k + + EM = sorted_token_ids.size(0) + if A.size(0) < config["BLOCK_SIZE_M"]: + # optimize for small batch_size. + # We assume that top_ids of each token is unique, + # so num_valid_experts <= batch_size <= BLOCK_SIZE_M, + # and we can skip some invalid blocks.
+ EM = min(sorted_token_ids.size(0), A.size(0) * top_k * config["BLOCK_SIZE_M"]) + grid = lambda META: ( + triton.cdiv(EM, META["BLOCK_SIZE_M"]) + * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]), + ) + config = config.copy() + config.update( + get_moe_wna16_block_config( + config=config, + use_moe_wna16_cuda=False, + num_valid_tokens=num_tokens, + size_k=A.size(1), + size_n=B.size(1), + num_experts=B.size(1), + group_size=block_shape[1], + real_top_k=top_k, + block_size_m=config["BLOCK_SIZE_M"], + ) + ) + + fused_moe_kernel_gptq_awq[grid]( + A, + B, + C, + B_scale, + B_zp, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.size(1), + A.size(1), + EM, + num_tokens, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + B_scale.stride(0), + B_scale.stride(2), + B_scale.stride(1), + B_zp.stride(0) if B_zp is not None else 0, + B_zp.stride(2) if B_zp is not None else 0, + B_zp.stride(1) if B_zp is not None else 0, + block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0, + group_size=block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + has_zp=B_zp is not None, + use_int4_w4a16=use_int4_w4a16, + use_int8_w8a16=use_int8_w8a16, + **config, + ) + + +def invoke_fused_moe_triton_kernel( + A: torch.Tensor, + B: torch.Tensor, + C: torch.Tensor, + A_scale: torch.Tensor | None, + B_scale: torch.Tensor | None, + topk_weights: torch.Tensor | None, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, + top_k: int, + config: dict[str, Any], + compute_type: tl.dtype, use_fp8_w8a8: bool, use_int8_w8a8: bool, use_int8_w8a16: bool, @@ -563,7 +712,7 @@ def invoke_fused_moe_kernel( per_channel_quant: bool, block_shape: list[int] | None = None, B_bias: torch.Tensor | None = None, -) -> None: +): assert topk_weights is not None or not mul_routed_weight assert topk_weights is None or topk_weights.stride(1) == 1 assert sorted_token_ids.stride(0) == 1 @@ -576,7 +725,6 @@ def invoke_fused_moe_kernel( assert block_shape is None or triton.cdiv( B.size(-1), block_shape[1] ) == B_scale.size(-1) - elif use_int8_w8a16 or use_int4_w4a16: assert B_scale is not None assert block_shape is None or block_shape[0] == 0 @@ -599,13 +747,90 @@ def invoke_fused_moe_kernel( * triton.cdiv(B.size(1), META["BLOCK_SIZE_N"]), ) HAS_BIAS = B_bias is not None - if ( - (use_int8_w8a16 or use_int4_w4a16) - and block_shape is not None - and block_shape[1] > 0 + + config = config.copy() + config["SPLIT_K"] = 1 + BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K") + if block_shape is not None: + BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1])) + fused_moe_kernel[grid]( + A, + B, + C, + B_bias, + A_scale, + B_scale, + topk_weights, + sorted_token_ids, + expert_ids, + num_tokens_post_padded, + B.size(1), + B.size(2), + EM, + num_tokens, + A.stride(0), + A.stride(1), + B.stride(0), + B.stride(2), + B.stride(1), + C.stride(1), + C.stride(2), + A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0, + A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0, + B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, + B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, + B_bias.stride(0) if B_bias is not None else 0, + B_bias.stride(1) if B_bias is not None else 0, + 0 if block_shape is None else 
block_shape[0], + 0 if block_shape is None else block_shape[1], + MUL_ROUTED_WEIGHT=mul_routed_weight, + top_k=top_k, + compute_type=compute_type, + use_fp8_w8a8=use_fp8_w8a8, + use_int8_w8a8=use_int8_w8a8, + use_int8_w8a16=use_int8_w8a16, + per_channel_quant=per_channel_quant, + HAS_BIAS=HAS_BIAS, + BLOCK_SIZE_K=BLOCK_SIZE_K, + **config, + ) + + +def dispatch_fused_moe_kernel( + A: torch.Tensor, + B: torch.Tensor, + C: torch.Tensor, + A_scale: torch.Tensor | None, + B_scale: torch.Tensor | None, + B_zp: torch.Tensor | None, + topk_weights: torch.Tensor | None, + sorted_token_ids: torch.Tensor, + expert_ids: torch.Tensor, + num_tokens_post_padded: torch.Tensor, + mul_routed_weight: bool, + top_k: int, + config: dict[str, Any], + compute_type: tl.dtype, + use_fp8_w8a8: bool, + use_int8_w8a8: bool, + use_int8_w8a16: bool, + use_int4_w4a16: bool, + per_channel_quant: bool, + block_shape: list[int] | None = None, + B_bias: torch.Tensor | None = None, +) -> None: + assert topk_weights is not None or not mul_routed_weight + assert topk_weights is None or topk_weights.stride(1) == 1 + assert sorted_token_ids.stride(0) == 1 + + M = A.size(0) + num_tokens = M * top_k + + if (use_int8_w8a16 or use_int4_w4a16) and ( + block_shape is not None and block_shape[1] > 0 ): - assert B_scale is not None and B_scale.ndim == 3 - assert B_zp is None or B_zp.ndim == 3 + assert B_bias is None use_moe_wna16_cuda = should_moe_wna16_use_cuda( num_valid_tokens=num_tokens, @@ -613,41 +838,25 @@ def invoke_fused_moe_kernel( num_experts=B.size(0), bit=4 if use_int4_w4a16 else 8, ) - config = config.copy() - config.update( - get_moe_wna16_block_config( - config=config, - use_moe_wna16_cuda=use_moe_wna16_cuda, - num_valid_tokens=num_tokens, - size_k=A.size(1), - size_n=B.size(1), - num_experts=B.size(1), - group_size=block_shape[1], - real_top_k=top_k, - block_size_m=config["BLOCK_SIZE_M"], - ) - ) if use_moe_wna16_cuda: - bit = 4 if use_int4_w4a16 else 8 - ops.moe_wna16_gemm( + invoke_fused_moe_wna16_cuda_kernel( A, - C, B, + C, B_scale, B_zp, - topk_weights if mul_routed_weight else None, + topk_weights, sorted_token_ids, expert_ids, num_tokens_post_padded, + mul_routed_weight, top_k, - config["BLOCK_SIZE_M"], - config["BLOCK_SIZE_N"], - config["BLOCK_SIZE_K"], - bit, + config, + block_shape, ) return - fused_moe_kernel_gptq_awq[grid]( + invoke_fused_moe_wna16_triton_kernel( A, B, C, @@ -657,80 +866,37 @@ def invoke_fused_moe_kernel( sorted_token_ids, expert_ids, num_tokens_post_padded, - B.size(1), - A.size(1), - EM, - num_tokens, - A.stride(0), - A.stride(1), - B.stride(0), - B.stride(2), - B.stride(1), - C.stride(1), - C.stride(2), - B_scale.stride(0), - B_scale.stride(2), - B_scale.stride(1), - B_zp.stride(0) if B_zp is not None else 0, - B_zp.stride(2) if B_zp is not None else 0, - B_zp.stride(1) if B_zp is not None else 0, - block_k_diviable=A.size(1) % config["BLOCK_SIZE_K"] == 0, - group_size=block_shape[1], - MUL_ROUTED_WEIGHT=mul_routed_weight, - top_k=top_k, - compute_type=compute_type, - has_zp=B_zp is not None, - use_int4_w4a16=use_int4_w4a16, - use_int8_w8a16=use_int8_w8a16, - **config, + mul_routed_weight, + top_k, + config, + compute_type, + use_int8_w8a16, + use_int4_w4a16, + block_shape, ) + else: - config = config.copy() - config["SPLIT_K"] = 1 - BLOCK_SIZE_K = config.pop("BLOCK_SIZE_K") - if block_shape is not None: - BLOCK_SIZE_K = min(BLOCK_SIZE_K, min(block_shape[0], block_shape[1])) - fused_moe_kernel[grid]( + invoke_fused_moe_triton_kernel( A, B, C, - B_bias, A_scale, B_scale, topk_weights, 
sorted_token_ids, expert_ids, num_tokens_post_padded, - B.size(1), - B.size(2), - EM, - num_tokens, - A.stride(0), - A.stride(1), - B.stride(0), - B.stride(2), - B.stride(1), - C.stride(1), - C.stride(2), - A_scale.stride(0) if A_scale is not None and A_scale.ndim == 2 else 0, - A_scale.stride(1) if A_scale is not None and A_scale.ndim == 2 else 0, - B_scale.stride(0) if B_scale is not None and B_scale.ndim >= 2 else 0, - B_scale.stride(2) if B_scale is not None and B_scale.ndim == 3 else 0, - B_scale.stride(1) if B_scale is not None and B_scale.ndim >= 2 else 0, - B_bias.stride(0) if B_bias is not None else 0, - B_bias.stride(1) if B_bias is not None else 0, - 0 if block_shape is None else block_shape[0], - 0 if block_shape is None else block_shape[1], - MUL_ROUTED_WEIGHT=mul_routed_weight, - top_k=top_k, - compute_type=compute_type, - use_fp8_w8a8=use_fp8_w8a8, - use_int8_w8a8=use_int8_w8a8, - use_int8_w8a16=use_int8_w8a16, - per_channel_quant=per_channel_quant, - HAS_BIAS=HAS_BIAS, - BLOCK_SIZE_K=BLOCK_SIZE_K, - **config, + mul_routed_weight, + top_k, + config, + compute_type, + use_fp8_w8a8, + use_int8_w8a8, + use_int8_w8a16, + use_int4_w4a16, + per_channel_quant, + block_shape, + B_bias, ) @@ -1997,7 +2163,7 @@ def fused_experts_impl( ignore_invalid_experts=True, ) - invoke_fused_moe_kernel( + dispatch_fused_moe_kernel( qcurr_hidden_states, w1, intermediate_cache1, @@ -2056,7 +2222,7 @@ def fused_experts_impl( if expert_map is not None: intermediate_cache3.zero_() - invoke_fused_moe_kernel( + dispatch_fused_moe_kernel( qintermediate_cache2, w2, intermediate_cache3, @@ -2207,13 +2373,12 @@ def apply( topk_ids, config["BLOCK_SIZE_M"], global_num_experts, expert_map ) - invoke_fused_moe_kernel( + invoke_fused_moe_triton_kernel( hidden_states, w1, intermediate_cache1, a1q_scale, self.w1_scale, - self.w1_zp, None, # topk_weights sorted_token_ids, expert_ids, @@ -2245,13 +2410,12 @@ def apply( self.block_shape, ) - invoke_fused_moe_kernel( + invoke_fused_moe_triton_kernel( qintermediate_cache2, w2, intermediate_cache3, a2q_scale, self.w2_scale, - self.w2_zp, topk_weights, sorted_token_ids, expert_ids, diff --git a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py index 30ff1bf2f008..6abefde0763e 100644 --- a/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py +++ b/vllm/model_executor/layers/fused_moe/fused_moe_modular_method.py @@ -49,7 +49,6 @@ def make( prepare_finalize, old_quant_method.select_gemm_impl(prepare_finalize, moe_layer), shared_experts, - getattr(moe_layer, "shared_experts_stream", None), moe_parallel_config=moe_layer.moe_parallel_config, ), ) diff --git a/vllm/model_executor/layers/fused_moe/layer.py b/vllm/model_executor/layers/fused_moe/layer.py index f0d94bfbcaba..374dffde5724 100644 --- a/vllm/model_executor/layers/fused_moe/layer.py +++ b/vllm/model_executor/layers/fused_moe/layer.py @@ -356,14 +356,14 @@ def __init__( # TODO: Remove this after more extensive testings with TP/DP # and other execution modes if envs.VLLM_DISABLE_SHARED_EXPERTS_STREAM: - logger.info_once("Disabling MoE shared_experts cuda stream") + logger.debug_once("Disabling MoE shared_experts cuda stream", scope="local") self.shared_experts_stream = None else: # TODO(rob): enable shared expert overlap with non-cuda-alike. # aux_stream() returns None on non-cuda-alike platforms. 
self.shared_experts_stream = aux_stream() if self.shared_experts_stream is not None: - logger.info_once( + logger.debug_once( "Enabled separate cuda stream for MoE shared_experts", scope="local" ) @@ -1899,11 +1899,11 @@ def forward_impl( ) post_quant_allgather = ( - has_flashinfer_trtllm_fused_moe() - and self.quant_method is not None + self.quant_method is not None and self.dp_size > 1 and self.use_ep and isinstance(self.quant_method, ModelOptNvFp4FusedMoE) + and has_flashinfer_trtllm_fused_moe() ) if post_quant_allgather: hidden_states_to_dispatch, extra_tensors = ( diff --git a/vllm/model_executor/layers/fused_moe/modular_kernel.py b/vllm/model_executor/layers/fused_moe/modular_kernel.py index 25308b3106a4..79168948f04a 100644 --- a/vllm/model_executor/layers/fused_moe/modular_kernel.py +++ b/vllm/model_executor/layers/fused_moe/modular_kernel.py @@ -21,7 +21,6 @@ count_expert_num_tokens, disable_inplace, ) -from vllm.platforms import current_platform from vllm.utils.math_utils import cdiv from vllm.v1.worker.ubatching import ( dbo_enabled, @@ -682,14 +681,12 @@ def __init__( prepare_finalize: FusedMoEPrepareAndFinalize, fused_experts: FusedMoEPermuteExpertsUnpermute, shared_experts: torch.nn.Module | None = None, - shared_experts_stream: torch.cuda.Stream | None = None, moe_parallel_config: FusedMoEParallelConfig | None = None, ): super().__init__() self.prepare_finalize = prepare_finalize self.fused_experts = fused_experts self.shared_experts = shared_experts - self.shared_experts_stream = shared_experts_stream # prefer an explicit FusedMoEParallelConfig when available (from # FusedMoE layers / tests). @@ -904,34 +901,6 @@ def _slice_expert_tokens_metadata( expert_num_tokens_cpu=c_expert_num_tokens_cpu, ) - def _maybe_setup_shared_experts_stream( - self, hidden_states: torch.Tensor - ) -> tuple[bool, torch.Tensor | None]: - # decide whether to run shared experts on a separate CUDA stream to - # overlap with the main fused MoE kernel. - use_shared_experts_stream = ( - self.shared_experts is not None - and self.shared_experts_stream is not None - and hidden_states.is_cuda - and ( - hidden_states.shape[0] - <= envs.VLLM_SHARED_EXPERTS_STREAM_TOKEN_THRESHOLD - ) - ) - - hidden_states_clone: torch.Tensor | None = None - if use_shared_experts_stream and self.shared_experts_stream is not None: - # TODO: Optimize this (complicated) - # Note: this clone adds overhead but is required - # for correctness with multiple CUDA streams and CUDA graph capture. - hidden_states_clone = hidden_states.clone() - # record that the clone will be used by the separate stream so its - # lifetime is correctly tracked. - hidden_states_clone.record_stream(self.shared_experts_stream) - self.shared_experts_stream.wait_stream(torch.cuda.current_stream()) - - return use_shared_experts_stream, hidden_states_clone - def _prepare( self, hidden_states: torch.Tensor, @@ -1119,30 +1088,12 @@ def _finalize( topk_weights: torch.Tensor, topk_ids: torch.Tensor, apply_router_weight_on_input: bool, - hidden_states_clone: torch.Tensor | None = None, - use_shared_experts_stream: bool = False, ) -> torch.Tensor | tuple[torch.Tensor, torch.Tensor]: """ The _finalize method is a wrapper around self.prepare_finalize.finalize that handles DBO, async and shared expert overlap. 
""" - - def maybe_run_shared_experts() -> torch.Tensor | None: - if self.shared_experts is None: - return None - - if ( - not use_shared_experts_stream - or self.shared_experts_stream is not None - and (not hidden_states.is_cuda or not torch.cuda.is_available()) - ): - # fall back to running on the current stream - return self.shared_experts(hidden_states) - - assert hidden_states_clone is not None - # launch shared experts on the dedicated stream. - with torch.cuda.stream(self.shared_experts_stream): - return self.shared_experts(hidden_states_clone) + shared_output: torch.Tensor | None = None if not self.prepare_finalize.supports_async(): assert not dbo_enabled() @@ -1155,7 +1106,8 @@ def maybe_run_shared_experts() -> torch.Tensor | None: apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - shared_output = maybe_run_shared_experts() + if self.shared_experts is not None: + shared_output = self.shared_experts(hidden_states) else: finalize_ret = self.prepare_finalize.finalize_async( output, @@ -1165,8 +1117,8 @@ def maybe_run_shared_experts() -> torch.Tensor | None: apply_router_weight_on_input, self.fused_experts.finalize_weight_and_reduce_impl(), ) - - shared_output = maybe_run_shared_experts() + if self.shared_experts is not None: + shared_output = self.shared_experts(hidden_states) # TODO(lucas): refactor this in the alternative schedules followup # currently unpack if we have hook + receiver pair or just @@ -1189,28 +1141,12 @@ def maybe_run_shared_experts() -> torch.Tensor | None: receiver() - self._wait_for_shared_experts_stream(hidden_states, use_shared_experts_stream) - if self.shared_experts is None: return output else: assert shared_output is not None return shared_output, output - def _wait_for_shared_experts_stream( - self, hidden_states: torch.Tensor, use_shared_experts_stream: bool - ) -> None: - # ensure that any work enqueued on the shared_experts_stream is - # completed before the shared_output tensor is consumed - if ( - self.shared_experts is not None - and use_shared_experts_stream - and self.shared_experts_stream is not None - and hidden_states.is_cuda - and current_platform.is_cuda() - ): - torch.cuda.current_stream().wait_stream(self.shared_experts_stream) - def forward( self, hidden_states: torch.Tensor, @@ -1257,10 +1193,6 @@ def forward( else: output = torch.zeros_like(hidden_states) - use_shared_experts_stream, hidden_states_clone = ( - self._maybe_setup_shared_experts_stream(hidden_states) - ) - local_num_experts = w1.size(0) if global_num_experts == -1: global_num_experts = local_num_experts @@ -1297,6 +1229,4 @@ def forward( topk_weights, topk_ids, apply_router_weight_on_input, - hidden_states_clone=hidden_states_clone, - use_shared_experts_stream=use_shared_experts_stream, ) diff --git a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py index ec1e410608d6..06707e5e4892 100644 --- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py +++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py @@ -231,8 +231,7 @@ def rocm_aiter_fused_experts( # w8a8 block-scaled if quant_config.block_shape is not None and quant_config.use_fp8_w8a8: assert not apply_router_weight_on_input, ( - "apply_router_weight_on_input is\ - not supported for block scaled moe" + "apply_router_weight_on_input is not supported for block scaled moe" ) assert quant_config.w1_scale is not None assert quant_config.w2_scale is not None diff --git 
a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py index 82dbccf3fa9d..41762b7f6492 100644 --- a/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py +++ b/vllm/model_executor/layers/fused_moe/unquantized_fused_moe_method.py @@ -16,6 +16,9 @@ FusedMoEQuantConfig, biased_moe_quant_config, ) +from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, +) from vllm.model_executor.layers.fused_moe.fused_moe_method_base import ( FusedMoEMethodBase, ) @@ -27,7 +30,13 @@ from vllm.model_executor.layers.fused_moe.prepare_finalize import ( MoEPrepareAndFinalizeNoEP, ) -from vllm.model_executor.utils import set_weight_attrs +from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + AiterExperts, +) +from vllm.model_executor.layers.quantization.utils.flashinfer_utils import ( + swap_w13_to_w31, +) +from vllm.model_executor.utils import replace_parameter, set_weight_attrs from vllm.platforms import current_platform from vllm.platforms.interface import CpuArchEnum from vllm.utils.flashinfer import has_flashinfer_cutlass_fused_moe @@ -54,12 +63,6 @@ def __init__(self, moe: FusedMoEConfig): super().__init__(moe) self.rocm_aiter_moe_enabled = rocm_aiter_ops.is_fused_moe_enabled() - if self.rocm_aiter_moe_enabled: - from .rocm_aiter_fused_moe import rocm_aiter_fused_experts - - self.rocm_aiter_fused_experts = rocm_aiter_fused_experts - else: - self.rocm_aiter_fused_experts = None # type: ignore # FlashInfer CUTLASS MoE is only supported on Hopper and later GPUS self.flashinfer_cutlass_moe_enabled = ( @@ -73,18 +76,6 @@ def __init__(self, moe: FusedMoEConfig): logger.info_once( "Enabling FlashInfer CUTLASS MoE for UnquantizedFusedMoEMethod" ) - from functools import partial - - from .flashinfer_cutlass_moe import flashinfer_cutlass_moe - - self.flashinfer_cutlass_moe = partial( - flashinfer_cutlass_moe, - quant_config=FUSED_MOE_UNQUANTIZED_CONFIG, - tp_rank=self.moe.moe_parallel_config.tp_rank, - tp_size=self.moe.moe_parallel_config.tp_size, - ep_rank=self.moe.moe_parallel_config.ep_rank, - ep_size=self.moe.moe_parallel_config.ep_size, - ) else: if ( self.moe.moe_parallel_config.use_ep @@ -101,7 +92,6 @@ def __init__(self, moe: FusedMoEConfig): "FlashInfer CUTLASS MoE is currently not available for DP.", scope="local", ) - self.flashinfer_cutlass_moe = None # type: ignore @property def supports_eplb(self) -> bool: @@ -214,20 +204,6 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.w13_weight.data = self._maybe_pad_weight(layer.w13_weight.data) layer.w2_weight.data = self._maybe_pad_weight(layer.w2_weight.data) - if self.rocm_aiter_moe_enabled: - shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( - layer.w13_weight.data, layer.w2_weight.data - ) - - layer.w13_weight.data = shuffled_w13 - layer.w2_weight.data = shuffled_w2 - - if self.flashinfer_cutlass_moe_enabled: - # Swap halves to arrange as [w3; w1] (kernel expectation) - w1_w, w3_w = torch.chunk(layer.w13_weight.data, 2, dim=1) - w13_weight_swapped = torch.cat([w3_w, w1_w], dim=1) - layer.w13_weight.data = w13_weight_swapped.contiguous() - if current_platform.is_xpu(): import intel_extension_for_pytorch as ipex @@ -271,11 +247,44 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: layer.cpu_fused_moe = cpu_fused_moe.CPUFusedMOE(layer) elif current_platform.is_cuda_alike(): self.moe_quant_config = self.get_fused_moe_quant_config(layer) 
- self.kernel = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), - TritonExperts(self.moe_quant_config), - shared_experts=None, - ) + if self.rocm_aiter_moe_enabled: + shuffled_w13, shuffled_w2 = rocm_aiter_ops.shuffle_weights( + layer.w13_weight.data, layer.w2_weight.data + ) + replace_parameter(layer, "w13_weight", shuffled_w13) + replace_parameter(layer, "w2_weight", shuffled_w2) + + self.use_inplace = True + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + AiterExperts(self.moe_quant_config), + shared_experts=None, + ) + + elif self.flashinfer_cutlass_moe_enabled: + self.use_inplace = False + # Swap halves to arrange as [w3; w1] (kernel expectation) + w13_weight = swap_w13_to_w31(layer.w13_weight.data) + replace_parameter(layer, "w13_weight", w13_weight) + + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + FlashInferExperts( + out_dtype=layer.params_dtype, + quant_config=self.moe_quant_config, + tp_rank=self.moe.moe_parallel_config.tp_rank, + tp_size=self.moe.moe_parallel_config.tp_size, + ep_rank=self.moe.moe_parallel_config.ep_rank, + ep_size=self.moe.moe_parallel_config.ep_size, + ), + ) + else: + self.use_inplace = True + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + TritonExperts(self.moe_quant_config), + shared_experts=None, + ) def apply( self, @@ -309,40 +318,18 @@ def forward_cuda( router_logits=router_logits, ) - if self.rocm_aiter_moe_enabled: - result = self.rocm_aiter_fused_experts( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - expert_map=layer.expert_map, - activation=layer.activation, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - ) - elif self.flashinfer_cutlass_moe_enabled: - return self.flashinfer_cutlass_moe( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - activation=layer.activation, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - ) - else: - result = self.kernel( - hidden_states=x, - w1=layer.w13_weight, - w2=layer.w2_weight, - topk_weights=topk_weights, - topk_ids=topk_ids, - inplace=True, - activation=layer.activation, - apply_router_weight_on_input=layer.apply_router_weight_on_input, - global_num_experts=layer.global_num_experts, - expert_map=layer.expert_map, - ) + result = self.kernel( + hidden_states=x, + w1=layer.w13_weight, + w2=layer.w2_weight, + topk_weights=topk_weights, + topk_ids=topk_ids, + inplace=self.use_inplace, + activation=layer.activation, + apply_router_weight_on_input=layer.apply_router_weight_on_input, + global_num_experts=layer.global_num_experts, + expert_map=layer.expert_map, + ) return result diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index 402f0bf69cea..ebdc05449a86 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -80,6 +80,14 @@ def adjust_marlin_shard(param, shard_size, shard_offset): return shard_size * marlin_tile_size, shard_offset * marlin_tile_size +def adjust_block_scale_shard(weight_block_size, shard_size, shard_offset): + assert weight_block_size is not None + block_n = weight_block_size[0] + shard_offset = (shard_offset + block_n - 1) // block_n + shard_size = (shard_size + block_n - 1) // block_n + return shard_size, shard_offset + + def adjust_bitsandbytes_4bit_shard( param: Parameter, shard_offsets: dict[str, tuple[int, int]], loaded_shard_id: str ) -> tuple[int, int]: @@ 
-763,8 +771,18 @@ def weight_loader( assert loaded_shard_id < len(self.output_sizes) if output_dim is not None: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size - shard_size = self.output_sizes[loaded_shard_id] // self.tp_size + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + + if isinstance(param, BlockQuantScaleParameter): + weight_block_size = getattr(self, "weight_block_size", None) + shard_size, shard_offset = adjust_block_scale_shard( + weight_block_size, shard_size, shard_offset + ) + + shard_offset //= self.tp_size + shard_size //= self.tp_size + # Special case for quantization. # If quantized, we need to adjust the offset and size to account # for the packing. @@ -867,24 +885,17 @@ def weight_loader_v2( assert loaded_shard_id < len(self.output_sizes) + shard_offset = sum(self.output_sizes[:loaded_shard_id]) + shard_size = self.output_sizes[loaded_shard_id] + if isinstance(param, BlockQuantScaleParameter): - assert self.quant_method is not None - # Assume the weight block size has been set by quant method - assert hasattr(self, "weight_block_size") - weight_block_size = self.weight_block_size - assert weight_block_size is not None - block_n, _ = weight_block_size[0], weight_block_size[1] - shard_offset = ( - (sum(self.output_sizes[:loaded_shard_id]) + block_n - 1) // block_n - ) // self.tp_size - shard_size = ( - (self.output_sizes[loaded_shard_id] + block_n - 1) - // block_n - // self.tp_size + weight_block_size = getattr(self, "weight_block_size", None) + shard_size, shard_offset = adjust_block_scale_shard( + weight_block_size, shard_size, shard_offset ) - else: - shard_offset = sum(self.output_sizes[:loaded_shard_id]) // self.tp_size - shard_size = self.output_sizes[loaded_shard_id] // self.tp_size + + shard_offset //= self.tp_size + shard_size //= self.tp_size param.load_merged_column_weight( loaded_weight=loaded_weight, @@ -1066,16 +1077,11 @@ def weight_loader_v2( shard_offset = self._get_shard_offset_mapping(loaded_shard_id) shard_size = self._get_shard_size_mapping(loaded_shard_id) - # Note(simon): This is needed for Qwen3's fp8 quantization. if isinstance(param, BlockQuantScaleParameter): - assert self.quant_method is not None - # Assume the weight block size has been set by quant method - assert hasattr(self, "weight_block_size") - weight_block_size = self.weight_block_size - assert weight_block_size is not None - block_n, _ = weight_block_size[0], weight_block_size[1] - shard_offset = (shard_offset + block_n - 1) // block_n - shard_size = (shard_size + block_n - 1) // block_n + weight_block_size = getattr(self, "weight_block_size", None) + shard_size, shard_offset = adjust_block_scale_shard( + weight_block_size, shard_size, shard_offset + ) param.load_qkv_weight( loaded_weight=loaded_weight, @@ -1208,6 +1214,13 @@ def weight_loader( elif loaded_shard_id == "v": shard_offset = (self.num_heads + self.num_kv_heads) * self.head_size shard_size = self.num_kv_heads * self.v_head_size + + if isinstance(param, BlockQuantScaleParameter): + weight_block_size = getattr(self, "weight_block_size", None) + shard_size, shard_offset = adjust_block_scale_shard( + weight_block_size, shard_size, shard_offset + ) + # Special case for Quantized Weights. # If quantized, we need to adjust the offset and size to account # for the packing. 
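For reference, the shard arithmetic that the new adjust_block_scale_shard helper consolidates reduces to a ceiling division by the block size before the usual tensor-parallel split. The sketch below restates it as standalone Python; the concrete sizes (two 4096-wide merged outputs, 128x128 weight blocks, tp_size=2) are made up purely for illustration.

def adjust_block_scale_shard(weight_block_size, shard_size, shard_offset):
    # Block-quant scale tensors carry one value per block_n output rows, so
    # shard boundaries are converted to block units with ceiling division
    # before they are sliced across tensor-parallel ranks.
    assert weight_block_size is not None
    block_n = weight_block_size[0]
    shard_offset = (shard_offset + block_n - 1) // block_n
    shard_size = (shard_size + block_n - 1) // block_n
    return shard_size, shard_offset


# Illustrative numbers only: a merged column-parallel layer with two
# 4096-wide outputs, 128x128 block scales, and tp_size=2.
output_sizes, weight_block_size, tp_size = [4096, 4096], [128, 128], 2
shard_offset = sum(output_sizes[:1])   # 4096 rows into the merged weight
shard_size = output_sizes[1]           # 4096 rows in this shard
shard_size, shard_offset = adjust_block_scale_shard(
    weight_block_size, shard_size, shard_offset
)                                      # -> 32, 32 (block units)
shard_size, shard_offset = shard_size // tp_size, shard_offset // tp_size
assert (shard_size, shard_offset) == (16, 16)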
diff --git a/vllm/model_executor/layers/mamba/mamba_mixer.py b/vllm/model_executor/layers/mamba/mamba_mixer.py index 0b63acf2dc5a..a8d412784efa 100644 --- a/vllm/model_executor/layers/mamba/mamba_mixer.py +++ b/vllm/model_executor/layers/mamba/mamba_mixer.py @@ -82,6 +82,7 @@ def __init__( input_size=conv_kernel_size, output_size=intermediate_size, bias=use_conv_bias, + prefix=f"{prefix}.conv1d", ) # unsqueeze to fit conv1d weights shape into the linear weights shape. # Can't do this in `weight_loader` since it already exists in @@ -90,7 +91,10 @@ def __init__( self.conv1d.weight.data = self.conv1d.weight.data.unsqueeze(1) self.in_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, bias=use_bias + hidden_size, + [intermediate_size] * 2, + bias=use_bias, + prefix=f"{prefix}.in_proj", ) # selective projection used to make dt, B and C input dependent @@ -98,12 +102,17 @@ def __init__( intermediate_size, time_step_rank + ssm_state_size * 2, bias=False, + prefix=f"{prefix}.x_proj", ) # time step projection (discretization) - # In the forward we need to apply dt_proj without the bias, # as the bias is added in the selective scan kernel. self.dt_proj = ColumnParallelLinear( - time_step_rank, intermediate_size, bias=True, skip_bias_add=True + time_step_rank, + intermediate_size, + bias=True, + skip_bias_add=True, + prefix=f"{prefix}.dt_proj", ) def weight_loader(param: Parameter, loaded_weight: torch.Tensor): @@ -136,6 +145,7 @@ def A_weight_loader(param: Parameter, loaded_weight: torch.Tensor): hidden_size, bias=use_bias, input_is_parallel=True, + prefix=f"{prefix}.out_proj", ) self.dt_layernorm = ( diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py index 3f1b4d883b79..f8b29041ee2b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_wNa16.py @@ -114,7 +114,7 @@ def create_weights( logger.info("Using %s for CompressedTensorsWNA16", kernel_type.__name__) self._kernel_backends_being_used.add(kernel_type.__name__) - if isinstance(kernel_type, MarlinLinearKernel): + if kernel_type is MarlinLinearKernel: input_dtype = get_marlin_input_dtype(self.layer_name) if input_dtype is not None: mp_linear_kernel_config.act_type = input_dtype diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 08e1f4d444ee..1223c6902e5f 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -50,7 +50,8 @@ apply_flashinfer_per_tensor_scale_fp8, build_flashinfer_fp8_cutlass_moe_prepare_finalize, get_flashinfer_moe_backend, - register_moe_scaling_factors, + make_fp8_moe_alpha_scales_for_fi, + register_scales_for_trtllm_fp8_per_tensor_moe, rotate_flashinfer_fp8_moe_weights, select_cutlass_fp8_gemm_impl, swap_w13_to_w31, @@ -150,7 +151,7 @@ def get_fp8_moe_backend( if block_quant and current_platform.is_device_capability_family(100): raise ValueError( "FlashInfer FP8 MoE throughput backend does not " - "support block quantization. Please use " + "support block quantization on SM100. Please use " "VLLM_FLASHINFER_MOE_BACKEND=latency " "instead." 
) @@ -180,7 +181,19 @@ def get_fp8_moe_backend( scope="local", ) - if envs.VLLM_USE_DEEP_GEMM and moe_use_deep_gemm and block_quant: + # Determine if we should use DeepGEMM (top-level enable switch) + # - If explicitly set by user, respect their choice + # - If not platform supports DeepGEMM, disable it + # This helps avoid warning messages on unsupported platforms. + use_deep_gemm = envs.VLLM_USE_DEEP_GEMM + if not is_deep_gemm_supported(): + use_deep_gemm = False + logger.info_once( + "DeepGEMM is disabled because the platform does not support it.", + scope="local", + ) + + if use_deep_gemm and moe_use_deep_gemm and block_quant: if not has_deep_gemm(): logger.warning_once( "DeepGEMM backend requested but not available.", scope="local" @@ -762,6 +775,14 @@ def __init__(self, quant_config: Fp8Config, layer: torch.nn.Module): "FlashInfer CUTLASS FP8 MoE backend only supports SiLU " "activation function, but got {layer.activation}." ) + dynamic_per_token = ( + not self.block_quant and self.quant_config.activation_scheme != "static" + ) + if self.flashinfer_moe_backend is not None and dynamic_per_token: + raise NotImplementedError( + "FlashInfer FP8 MoE backend does not support dynamic per token " + "activation quantization." + ) def create_weights( self, @@ -893,6 +914,8 @@ def _convert_weights_to_kernel_format( w2_weight: torch.Tensor, w13_weight_scale: torch.Tensor, w2_weight_scale: torch.Tensor, + w13_input_scale: torch.Tensor | None, + w2_input_scale: torch.Tensor | None, ) -> None: if self.fp8_backend == Fp8MoeBackend.DEEPGEMM: assert self.block_quant @@ -937,11 +960,16 @@ def _convert_weights_to_kernel_format( if self.block_quant: w13_weight_scale = swap_w13_to_w31(w13_weight_scale) else: - # TODO(rob): this function is a hack that renames the scaling - # factors in the Module. This is a hack we should clean up. - register_moe_scaling_factors(layer) if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM: rotate_flashinfer_fp8_moe_weights(w13_weight, w2_weight) + register_scales_for_trtllm_fp8_per_tensor_moe( + layer=layer, + w13_weight_scale=w13_weight, + w13_input_scale=w13_input_scale, + w2_weight_scale=w2_weight, + w2_input_scale=w2_input_scale, + ) + elif self.fp8_backend == Fp8MoeBackend.AITER: w13_weight, w2_weight = rocm_aiter_ops.shuffle_weights( w13_weight, w2_weight @@ -961,27 +989,37 @@ def _setup_kernel(self, layer: Module) -> None: # done, then we will initialzie the TP case and DP/EP case # via the same code path (i.e. via maybe_init_modular_kernel). # NOTE(rob): in progress migrating all into this format. - if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( - FlashInferExperts, - ) - from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_prepare_finalize import ( # noqa: E501 - FlashInferAllGatherMoEPrepareAndFinalize, - ) - config = self.get_fused_moe_quant_config(layer) - assert config is not None - self.moe_quant_config = config + from vllm.model_executor.layers.fused_moe import ( + TritonOrDeepGemmExperts, + ) + from vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe import ( + FlashInferExperts, + ) + from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( + MarlinExperts, + ) + from vllm.model_executor.layers.fused_moe.prepare_finalize import ( + MoEPrepareAndFinalizeNoEP, + ) + from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( + AiterExperts, + ) + + # Flashinfer TRTLLM does not use the modular kernel abstraction. 
+ if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM: + return + self.moe_quant_config = self.get_fused_moe_quant_config(layer) + assert self.moe_quant_config is not None + self.use_inplace = True + + if self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: self.kernel = mk.FusedMoEModularKernel( - # TODO(rob): we can use the generic MoEPrepareAndFinalizeNoEP - # with the changes to defer input quantization - FlashInferAllGatherMoEPrepareAndFinalize( - use_dp=(self.moe.dp_size > 1), - use_deepseek_fp8_block_scale=self.block_quant, - ), + # TODO: make defer_input_quant an attr of the FlashInferExperts + MoEPrepareAndFinalizeNoEP(defer_input_quant=self.block_quant), FlashInferExperts( - out_dtype=torch.get_default_dtype(), + out_dtype=layer.orig_dtype, quant_config=self.moe_quant_config, ep_rank=self.moe.ep_rank, ep_size=self.moe.ep_size, @@ -993,50 +1031,26 @@ def _setup_kernel(self, layer: Module) -> None: ) self.use_inplace = False - elif self.fp8_backend in [ - Fp8MoeBackend.DEEPGEMM, - Fp8MoeBackend.TRITON, - Fp8MoeBackend.MARLIN, - Fp8MoeBackend.AITER, - ]: - from vllm.model_executor.layers.fused_moe import ( - TritonOrDeepGemmExperts, - ) - from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( - MarlinExperts, + elif self.fp8_backend == Fp8MoeBackend.AITER: + self.kernel = mk.FusedMoEModularKernel( + # TODO: make defer_input_quant an attr of the AiterExperts + MoEPrepareAndFinalizeNoEP(defer_input_quant=True), + AiterExperts(quant_config=self.moe_quant_config), ) - from vllm.model_executor.layers.fused_moe.prepare_finalize import ( - MoEPrepareAndFinalizeNoEP, + elif self.fp8_backend == Fp8MoeBackend.MARLIN: + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + MarlinExperts(quant_config=self.moe_quant_config), ) - from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import ( - AiterExperts, + else: + self.kernel = mk.FusedMoEModularKernel( + MoEPrepareAndFinalizeNoEP(), + TritonOrDeepGemmExperts( + quant_config=self.moe_quant_config, + allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM), + ), ) - config = self.get_fused_moe_quant_config(layer) - assert config is not None - self.moe_quant_config = config - - if self.fp8_backend == Fp8MoeBackend.AITER: - self.kernel = mk.FusedMoEModularKernel( - # TODO: make defer_input_quant an attr of the AiterExperts - MoEPrepareAndFinalizeNoEP(defer_input_quant=True), - AiterExperts(quant_config=self.moe_quant_config), - ) - elif self.fp8_backend == Fp8MoeBackend.MARLIN: - self.kernel = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), - MarlinExperts(quant_config=self.moe_quant_config), - ) - else: - self.kernel = mk.FusedMoEModularKernel( - MoEPrepareAndFinalizeNoEP(), - TritonOrDeepGemmExperts( - quant_config=self.moe_quant_config, - allow_deep_gemm=(self.fp8_backend == Fp8MoeBackend.DEEPGEMM), - ), - ) - self.use_inplace = True - def process_weights_after_loading(self, layer: Module) -> None: if getattr(layer, "_already_called_process_weights_after_loading", False): return @@ -1093,7 +1107,13 @@ def process_weights_after_loading(self, layer: Module) -> None: # Shuffle weights into the runtime format. self._convert_weights_to_kernel_format( - layer, w13_weight, w2_weight, w13_weight_scale, w2_weight_scale + layer=layer, + w13_weight=w13_weight, + w2_weight=w2_weight, + w13_weight_scale=w13_weight_scale, + w2_weight_scale=w2_weight_scale, + w13_input_scale=w13_input_scale, + w2_input_scale=w2_input_scale, ) # Setup modular kernel for TP case. 
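A condensed view of the backend dispatch that _setup_kernel now performs, with the real vLLM classes (Fp8MoeBackend, MoEPrepareAndFinalizeNoEP, the *Experts implementations) replaced by a stand-in enum and plain tuples; only the selection logic is intended to mirror the code above.

from enum import Enum, auto


class Backend(Enum):
    FLASHINFER_TRTLLM = auto()
    FLASHINFER_CUTLASS = auto()
    AITER = auto()
    MARLIN = auto()
    DEEPGEMM = auto()
    TRITON = auto()


def select_fp8_moe_kernel(backend: Backend, block_quant: bool):
    """Return (experts_impl, defer_input_quant, use_inplace), or None for
    TRTLLM, which bypasses the modular-kernel abstraction entirely."""
    if backend is Backend.FLASHINFER_TRTLLM:
        return None
    if backend is Backend.FLASHINFER_CUTLASS:
        # input quantization is deferred only for block-scaled weights
        return "FlashInferExperts", block_quant, False
    if backend is Backend.AITER:
        return "AiterExperts", True, True
    if backend is Backend.MARLIN:
        return "MarlinExperts", False, True
    # DEEPGEMM and TRITON share one implementation; allow_deep_gemm toggles it
    return "TritonOrDeepGemmExperts", False, True


assert select_fp8_moe_kernel(Backend.FLASHINFER_TRTLLM, False) is None
assert select_fp8_moe_kernel(Backend.AITER, False) == ("AiterExperts", True, True)
assert select_fp8_moe_kernel(Backend.FLASHINFER_CUTLASS, True)[2] is False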
@@ -1109,21 +1129,14 @@ def maybe_make_prepare_finalize( or self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM ): return None - elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: - if self.block_quant: - assert self.weight_block_size == [128, 128], ( - f"Only support weight_block_size == [128, 128], " - f"got {self.weight_block_size}" - ) - # Wire block-scale flag through prepare/finalize when using CUTLASS + elif self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS: prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize( self.moe, use_deepseek_fp8_block_scale=self.block_quant, ) logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize - else: - return super().maybe_make_prepare_finalize(routing_tables) + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, @@ -1195,6 +1208,11 @@ def select_gemm_impl( def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: + # TRTLLM does not use Modular Kernel. + if self.fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM: + return None + + # MARLIN uses mixed precision W8A16 config. if self.fp8_backend == Fp8MoeBackend.MARLIN: return fp8_w8a16_moe_quant_config( w1_scale=getattr(layer, f"w13_{self.weight_scale_name}"), @@ -1202,11 +1220,38 @@ def get_fused_moe_quant_config( block_shape=self.weight_block_size, ) + w1_scale = getattr(layer, f"w13_{self.weight_scale_name}") + w2_scale = getattr(layer, f"w2_{self.weight_scale_name}") + a1_scale = layer.w13_input_scale + a2_scale = layer.w2_input_scale + + # Flashinfer CUTLASS per-tensor uses single dq scale + # (alpha = w_scale * a_scale) and inverse a2 scale. + if ( + self.fp8_backend == Fp8MoeBackend.FLASHINFER_CUTLASS + and not self.block_quant + ): + g1_alphas, g2_alphas = make_fp8_moe_alpha_scales_for_fi( + w1_scale, + a1_scale, + w2_scale, + a2_scale, + ) + return fp8_w8a8_moe_quant_config( + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=(1.0 / a2_scale), + g1_alphas=g1_alphas, + g2_alphas=g2_alphas, + ) + + # All other backends use normal config. return fp8_w8a8_moe_quant_config( - w1_scale=getattr(layer, f"w13_{self.weight_scale_name}"), - w2_scale=getattr(layer, f"w2_{self.weight_scale_name}"), - a1_scale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, + w1_scale=w1_scale, + w2_scale=w2_scale, + a1_scale=a1_scale, + a2_scale=a2_scale, block_shape=self.weight_block_size, ) @@ -1427,7 +1472,13 @@ def process_weights_after_loading(self, layer: Module) -> None: # Shuffle weights into the runtime format. self._convert_weights_to_kernel_format( - layer, w13_weight, w2_weight, layer.w13_weight_scale, layer.w2_weight_scale + layer=layer, + w13_weight=w13_weight, + w2_weight=w2_weight, + w13_weight_scale=layer.w13_weight_scale, + w2_weight_scale=layer.w2_weight_scale, + w13_input_scale=None, + w2_input_scale=None, ) # Setup modular kernel for TP case. 
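The scale plumbing in get_fused_moe_quant_config and register_scales_for_trtllm_fp8_per_tensor_moe boils down to a small amount of per-tensor arithmetic. The sketch below spells it out with throwaway tensors (the expert count and scale values are arbitrary); the formulas follow the helpers introduced in this diff.

import torch


def make_fp8_moe_alpha_scales(w13_scale, w13_input_scale, w2_scale, w2_input_scale):
    # dequant alpha = weight scale * activation scale, one scalar per expert
    g1_alphas = (w13_scale * w13_input_scale).squeeze()
    g2_alphas = (w2_scale * w2_input_scale).squeeze()
    return g1_alphas, g2_alphas


num_experts = 4
w13_scale = torch.rand(num_experts, 1)
w2_scale = torch.rand(num_experts, 1)
w13_input_scale = torch.tensor(0.5)
w2_input_scale = torch.tensor(0.25)

g1_alphas, g2_alphas = make_fp8_moe_alpha_scales(
    w13_scale, w13_input_scale, w2_scale, w2_input_scale
)

# Scalars registered for the TRTLLM per-tensor kernel: the first GEMM's
# dequant alpha is folded together with the quant step for the second
# GEMM's input (1 / w2_input_scale).
w2_input_scale_inv = 1.0 / w2_input_scale
output1_scales_gate_scalar = g1_alphas
output1_scales_scalar = g1_alphas * w2_input_scale_inv
output2_scales_scalar = g2_alphas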
diff --git a/vllm/model_executor/layers/quantization/modelopt.py b/vllm/model_executor/layers/quantization/modelopt.py index afbefe1fedc1..b6752d7f9913 100644 --- a/vllm/model_executor/layers/quantization/modelopt.py +++ b/vllm/model_executor/layers/quantization/modelopt.py @@ -50,7 +50,8 @@ flashinfer_cutlass_moe_fp8, get_flashinfer_moe_backend, is_flashinfer_supporting_global_sf, - register_moe_scaling_factors, + make_fp8_moe_alpha_scales_for_fi, + register_scales_for_trtllm_fp8_per_tensor_moe, rotate_flashinfer_fp8_moe_weights, select_cutlass_fp8_gemm_impl, swap_w13_to_w31, @@ -752,13 +753,17 @@ def maybe_make_prepare_finalize( if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: return None elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + # TP case: avoid convert to ModularKernelMethod - to be refactored. + if self.moe.dp_size == 1: + return None + prepare_finalize = build_flashinfer_fp8_cutlass_moe_prepare_finalize( - self.moe + self.moe, + use_deepseek_fp8_block_scale=False, ) logger.debug_once("%s", prepare_finalize.__class__.__name__) return prepare_finalize - else: - return super().maybe_make_prepare_finalize(routing_tables) + return super().maybe_make_prepare_finalize(routing_tables) def select_gemm_impl( self, @@ -943,9 +948,18 @@ def process_weights_after_loading(self, layer: torch.nn.Module) -> None: if self.flashinfer_moe_backend is not None: if self.moe.is_act_and_mul: layer.w13_weight.data = swap_w13_to_w31(layer.w13_weight.data) + + # NOTE: this adds some attributes used by the trtllm kernel, + # which does not conform to the modular kernels abstraction (yet). if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: rotate_flashinfer_fp8_moe_weights(layer.w13_weight, layer.w2_weight) - register_moe_scaling_factors(layer) + register_scales_for_trtllm_fp8_per_tensor_moe( + layer=layer, + w13_weight_scale=layer.w13_weight_scale, + w13_input_scale=layer.w13_input_scale, + w2_weight_scale=layer.w2_weight_scale, + w2_input_scale=layer.w2_input_scale, + ) def _maybe_pad_intermediate_for_flashinfer(self, layer: torch.nn.Module) -> None: """Pad intermediate size so FlashInfer kernels' alignment constraints hold. 
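For context on swap_w13_to_w31 used above: the fused gate/up projection is stored as [w1; w3] along the output dimension, while the FlashInfer kernels expect [w3; w1]. A plain-torch restatement of the swap, mirroring the inlined chunk/cat version this diff removes elsewhere; the shapes are illustrative only.

import torch


def swap_w13_to_w31_ref(w13: torch.Tensor) -> torch.Tensor:
    # w13: [num_experts, 2 * intermediate_size, hidden_size]
    w1, w3 = torch.chunk(w13, 2, dim=1)
    return torch.cat([w3, w1], dim=1).contiguous()


num_experts, intermediate_size, hidden_size = 2, 8, 4
w13 = torch.arange(
    num_experts * 2 * intermediate_size * hidden_size, dtype=torch.float32
).view(num_experts, 2 * intermediate_size, hidden_size)

swapped = swap_w13_to_w31_ref(w13)
assert torch.equal(swapped[:, :intermediate_size], w13[:, intermediate_size:])
assert torch.equal(swapped[:, intermediate_size:], w13[:, :intermediate_size])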
@@ -995,19 +1009,34 @@ def get_fused_moe_quant_config( self, layer: torch.nn.Module ) -> FusedMoEQuantConfig | None: if self.flashinfer_moe_backend == FlashinferMoeBackend.TENSORRT_LLM: + # TRTLLM does not use modular kernels return None - return fp8_w8a8_moe_quant_config( - w1_scale=layer.w13_weight_scale, - g1_alphas=layer.output1_scales_gate_scalar.squeeze(), - w2_scale=layer.w2_weight_scale, - g2_alphas=layer.output2_scales_scalar.squeeze(), - a1_scale=layer.w13_input_scale, - a1_gscale=layer.w13_input_scale, - a2_scale=layer.w2_input_scale, - a2_gscale=layer.w2_input_scale_inv, - per_act_token_quant=False, - ) + elif self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS: + g1_alphas, g2_alphas = make_fp8_moe_alpha_scales_for_fi( + layer.w13_weight_scale, + layer.w13_input_scale, + layer.w2_weight_scale, + layer.w2_input_scale, + ) + return fp8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + a1_gscale=(1.0 / layer.w13_input_scale), + a2_gscale=(1.0 / layer.w2_input_scale), + g1_alphas=g1_alphas, + g2_alphas=g2_alphas, + ) + else: + assert self.flashinfer_moe_backend is None + return fp8_w8a8_moe_quant_config( + w1_scale=layer.w13_weight_scale, + w2_scale=layer.w2_weight_scale, + a1_scale=layer.w13_input_scale, + a2_scale=layer.w2_input_scale, + ) def apply( self, @@ -1452,6 +1481,9 @@ def maybe_make_prepare_finalize( self.allow_flashinfer and self.flashinfer_moe_backend == FlashinferMoeBackend.CUTLASS ): + # TP case: avoid convert to ModularKernelMethod - to be refactored. + if self.moe.dp_size == 1: + return None # For now, fp4 moe only works with the flashinfer dispatcher. prepare_finalize = build_flashinfer_fp4_cutlass_moe_prepare_finalize( self.moe diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py index dc0fbfa7df35..4fabb426b721 100644 --- a/vllm/model_executor/layers/quantization/mxfp4.py +++ b/vllm/model_executor/layers/quantization/mxfp4.py @@ -240,7 +240,6 @@ def __init__(self, moe: FusedMoEConfig): self.mxfp4_backend = get_mxfp4_backend(moe.is_lora_enabled) self.marlin_input_dtype = None - self.use_marlin = self.mxfp4_backend == Mxfp4Backend.MARLIN self.max_capture_size = ( get_current_vllm_config().compilation_config.max_cudagraph_capture_size ) diff --git a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py index 3d6e9cda8766..b73c44b3130d 100644 --- a/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py +++ b/vllm/model_executor/layers/quantization/utils/flashinfer_utils.py @@ -103,6 +103,26 @@ def rotate_flashinfer_fp8_moe_weights( ) +def register_scales_for_trtllm_fp8_per_tensor_moe( + layer: torch.nn.Module, + w13_weight_scale: torch.Tensor, + w13_input_scale: torch.Tensor, + w2_weight_scale: torch.Tensor, + w2_input_scale: torch.Tensor, +) -> None: + """Register necessary scales for FlashInfer TRTLLM FP8 MoE kernel""" + g1_alphas, g2_alphas = make_fp8_moe_alpha_scales_for_fi( + w13_scale=w13_weight_scale, + w13_input_scale=w13_input_scale, + w2_scale=w2_weight_scale, + w2_input_scale=w2_input_scale, + ) + layer.w2_input_scale_inv = 1.0 / w2_input_scale + layer.output1_scales_gate_scalar = g1_alphas + layer.output1_scales_scalar = g1_alphas * layer.w2_input_scale_inv + layer.output2_scales_scalar = g2_alphas + + def apply_flashinfer_per_tensor_scale_fp8( layer: torch.nn.Module, hidden_states: 
torch.Tensor, @@ -117,18 +137,13 @@ def apply_flashinfer_per_tensor_scale_fp8( from flashinfer.fused_moe import RoutingMethodType import vllm.model_executor.layers.fused_moe.flashinfer_trtllm_moe # noqa: E501, F401 + from vllm.model_executor.models.llama4 import Llama4MoE - assert layer.output1_scales_scalar is not None, ( - "Expected output1_scales_scalar to be initialized" - ) - assert layer.output1_scales_scalar is not None, ( - "Expected output1_scales_gate_scalar to be initialized" + assert ( + hasattr(layer, "output1_scales_scalar") + and hasattr(layer, "output1_scales_gate_scalar") + and hasattr(layer, "output2_scales_scalar") ) - assert layer.output1_scales_scalar is not None, ( - "Expected output2_scales_scalar to be initialized" - ) - - from vllm.model_executor.models.llama4 import Llama4MoE assert layer.custom_routing_function == Llama4MoE.custom_routing_function, ( "FusedMoE flashinfer kernels are only supported for Llama4" @@ -155,40 +170,16 @@ def apply_flashinfer_per_tensor_scale_fp8( ) -def get_moe_scaling_factors( - input_scale: torch.Tensor, - gemm1_weights_scale: torch.Tensor, - activation_scale: torch.Tensor, - gemm2_weights_scale: torch.Tensor, -) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: - output1_scales_scalar = gemm1_weights_scale * input_scale * (1.0 / activation_scale) - output1_scales_gate_scalar = gemm1_weights_scale * input_scale - output2_scales_scalar = activation_scale * gemm2_weights_scale - - return output1_scales_scalar, output1_scales_gate_scalar, output2_scales_scalar +def make_fp8_moe_alpha_scales_for_fi( + w13_scale: torch.Tensor, + w13_input_scale: torch.Tensor, + w2_scale: torch.Tensor, + w2_input_scale: torch.Tensor, +) -> tuple[torch.Tensor, torch.Tensor]: + g1_alphas = (w13_scale * w13_input_scale).squeeze() + g2_alphas = (w2_scale * w2_input_scale).squeeze() - -def register_moe_scaling_factors(layer: torch.nn.Module) -> None: - output1_scales, output1_gate_scales, output2_scales = get_moe_scaling_factors( - layer.w13_input_scale, - layer.w13_weight_scale, - layer.w2_input_scale, - layer.w2_weight_scale, - ) - layer.register_parameter( - "output1_scales_scalar", torch.nn.Parameter(output1_scales, requires_grad=False) - ) - layer.register_parameter( - "output1_scales_gate_scalar", - torch.nn.Parameter(output1_gate_scales, requires_grad=False), - ) - layer.register_parameter( - "output2_scales_scalar", torch.nn.Parameter(output2_scales, requires_grad=False) - ) - layer.register_parameter( - "w2_input_scale_inv", - torch.nn.Parameter(1.0 / layer.w2_input_scale, requires_grad=False), - ) + return g1_alphas, g2_alphas def build_flashinfer_fp8_cutlass_moe_prepare_finalize( diff --git a/vllm/model_executor/layers/rotary_embedding/base.py b/vllm/model_executor/layers/rotary_embedding/base.py index 7e83ea9a1355..bd82728ed15f 100644 --- a/vllm/model_executor/layers/rotary_embedding/base.py +++ b/vllm/model_executor/layers/rotary_embedding/base.py @@ -250,6 +250,28 @@ def forward_xpu( ) return query, key + def forward_cpu( + self, + positions: torch.Tensor, + query: torch.Tensor, + key: torch.Tensor | None = None, + ) -> tuple[torch.Tensor, torch.Tensor | None]: + from vllm import _custom_ops as ops + + self._match_cos_sin_cache_dtype(query) + + # ops.rotary_embedding() is an in-place operation + # that updates the query and key tensors. 
+ ops.rotary_embedding( + positions, + query, + key, + self.head_size, + self.cos_sin_cache, + self.is_neox_style, + ) + return query, key + def extra_repr(self) -> str: s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index 0c5961561a7d..d55b61f27e7a 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -681,8 +681,8 @@ def safetensors_weights_iterator( # instead we reconstruct the subclasses here before returning if not torchao_version_at_least("0.15.0"): raise ValueError( - "Please use torchao version >= 0.15.0 \ - to load torchao safetensors checkpoint" + "Please use torchao version >= 0.15.0 " + "to load torchao safetensors checkpoint" ) from torchao.prototype.safetensors.safetensors_support import ( unflatten_tensor_state_dict, diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index acf1e57a59a9..43303aa76efb 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -382,9 +382,9 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: else: text_config.num_labels = len(tokens) - # `llm as reranker` defaults to not using pad_token - use_pad_token = getattr(text_config, "use_pad_token", False) - text_config.use_pad_token = use_pad_token + # `llm as reranker` defaults to not using separating token. + use_sep_token = getattr(text_config, "use_sep_token", False) + text_config.use_sep_token = use_sep_token def load_weights_using_from_2_way_softmax( diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index c6d7f19cbe90..c7f44762f393 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -127,11 +127,16 @@ def __init__( in_features: int, hidden_features: int, output_dim: int, + prefix: str = "", ) -> None: super().__init__() - self.linear_in = ColumnParallelLinear(in_features, hidden_features, bias=False) - self.linear_out = RowParallelLinear(hidden_features, output_dim, bias=False) + self.linear_in = ColumnParallelLinear( + in_features, hidden_features, bias=False, prefix=f"{prefix}.linear_in" + ) + self.linear_out = RowParallelLinear( + hidden_features, output_dim, bias=False, prefix=f"{prefix}.linear_out" + ) self.act = get_act_fn("gelu_new") def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -154,7 +159,7 @@ class AriaProjector(nn.Module): A tensor with the shape of (batch_size, query_number, output_dim) """ - def __init__(self, config: AriaConfig) -> None: + def __init__(self, config: AriaConfig, prefix: str = "") -> None: super().__init__() self.patch_to_query_dict = config.projector_patch_to_query_dict @@ -174,7 +179,10 @@ def __init__(self, config: AriaConfig) -> None: self.layer_norm = nn.LayerNorm(self.in_features) self.feed_forward = AriaProjectorMLP( - self.in_features, self.hidden_features, self.output_dim + self.in_features, + self.hidden_features, + self.output_dim, + prefix=f"{prefix}.feed_forward", ) def forward( @@ -536,7 +544,9 @@ def __init__( quant_config=quant_config, prefix=f"{prefix}.vision_tower", ) - self.multi_modal_projector = AriaProjector(config) + self.multi_modal_projector = AriaProjector( + config, prefix=maybe_prefix(prefix, "multi_modal_projector") + ) self.vocab_size = config.text_config.vocab_size self.language_model = 
AriaTextModel( vllm_config=vllm_config.with_hf_config(config.text_config), diff --git a/vllm/model_executor/models/blip2.py b/vllm/model_executor/models/blip2.py index 1244f97a1bd6..2bd1dd1aef3b 100644 --- a/vllm/model_executor/models/blip2.py +++ b/vllm/model_executor/models/blip2.py @@ -35,13 +35,15 @@ from vllm.sequence import IntermediateTensors from vllm.utils.tensor_schema import TensorSchema, TensorShape -from .blip import BlipVisionModel +from .blip import BlipVisionModel, get_blip_num_patches from .interfaces import ( MultiModalEmbeddings, + SupportsLoRA, SupportsMultiModal, SupportsPP, SupportsQuant, ) +from .module_mapping import MultiModelKeys from .utils import AutoWeightsLoader, init_vllm_registered_model, maybe_prefix @@ -521,7 +523,7 @@ def _get_prompt_updates( dummy_inputs=Blip2DummyInputsBuilder, ) class Blip2ForConditionalGeneration( - nn.Module, SupportsMultiModal, SupportsPP, SupportsQuant + nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP, SupportsQuant ): @classmethod def get_placeholder_str(cls, modality: str, i: int) -> str | None: @@ -538,9 +540,17 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): multimodal_config = vllm_config.model_config.multimodal_config self.config = config self.multimodal_config = multimodal_config + vision_config = config.vision_config + self._vision_tokens_per_image = ( + get_blip_num_patches( + image_size=vision_config.image_size, + patch_size=vision_config.patch_size, + ) + + 1 # include class token + ) # TODO: Optionally initializes this for supporting embeddings. - self.vision_model = BlipVisionModel(config.vision_config, quant_config) + self.vision_model = BlipVisionModel(vision_config, quant_config) self.query_tokens = nn.Parameter( torch.zeros(1, config.num_query_tokens, config.qformer_config.hidden_size) @@ -691,3 +701,36 @@ def compute_logits( def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) return loader.load_weights(weights) + + def get_mm_mapping(self) -> MultiModelKeys: + return MultiModelKeys.from_string_field( + language_model="language_model", + connector=["qformer", "language_projection"], + tower_model="vision_model", + ) + + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + if num_image_tokens <= 0: + return 0 + assert num_image_tokens % self.config.num_query_tokens == 0, ( + "The number of image tokens must be a multiple of " + "the number of query tokens." + ) + num_images = num_image_tokens / self.config.num_query_tokens + return num_images * self._vision_tokens_per_image + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + if num_vision_tokens <= 0: + return 0 + assert num_vision_tokens % self._vision_tokens_per_image == 0, ( + "The number of vision tokens must be a multiple of " + "the number of tokens per image." 
+ ) + num_images = num_vision_tokens / self._vision_tokens_per_image + return num_images * self.config.num_query_tokens diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 10fd599f9e5f..362c194d8783 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -113,8 +113,8 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: class NomicBertModelConfig(VerifyAndUpdateConfig): @staticmethod - def verify_and_update_config(vllm_config: "VllmConfig") -> None: - config = vllm_config.model_config.hf_config + def verify_and_update_model_config(model_config: "ModelConfig") -> None: + config = model_config.hf_config assert config.__class__.__name__ == "NomicBertConfig" assert config.activation_function in ["swiglu", "gelu"] @@ -137,6 +137,10 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: config.intermediate_size = config.n_inner config.hidden_size = config.n_embd config.num_hidden_layers = config.n_layer + model_config.model_arch_config.hidden_size = config.hidden_size + model_config.model_arch_config.total_num_hidden_layers = ( + config.num_hidden_layers + ) head_dim = config.hidden_size // config.num_attention_heads max_trained_positions = getattr(config, "max_trained_positions", 2048) @@ -153,42 +157,43 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: # The context extension uses vllm style rope_theta and rope_parameters. # See #17785 #18755 if ( - not vllm_config.model_config.hf_overrides - and vllm_config.model_config.original_max_model_len is None + not model_config.hf_overrides + and model_config.original_max_model_len is None ): # Default # Reset max_model_len to max_trained_positions. # nomic-embed-text-v2-moe the length is set to 512 # by sentence_bert_config.json. - max_model_len_before = vllm_config.model_config.max_model_len - max_model_len = min( - vllm_config.model_config.max_model_len, max_trained_positions - ) + max_model_len_before = model_config.max_model_len + max_model_len = min(model_config.max_model_len, max_trained_positions) - vllm_config.recalculate_max_model_len(max_model_len) - logger.warning( - "Nomic context extension is disabled. " - "Changing max_model_len from %s to %s. " - "To enable context extension, see: " - "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html", - max_model_len_before, - vllm_config.model_config.max_model_len, + model_config.max_model_len = model_config.get_and_verify_max_len( + max_model_len ) + + if model_config.max_model_len != max_model_len_before: + logger.warning( + "Nomic context extension is disabled. " + "Changing max_model_len from %s to %s. " + "To enable context extension, see: " + "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/context_extension.html", + max_model_len_before, + model_config.max_model_len, + ) else: # We need to re-verify max_model_len to avoid lengths # greater than position_embedding. - model_config = vllm_config.model_config hf_text_config = model_config.hf_text_config if isinstance(model_config.hf_overrides, dict): # hf_overrides_kw max_model_len = model_config.hf_overrides.get( - "max_model_len", vllm_config.model_config.max_model_len + "max_model_len", model_config.max_model_len ) else: # hf_overrides_fn # This might be overridden by sentence_bert_config.json. 
- max_model_len = vllm_config.model_config.max_model_len + max_model_len = model_config.max_model_len # reset hf_text_config for recalculate_max_model_len. if hasattr(hf_text_config, "max_model_len"): @@ -196,13 +201,21 @@ def verify_and_update_config(vllm_config: "VllmConfig") -> None: hf_text_config.max_position_embeddings = max_trained_positions hf_text_config.rope_parameters = config.rotary_kwargs["rope_parameters"] + # Update the cached derived_max_model_len to enforce the limit + model_config.model_arch_config.derived_max_model_len_and_key = ( + float(max_trained_positions), + "max_position_embeddings", + ) + # The priority of sentence_bert_config.json is higher # than max_position_embeddings encoder_config = deepcopy(model_config.encoder_config) encoder_config.pop("max_seq_length", None) model_config.encoder_config = encoder_config - vllm_config.recalculate_max_model_len(max_model_len) + model_config.max_model_len = model_config.get_and_verify_max_len( + max_model_len + ) class Qwen2ForProcessRewardModelConfig(VerifyAndUpdateConfig): @@ -238,7 +251,7 @@ def verify_and_update_model_config(model_config: "ModelConfig") -> None: tokens = getattr(config, "classifier_from_token", None) assert tokens is not None and len(tokens) == 2, ( "Try loading the original Qwen3 Reranker?, see: " - "https://github.com/vllm-project/vllm/tree/main/examples/offline_inference/offline_reranker.py" + "https://github.com/vllm-project/vllm/tree/main/examples/pooling/score/qwen3_reranker_offline.py" ) model_config.hf_config.method = "from_2_way_softmax" diff --git a/vllm/model_executor/models/deepseek_ocr.py b/vllm/model_executor/models/deepseek_ocr.py index 1f07381c0cbd..146c673ddcb4 100644 --- a/vllm/model_executor/models/deepseek_ocr.py +++ b/vllm/model_executor/models/deepseek_ocr.py @@ -14,9 +14,11 @@ from vllm.config.multimodal import BaseDummyOptions from vllm.model_executor.models.interfaces import ( MultiModalEmbeddings, + SupportsLoRA, SupportsMultiModal, SupportsPP, ) +from vllm.model_executor.models.module_mapping import MultiModelKeys from vllm.model_executor.models.utils import ( AutoWeightsLoader, WeightsMapper, @@ -343,7 +345,7 @@ def get_replacement_deepseek_vl2(item_idx: int): info=DeepseekOCRProcessingInfo, dummy_inputs=DeepseekOCRDummyInputsBuilder, ) -class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP): +class DeepseekOCRForCausalLM(nn.Module, SupportsMultiModal, SupportsPP, SupportsLoRA): hf_to_vllm_mapper = WeightsMapper( orig_to_new_prefix={ # map prefix for language backbone @@ -589,3 +591,13 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self) autoloaded_weights = loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) return autoloaded_weights + + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="projector", + tower_model=["sam_model", "vision_model"], + ) diff --git a/vllm/model_executor/models/gpt_neox.py b/vllm/model_executor/models/gpt_neox.py index c4d11b488f38..d994e380dfef 100644 --- a/vllm/model_executor/models/gpt_neox.py +++ b/vllm/model_executor/models/gpt_neox.py @@ -166,7 +166,7 @@ def __init__( self.attention = GPTNeoXAttention( config, cache_config, quant_config, prefix=f"{prefix}.attention" ) - self.mlp = GPTNeoXMLP(config, quant_config) + self.mlp = GPTNeoXMLP(config, quant_config, prefix=f"{prefix}.mlp") def forward( self, diff --git 
a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index 0e82e84c4edb..b7132e66274a 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -427,6 +427,7 @@ def __init__( hidden_act=config.hidden_act, quant_config=quant_config, reduce_results=False, + prefix=f"{prefix}.shared_mlp", ) else: self.shared_mlp = None diff --git a/vllm/model_executor/models/jamba.py b/vllm/model_executor/models/jamba.py index b2ad12be1e35..946a9f6fcf57 100644 --- a/vllm/model_executor/models/jamba.py +++ b/vllm/model_executor/models/jamba.py @@ -78,6 +78,7 @@ def __init__( bias=False, quant_config=None, params_dtype=params_dtype, + prefix=f"{prefix}.router", ) self.experts = FusedMoE( diff --git a/vllm/model_executor/models/jina_vl.py b/vllm/model_executor/models/jina_vl.py index 8bba7b62882f..7be3d47781e3 100644 --- a/vllm/model_executor/models/jina_vl.py +++ b/vllm/model_executor/models/jina_vl.py @@ -27,15 +27,23 @@ class JinaVLScorer(nn.Module): - def __init__(self, model_config: "ModelConfig"): + def __init__(self, model_config: "ModelConfig", prefix: str = ""): super().__init__() config = model_config.hf_config.get_text_config() head_dtype = model_config.head_dtype self.dense = ColumnParallelLinear( - config.hidden_size, config.hidden_size, params_dtype=head_dtype, bias=True + config.hidden_size, + config.hidden_size, + params_dtype=head_dtype, + bias=True, + prefix=f"{prefix}.dense", ) self.out_proj = RowParallelLinear( - config.hidden_size, config.num_labels, params_dtype=head_dtype, bias=True + config.hidden_size, + config.num_labels, + params_dtype=head_dtype, + bias=True, + prefix=f"{prefix}.out_proj", ) def forward(self, x, **kwargs): @@ -94,7 +102,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): pooler_config = vllm_config.model_config.pooler_config assert pooler_config is not None - self.score = JinaVLScorer(vllm_config.model_config) + self.score = JinaVLScorer( + vllm_config.model_config, prefix=maybe_prefix(prefix, "score") + ) self.pooler = DispatchPooler( { "token_classify": Pooler.for_token_classify( diff --git a/vllm/model_executor/models/kimi_vl.py b/vllm/model_executor/models/kimi_vl.py index 85267ccda8a9..dcae9ccdebbe 100644 --- a/vllm/model_executor/models/kimi_vl.py +++ b/vllm/model_executor/models/kimi_vl.py @@ -325,7 +325,7 @@ def __init__( self.hidden_size = config.text_config.hidden_size self.vision_tower = MoonVitPretrainedModel( config.vision_config, - self.use_data_parallel, + multimodal_config=model_config.multimodal_config, prefix=maybe_prefix(prefix, "vision_tower"), ) diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index f0f2983f8463..4332acc82120 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -201,8 +201,8 @@ def __init__( # This is a target model, use layer_idx directly effective_layer_idx = layer_idx assert effective_layer_idx < len(layer_types), ( - f"effective_layer_idx: {effective_layer_idx} \ - is out of bounds for layer_types: {layer_types}" + f"effective_layer_idx: {effective_layer_idx} " + f"is out of bounds for layer_types: {layer_types}" ) is_sliding = layer_types[effective_layer_idx] == "sliding_attention" diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py index 66a327bb7603..386c5216e6af 100644 --- a/vllm/model_executor/models/llava.py +++ b/vllm/model_executor/models/llava.py @@ -51,7 +51,13 @@ from vllm.utils.tensor_schema import 
TensorSchema, TensorShape from .clip import CLIPVisionModel -from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP +from .interfaces import ( + MultiModalEmbeddings, + SupportsLoRA, + SupportsMultiModal, + SupportsPP, +) +from .module_mapping import MultiModelKeys from .pixtral import PixtralHFEncoderInfo, PixtralHFVisionModel from .siglip import SiglipVisionModel from .utils import ( @@ -505,7 +511,9 @@ def init_vision_tower_for_llava( info=_build_llava_or_pixtral_hf_info, dummy_inputs=LlavaDummyInputsBuilder, ) -class LlavaForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP): +class LlavaForConditionalGeneration( + nn.Module, SupportsLoRA, SupportsMultiModal, SupportsPP +): packed_modules_mapping = { "qkv_proj": ["q_proj", "k_proj", "v_proj"], "gate_up_proj": ["gate_proj", "up_proj"], @@ -734,6 +742,32 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: loader = AutoWeightsLoader(self, skip_prefixes=skip_prefixes) return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + def get_mm_mapping(self) -> MultiModelKeys: + """ + Get the module prefix in multimodal models + """ + return MultiModelKeys.from_string_field( + language_model="language_model", + connector="multi_modal_projector", + tower_model="vision_tower", + ) + + def get_num_mm_encoder_tokens( + self, + num_image_tokens: int, + ) -> int: + # LLaVA's vision encoder outputs one token per patch without + # spatial merging or pixel shuffle + return num_image_tokens + + def get_num_mm_connector_tokens( + self, + num_vision_tokens: int, + ) -> int: + # LLaVA's MLP projector outputs the same number of tokens + # as it receives from the vision encoder (1:1 mapping) + return num_vision_tokens + class MantisProcessingInfo(LlavaProcessingInfo): def get_hf_processor(self, **kwargs: object): diff --git a/vllm/model_executor/models/mimo_v2_flash.py b/vllm/model_executor/models/mimo_v2_flash.py index 12b486f001e0..98d40a38486b 100644 --- a/vllm/model_executor/models/mimo_v2_flash.py +++ b/vllm/model_executor/models/mimo_v2_flash.py @@ -211,6 +211,7 @@ def __init__( num_kv_heads: int, head_dim: int, v_head_dim: int | None = None, + v_scale: float | None = None, sliding_window_size: int = -1, attention_bias: bool = False, add_swa_attention_sink_bias: bool = False, @@ -241,6 +242,7 @@ def __init__( self.k_size = self.num_kv_heads * self.head_dim self.v_size = self.num_kv_heads * self.v_head_dim + self.v_scale = v_scale self.scaling = self.head_dim**-0.5 self.rope_theta = rope_theta self.max_position_embeddings = max_position_embeddings @@ -304,6 +306,10 @@ def forward( q, k, v = qkv.split([self.q_size, self.k_size, self.v_size], dim=-1) q, k = self.rotary_emb(positions, q, k) + # Apply v_scale before attention + if self.v_scale is not None: + v = v * self.v_scale + v = v.view(-1, self.num_kv_heads, self.v_head_dim) v = torch.nn.functional.pad(v, [0, self.head_dim - self.v_head_dim], value=0) v = v.view(-1, self.num_kv_heads * self.head_dim) @@ -332,6 +338,8 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: rope_theta = getattr(config, "rope_theta", 1000000) max_position_embeddings = getattr(config, "max_position_embeddings", 32768) + v_scale = getattr(config, "attention_value_scale", None) + if self.is_compressed_softmax_layer(): self.self_attn = MiMoV2Attention( hidden_size=self.hidden_size, @@ -339,6 +347,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: num_kv_heads=config.swa_num_key_value_heads, 
head_dim=config.swa_head_dim, v_head_dim=getattr(config, "swa_v_head_dim", None), + v_scale=v_scale, sliding_window_size=config.sliding_window_size, attention_bias=config.attention_bias, add_swa_attention_sink_bias=getattr( @@ -358,6 +367,7 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = "") -> None: num_kv_heads=config.num_key_value_heads, head_dim=config.head_dim, v_head_dim=getattr(config, "v_head_dim", None), + v_scale=v_scale, sliding_window_size=-1, # normal attention attention_bias=config.attention_bias, layer_id=layer_id, @@ -433,7 +443,6 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.quant_config = quant_config self.vocab_size = config.vocab_size self.num_redundant_experts = eplb_config.num_redundant_experts - self.v_scale = getattr(config, "attention_value_scale", None) if get_pp_group().is_first_rank or ( config.tie_word_embeddings and get_pp_group().is_last_rank @@ -605,18 +614,6 @@ def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: param = params_dict[name_rewritten] weight_loader = getattr(param, "weight_loader", default_weight_loader) - - if param_name == "qkv_proj" and shard_id == "v": - v_scale = ( - self.v_scale - if self.v_scale is not None - else getattr(self.config, "attention_value_scale", None) - ) - if v_scale is not None and ( - name.endswith("weight_scale_inv") or name.endswith(".bias") - ): - loaded_weight *= float(v_scale) - weight_loader(param, loaded_weight, shard_id) loaded_params.add(name_rewritten) diff --git a/vllm/model_executor/models/minicpm.py b/vllm/model_executor/models/minicpm.py index f104018d3aa6..a05be794a29c 100644 --- a/vllm/model_executor/models/minicpm.py +++ b/vllm/model_executor/models/minicpm.py @@ -90,6 +90,7 @@ def __init__( intermediate_size: int, params_dtype: torch.dtype | None = None, tp_size: int | None = None, + prefix: str = "", ): super().__init__() self.tp_size = tp_size or get_tensor_model_parallel_world_size() @@ -108,6 +109,7 @@ def __init__( bias=False, params_dtype=self.params_dtype, quant_config=None, + prefix=f"{prefix}.gate", ) self.ws = nn.Parameter( @@ -352,6 +354,7 @@ def _init_ffn_block(self): hidden_act=self.config.hidden_act, hidden_act_param=getattr(self.config, "hidden_act_param", 0.0), quant_config=self.quant_config, + prefix=f"{self.prefix}.mlp", ) else: self.mlp = MiniCPMMoE( @@ -359,6 +362,7 @@ def _init_ffn_block(self): top_k=self.config.num_experts_per_tok, hidden_size=self.config.hidden_size, intermediate_size=self.config.intermediate_size, + prefix=f"{self.prefix}.mlp", ) def forward( diff --git a/vllm/model_executor/models/minicpm_eagle.py b/vllm/model_executor/models/minicpm_eagle.py index 9f3587a6d2fa..e9f1a91bfc4a 100644 --- a/vllm/model_executor/models/minicpm_eagle.py +++ b/vllm/model_executor/models/minicpm_eagle.py @@ -108,6 +108,7 @@ def _init_ffn_block(self): top_k=self.config.num_experts_per_tok, hidden_size=self.config.hidden_size, intermediate_size=self.config.intermediate_size, + prefix=f"{self.prefix}.mlp", ) def forward( diff --git a/vllm/model_executor/models/mistral_large_3_eagle.py b/vllm/model_executor/models/mistral_large_3_eagle.py index 37cd4324e53d..830f210e7438 100644 --- a/vllm/model_executor/models/mistral_large_3_eagle.py +++ b/vllm/model_executor/models/mistral_large_3_eagle.py @@ -67,6 +67,7 @@ def __init__( input_is_parallel=False, quant_config=quant_config, return_bias=False, + prefix=maybe_prefix(prefix, "fc"), ) self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) 
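# --- illustrative sketch, not part of the patch ----------------------------
# The mimo_v2_flash hunks above move value scaling out of the weight loader
# (which previously multiplied weight_scale_inv / bias for the "v" shard by
# config.attention_value_scale) and into the forward pass, multiplying V once
# before attention. The helper below is a minimal sketch of that pattern; the
# tensor shape and the attribute name follow the diff, everything else is an
# illustrative assumption rather than the vLLM implementation.
import torch


def apply_v_scale(v: torch.Tensor, v_scale: float | None) -> torch.Tensor:
    # Scaling the activation keeps checkpoint weights and their quantized
    # weight scales untouched, which is why the special case for the "v"
    # shard in load_weights could be removed.
    return v if v_scale is None else v * v_scale


v = torch.randn(4, 8, 128)  # (num_tokens, num_kv_heads, v_head_dim)
assert apply_v_scale(v, None) is v  # no-op when the config defines no scale
assert torch.allclose(apply_v_scale(v, 0.5), v * 0.5)
# ---------------------------------------------------------------------------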
self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( diff --git a/vllm/model_executor/models/modernbert.py b/vllm/model_executor/models/modernbert.py index 4655ffa7b2f6..fb8f6a28ebab 100644 --- a/vllm/model_executor/models/modernbert.py +++ b/vllm/model_executor/models/modernbert.py @@ -63,7 +63,9 @@ def forward( class ModernBertAttention(nn.Module): - def __init__(self, config: ModernBertConfig, layer_id: int | None = None): + def __init__( + self, config: ModernBertConfig, layer_id: int | None = None, prefix: str = "" + ): super().__init__() self.config = config self.hidden_size = config.hidden_size @@ -80,6 +82,7 @@ def __init__(self, config: ModernBertConfig, layer_id: int | None = None): self.head_dim, self.num_heads, bias=config.attention_bias, + prefix=f"{prefix}.Wqkv", ) if layer_types := getattr(config, "layer_types", None): @@ -117,7 +120,10 @@ def __init__(self, config: ModernBertConfig, layer_id: int | None = None): per_layer_sliding_window=sliding_window, ) self.Wo = RowParallelLinear( - config.hidden_size, config.hidden_size, bias=config.attention_bias + config.hidden_size, + config.hidden_size, + bias=config.attention_bias, + prefix=f"{prefix}.Wo", ) def forward( @@ -135,7 +141,7 @@ def forward( class ModernBertMLP(nn.Module): - def __init__(self, config: ModernBertConfig): + def __init__(self, config: ModernBertConfig, prefix: str = ""): super().__init__() self.config = config self.Wi = nn.Linear( @@ -143,7 +149,10 @@ def __init__(self, config: ModernBertConfig): ) self.act = nn.GELU() self.Wo = RowParallelLinear( - config.intermediate_size, config.hidden_size, bias=config.mlp_bias + config.intermediate_size, + config.hidden_size, + bias=config.mlp_bias, + prefix=f"{prefix}.Wo", ) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: @@ -163,11 +172,13 @@ def __init__( self.attn_norm = nn.LayerNorm( config.hidden_size, eps=config.norm_eps, bias=config.norm_bias ) - self.attn = ModernBertAttention(config=config, layer_id=layer_id) + self.attn = ModernBertAttention( + config=config, layer_id=layer_id, prefix=f"{prefix}.attn" + ) self.mlp_norm = nn.LayerNorm( config.hidden_size, eps=config.norm_eps, bias=config.norm_bias ) - self.mlp = ModernBertMLP(config) + self.mlp = ModernBertMLP(config, prefix=f"{prefix}.mlp") def forward( self, @@ -189,7 +200,11 @@ def __init__(self, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config self.layers = nn.ModuleList( [ - ModernBertLayer(config=config, layer_id=layer_id) + ModernBertLayer( + config=config, + layer_id=layer_id, + prefix=f"{prefix}.layers.{layer_id}", + ) for layer_id in range(config.num_hidden_layers) ] ) @@ -220,7 +235,9 @@ def __init__( config = vllm_config.model_config.hf_config self.config = config self.embeddings = ModernBertEmbeddings(config) - self.encoder_layer = ModernBertEncoderLayer(vllm_config) + self.encoder_layer = ModernBertEncoderLayer( + vllm_config, prefix=f"{prefix}.encoder_layer" + ) self.final_norm = nn.LayerNorm( config.hidden_size, eps=config.norm_eps, bias=config.norm_bias ) diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index 9c741e1f5071..5ccc5653ec8b 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -142,6 +142,7 @@ def __init__( self, config: VisionBackboneConfig, quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.w1 = ColumnParallelLinear( @@ -149,6 +150,7 @@ def __init__( config.image_mlp_dim, 
bias=True, quant_config=quant_config, + prefix=f"{prefix}.w1", ) # Activation function. assert config.image_mlp_activations == "quick_gelu" @@ -158,6 +160,7 @@ def __init__( config.image_emb_dim, bias=True, quant_config=quant_config, + prefix=f"{prefix}.w2", ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -176,6 +179,7 @@ def __init__( use_bias: bool = True, nlayers: int = 1, quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() @@ -202,24 +206,28 @@ def __init__( self.total_num_heads * self.head_dim, bias=use_bias, quant_config=quant_config, + prefix=f"{prefix}.wq", ) self.wk = ColumnParallelLinear( nlayers * self.hidden_size, self.total_num_kv_heads * self.head_dim, bias=use_bias, quant_config=quant_config, + prefix=f"{prefix}.wk", ) self.wv = ColumnParallelLinear( nlayers * self.hidden_size, self.total_num_kv_heads * self.head_dim, bias=use_bias, quant_config=quant_config, + prefix=f"{prefix}.wv", ) self.wo = RowParallelLinear( self.total_num_heads * self.head_dim, self.hidden_size, bias=use_bias, quant_config=quant_config, + prefix=f"{prefix}.wo", ) self.scale = self.head_dim**-0.5 @@ -254,10 +262,15 @@ def __init__( self, config: VisionBackboneConfig, quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() - self.attention = MultiHeadDotProductAttention(config, quant_config=quant_config) - self.feed_forward = ViTMLP(config, quant_config) + self.attention = MultiHeadDotProductAttention( + config, quant_config=quant_config, prefix=f"{prefix}.attention" + ) + self.feed_forward = ViTMLP( + config, quant_config, prefix=f"{prefix}.feed_forward" + ) self.attention_norm = nn.LayerNorm( config.image_emb_dim, eps=config.image_norm_eps, @@ -280,12 +293,15 @@ def __init__( self, config: VisionBackboneConfig, quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.resblocks = nn.ModuleList( [ - ResidualAttentionBlock(config, quant_config) - for _ in range(config.image_num_layers) + ResidualAttentionBlock( + config, quant_config, prefix=f"{prefix}.resblocks.{i}" + ) + for i in range(config.image_num_layers) ] ) @@ -308,6 +324,7 @@ def __init__( self, config: VisionBackboneConfig, quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() scale = config.image_emb_dim**-0.5 @@ -324,7 +341,9 @@ def __init__( bias=False, ) self.pre_ln = nn.LayerNorm(config.image_emb_dim, eps=config.image_norm_eps) - self.transformer = BlockCollection(config, quant_config) + self.transformer = BlockCollection( + config, quant_config, prefix=f"{prefix}.transformer" + ) def add_pos_emb(self, x: torch.Tensor, patch_num: int) -> torch.Tensor: cls_emb = self.positional_embedding[0:1] @@ -419,6 +438,7 @@ def __init__( self.total_num_kv_heads, bias=config.qkv_bias, quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", ) self.tp_rank: int | None = None @@ -454,6 +474,7 @@ def __init__( self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.o_proj", ) def _apply_qk_norm( @@ -493,6 +514,7 @@ def __init__( config: PretrainedConfig, input_dim: int | None = None, quant_config: QuantizationConfig | None = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -503,6 +525,7 @@ def __init__( [self.intermediate_size] * 2, bias=False, quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) # Activation function. 
self.act_fn = MulAndSilu() @@ -512,6 +535,7 @@ def __init__( self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) def forward( @@ -532,6 +556,7 @@ def __init__( config: PretrainedConfig, input_dim: int | None = None, quant_config: QuantizationConfig | None = None, + prefix: str = "", ) -> None: super().__init__() self.hidden_size = config.hidden_size @@ -542,6 +567,7 @@ def __init__( [self.intermediate_size] * 2, bias=False, quant_config=quant_config, + prefix=f"{prefix}.merged_linear", ) # Activation function. self.act_fn = SiluAndMul() @@ -552,6 +578,7 @@ def __init__( self.hidden_size, bias=False, quant_config=quant_config, + prefix=f"{prefix}.down_proj", ) def forward( @@ -579,7 +606,9 @@ def __init__( ) # MLP block. - self.mlp = LanguageModelMLP(config, quant_config=quant_config) + self.mlp = LanguageModelMLP( + config, quant_config=quant_config, prefix=f"{prefix}.mlp" + ) # LayerNorm assert config.layer_norm_type == "rms" @@ -643,6 +672,7 @@ def __init__( config: PretrainedConfig, vision_config: VisionBackboneConfig, quant_config: QuantizationConfig | None = None, + prefix: str = "", ) -> None: super().__init__() self.vit_layers = VIT_LAYERS @@ -651,18 +681,24 @@ def __init__( (self.image_num_patch[0] + 1) // POOLING_SIZE, (self.image_num_patch[1] + 1) // POOLING_SIZE, ) - self.image_vit = VisionTransformer(vision_config, quant_config=quant_config) + self.image_vit = VisionTransformer( + vision_config, quant_config=quant_config, prefix=f"{prefix}.image_vit" + ) self.num_prefix_tokens = self.image_vit.num_prefix_tokens assert self.num_prefix_tokens in {0, 1}, ( "Only 0 or 1 prefix tokens are supported" ) self.image_pooling_2d = MultiHeadDotProductAttention( - vision_config, nlayers=len(self.vit_layers), quant_config=quant_config + vision_config, + nlayers=len(self.vit_layers), + quant_config=quant_config, + prefix=f"{prefix}.image_pooling_2d", ) self.image_projector = ImageProjectorMLP( config, input_dim=vision_config.image_emb_dim, quant_config=quant_config, + prefix=f"{prefix}.image_projector", ) image_dim = vision_config.image_emb_dim * len(self.vit_layers) @@ -1405,7 +1441,12 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): self.multimodal_config = multimodal_config vision_config = VisionBackboneConfig() - self.vision_backbone = MolmoVisionBackbone(config, vision_config, quant_config) + self.vision_backbone = MolmoVisionBackbone( + config, + vision_config, + quant_config, + prefix=maybe_prefix(prefix, "vision_backbone"), + ) self.model = MolmoModel( vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") ) diff --git a/vllm/model_executor/models/moonvit.py b/vllm/model_executor/models/moonvit.py index 63ea6b259a71..99200068c066 100644 --- a/vllm/model_executor/models/moonvit.py +++ b/vllm/model_executor/models/moonvit.py @@ -51,118 +51,20 @@ import torch.nn.functional as F from transformers.activations import ACT2FN from transformers.modeling_utils import PreTrainedModel -from transformers.utils import is_flash_attn_2_available +from vllm.attention.layers.mm_encoder_attention import MMEncoderAttention +from vllm.config import MultiModalConfig +from vllm.distributed import divide, get_tensor_model_parallel_world_size from vllm.model_executor.layers.conv import Conv2dLayer -from vllm.model_executor.layers.linear import ReplicatedLinear +from vllm.model_executor.layers.linear import ( + ColumnParallelLinear, + QKVParallelLinear, + RowParallelLinear, +) from vllm.model_executor.models.utils import maybe_prefix from 
vllm.platforms import current_platform from vllm.transformers_utils.configs.moonvit import MoonViTConfig -if is_flash_attn_2_available(): - from flash_attn import flash_attn_varlen_func -elif current_platform.is_xpu(): - from vllm.attention.utils.fa_utils import flash_attn_varlen_func -else: - flash_attn_varlen_func = None - - -def multihead_attention( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - q_cu_seqlens: torch.Tensor | None = None, - k_cu_seqlens: torch.Tensor | None = None, -) -> torch.Tensor: - """Multi-head attention using flash attention 2. - - Args: - q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim), - or (tot_seqlens, num_heads, head_dim) if packing. - k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim), - or (tot_seqlens, num_heads, head_dim) if packing. - v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim), - or (tot_seqlens, num_heads, head_dim) if packing. - q_cu_seqlens (torch.Tensor): cumulative sequence lengths of q. - The first element should be 0 and the last element should be q.shape[0]. - k_cu_seqlens (torch.Tensor): cumulative sequence lengths of k. - The first element should be 0 and the last element should be k.shape[0]. - - Returns: - output: shape (batch_size, seqlen, dim) or (tot_seqlens, dim) if packing, - where dim = num_heads * head_dim - """ - # Unified format legal check - assert q.dim() == k.dim() == v.dim() == 3, "q, k, v must have 3 dims" - assert q_cu_seqlens[-1] == q.shape[0], "q_cu_seqlens must sum to q.shape[0]" - assert k_cu_seqlens[-1] == k.shape[0] == v.shape[0], ( - "k_cu_seqlens must sum to k.shape[0]" - ) - assert q.dtype in [ - torch.bfloat16, - torch.float16, - ], f"unsupported dtype {q.dtype} for multihead attn" - - max_seqlen_q = (q_cu_seqlens[1:] - q_cu_seqlens[:-1]).max().item() - max_seqlen_k = (k_cu_seqlens[1:] - k_cu_seqlens[:-1]).max().item() - attn_out = flash_attn_varlen_func( - q, - k, - v, - cu_seqlens_q=q_cu_seqlens, - cu_seqlens_k=k_cu_seqlens, - max_seqlen_q=max_seqlen_q, - max_seqlen_k=max_seqlen_k, - causal=False, - ) - attn_out = attn_out.flatten(start_dim=-2) - - return attn_out - - -def sdpa_attention( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - q_cu_seqlens: torch.Tensor | None = None, - k_cu_seqlens: torch.Tensor | None = None, -) -> torch.Tensor: - """SDPA attention. - - Args: - q: Query tensor of shape (batch_size, seqlen, num_heads, head_dim), - or (tot_seqlens, num_heads, head_dim) if packing. - k: Key tensor of shape (batch_size, seqlen, num_heads, head_dim), - or (tot_seqlens, num_heads, head_dim) if packing. - v: Value tensor of shape (batch_size, seqlen, num_heads, head_dim), - or (tot_seqlens, num_heads, head_dim) if packing. - q_cu_seqlens: Optional cumulative sequence lengths of q. - k_cu_seqlens: Optional cumulative sequence lengths of k. 
- """ - seq_length = q.shape[0] - attention_mask = torch.zeros( - [1, seq_length, seq_length], device=q.device, dtype=torch.bool - ) - for i in range(1, len(q_cu_seqlens)): - attention_mask[ - ..., - q_cu_seqlens[i - 1] : q_cu_seqlens[i], - q_cu_seqlens[i - 1] : q_cu_seqlens[i], - ] = True - q = q.transpose(0, 1) - k = k.transpose(0, 1) - v = v.transpose(0, 1) - attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0) - attn_output = attn_output.transpose(0, 1) - attn_output = attn_output.reshape(seq_length, -1) - return attn_output - - -VL_VISION_ATTENTION_FUNCTIONS = { - "flash_attention_2": multihead_attention, - "sdpa": sdpa_attention, -} - def _apply_rope_input_validation(x, freqs_cis): assert x.ndim == freqs_cis.ndim + 1, (x.shape, freqs_cis.shape) @@ -411,11 +313,19 @@ def __init__( super().__init__() assert len(dims) == 3 self.use_data_parallel = use_data_parallel - self.fc0 = ReplicatedLinear( - dims[0], dims[1], bias=bias, prefix=maybe_prefix(prefix, "fc0") + self.fc0 = ColumnParallelLinear( + dims[0], + dims[1], + bias=bias, + prefix=maybe_prefix(prefix, "fc0"), + disable_tp=self.use_data_parallel, ) - self.fc1 = ReplicatedLinear( - dims[1], dims[2], bias=bias, prefix=maybe_prefix(prefix, "fc1") + self.fc1 = RowParallelLinear( + dims[1], + dims[2], + bias=bias, + prefix=maybe_prefix(prefix, "fc1"), + disable_tp=self.use_data_parallel, ) self.activation = activation @@ -433,35 +343,55 @@ def __init__( hidden_dim: int, mlp_dim: int, prefix: str = "", - use_data_parallel: bool = False, + multimodal_config: MultiModalConfig | None = None, *, - attn_implementation: str = "sdpa", activation=F.gelu, attn_bias: bool = False, ): super().__init__() + self.use_data_parallel = ( + multimodal_config.mm_encoder_tp_mode == "data" + if multimodal_config + else False + ) + self.num_heads = num_heads self.hidden_dim = hidden_dim self.hidden_size_per_attention_head = self.hidden_dim // self.num_heads - self.attn_implementation = attn_implementation - # use fa2 in vllm by default - if is_flash_attn_2_available() or current_platform.is_xpu(): - self.attn_implementation = "flash_attention_2" + self.tp_size = ( + 1 if self.use_data_parallel else get_tensor_model_parallel_world_size() + ) + self.num_attention_heads_per_partition = divide(num_heads, self.tp_size) self.norm0 = nn.LayerNorm(hidden_dim) self.norm1 = nn.LayerNorm(hidden_dim) - self.use_data_parallel = use_data_parallel self.mlp = MLP2( [hidden_dim, mlp_dim, hidden_dim], activation, prefix=f"{prefix}.mlp", - use_data_parallel=use_data_parallel, + use_data_parallel=self.use_data_parallel, ) - self.wqkv = ReplicatedLinear( - hidden_dim, hidden_dim * 3, bias=attn_bias, prefix=f"{prefix}.wqkv" + self.wqkv = QKVParallelLinear( + hidden_size=hidden_dim, + head_size=self.hidden_size_per_attention_head, + total_num_heads=num_heads, + total_num_kv_heads=num_heads, + bias=attn_bias, + prefix=f"{prefix}.wqkv", + disable_tp=self.use_data_parallel, ) - self.wo = ReplicatedLinear( - hidden_dim, hidden_dim, bias=attn_bias, prefix=f"{prefix}.wo" + self.wo = RowParallelLinear( + hidden_dim, + hidden_dim, + bias=attn_bias, + prefix=f"{prefix}.wo", + disable_tp=self.use_data_parallel, + ) + self.attn = MMEncoderAttention( + num_heads=self.num_attention_heads_per_partition, + head_size=self.hidden_size_per_attention_head, + multimodal_config=multimodal_config, + prefix=f"{prefix}.attn", ) def attention_qkvpacked( @@ -472,14 +402,15 @@ def attention_qkvpacked( ): """ Args: - x (torch.Tensor): (batch_size, seqlen, hidden_dim) + x 
(torch.Tensor): (seqlen, hidden_dim) cu_seqlens (torch.Tensor): """ + seq_length = x.size(0) xqkv, _ = self.wqkv(x) qkv_shape = xqkv.size()[:-1] + ( 3, - self.num_heads, + self.num_attention_heads_per_partition, self.hidden_size_per_attention_head, ) # xqkv: (batch_size, seqlen, 3, nheads, headdim) @@ -488,9 +419,18 @@ def attention_qkvpacked( xq, xk = apply_rope(xq, xk, rope_freqs_cis) - attn_func = VL_VISION_ATTENTION_FUNCTIONS[self.attn_implementation] - attn_out = attn_func( - xq, xk, xv, q_cu_seqlens=cu_seqlens, k_cu_seqlens=cu_seqlens + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + attn_out = self.attn( + xq.unsqueeze(0), + xk.unsqueeze(0), + xv.unsqueeze(0), + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + ) + attn_out = attn_out.reshape( + seq_length, + self.num_attention_heads_per_partition + * self.hidden_size_per_attention_head, ) attn_out, _ = self.wo(attn_out) return attn_out @@ -528,7 +468,7 @@ def __init__( num_layers: int, block_cfg: dict, prefix: str = "", - use_data_parallel: bool = False, + multimodal_config: MultiModalConfig | None = None, ) -> None: super().__init__() @@ -538,7 +478,7 @@ def __init__( self.blocks = nn.ModuleList( [ MoonVitEncoderLayer( - use_data_parallel=use_data_parallel, + multimodal_config=multimodal_config, prefix=f"{prefix}.blocks.{layer_idx}", **block_cfg, ) @@ -599,31 +539,6 @@ def patch_merger( return outputs -class MoonVitVLProjector(nn.Module): - def __init__( - self, - in_channels: int, - merge_kernel_size: list[int, int], - hidden_act: str = "gelu", - ln_eps: float = 1e-5, - out_dim: int = 4096, - ): - super().__init__() - self.hidden_size = in_channels * merge_kernel_size[0] * merge_kernel_size[1] - - self.pre_norm = nn.nn.LayerNorm(in_channels, eps=ln_eps) - self.linear_1 = nn.Linear(self.hidden_size, self.hidden_size, bias=True) - self.act = ACT2FN[hidden_act] - self.linear_2 = nn.Linear(self.hidden_size, out_dim, bias=True) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.pre_norm(hidden_states).view(-1, self.hidden_size) - hidden_states = self.linear_1(hidden_states) - hidden_states = self.act(hidden_states) - hidden_states = self.linear_2(hidden_states) - return hidden_states - - class MoonVitPretrainedModel(PreTrainedModel): config_class = MoonViTConfig model_type = "moonvit" @@ -634,14 +549,13 @@ class MoonVitPretrainedModel(PreTrainedModel): def __init__( self, config: MoonViTConfig, - use_data_parallel: bool = False, + multimodal_config: MultiModalConfig | None = None, prefix: str = "", *inputs, **kwargs, ): super().__init__(config, *inputs, **kwargs) config = deepcopy(config) - self.use_data_parallel = use_data_parallel self.merge_kernel_size = config.merge_kernel_size self.hidden_size = config.hidden_size self.patch_size = config.patch_size @@ -662,9 +576,9 @@ def __init__( "mlp_dim": config.intermediate_size, "activation": ACT2FN["gelu_pytorch_tanh"], "attn_bias": True, - "attn_implementation": config._attn_implementation, }, prefix=f"{prefix}.encoder", + multimodal_config=multimodal_config, ) def forward( diff --git a/vllm/model_executor/models/nano_nemotron_vl.py b/vllm/model_executor/models/nano_nemotron_vl.py index 6dfab595e5b9..a88496eca91d 100644 --- a/vllm/model_executor/models/nano_nemotron_vl.py +++ b/vllm/model_executor/models/nano_nemotron_vl.py @@ -1220,7 +1220,7 @@ def extract_feature(self, pixel_values): n = pixel_values.shape[0] vit_embeds_list = [] for i in range(0, n, micro_batch_size): - vit_embeds = self.vision_model(pixel_values[i : i + 
micro_batch_size]) + _, vit_embeds = self.vision_model(pixel_values[i : i + micro_batch_size]) vit_embeds = vit_embeds.to(dtype=torch.bfloat16) h = w = int(vit_embeds.shape[1] ** 0.5) vit_embeds = vit_embeds.reshape(vit_embeds.shape[0], h, w, -1) @@ -1695,12 +1695,7 @@ def get_vit_model_from_radio_config(self, hf_config): patch_size=patch_size, norm_mean=hf_config.norm_mean, norm_std=hf_config.norm_std, - reg_tokens=( - hf_config_vision.args.get("register_multiple") - if hasattr(hf_config_vision, "args") - and isinstance(hf_config_vision.args, dict) - else None - ), + **hf_config_vision.args, ) return RadioModel(config=radio_config) diff --git a/vllm/model_executor/models/nemotron_h.py b/vllm/model_executor/models/nemotron_h.py index 8bc9ce6154d9..c0fe3942739a 100644 --- a/vllm/model_executor/models/nemotron_h.py +++ b/vllm/model_executor/models/nemotron_h.py @@ -210,16 +210,12 @@ def __init__( ) if self.use_latent_moe: - # TODO: check if using ReplicatedLinear is better than - # ColumnParallelLinear + all_gather - self.fc1_latent_proj = ColumnParallelLinear( + self.fc1_latent_proj = ReplicatedLinear( input_size=config.hidden_size, output_size=self.moe_hidden_size, bias=config.mlp_bias, quant_config=quant_config, disable_tp=self.is_sequence_parallel, - # We need to gather the output to prepare input for moe - gather_output=True, prefix=f"{prefix}.fc1_latent_proj", ) self.fc2_latent_proj = ReplicatedLinear( diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py new file mode 100644 index 000000000000..1e7bb0e4304b --- /dev/null +++ b/vllm/model_executor/models/nemotron_parse.py @@ -0,0 +1,958 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Adapted from https://github.com/amalad/vllm/blob/nemotron_parse/vllm/model_executor/models/nemotron_parse.py +# that's based on https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1/blob/main/hf_nemotron_parse_modeling.py +# +# Bart classes based on old vLLM codebase: +# https://github.com/vllm-project/vllm/blob/v0.10.2/vllm/model_executor/models/bart.py + +import math +from collections.abc import Iterable, Mapping, Sequence +from typing import Annotated, Literal + +import cv2 +import numpy as np +import torch +import torch.nn as nn +from einops import rearrange +from PIL import Image +from timm.data.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD +from torchvision import transforms as T +from transformers import ( + BartConfig, + BatchFeature, + PretrainedConfig, + TensorType, +) + +from vllm.attention.backends.abstract import AttentionType +from vllm.config import CacheConfig, VllmConfig +from vllm.config.lora import LoRAConfig +from vllm.config.multimodal import BaseDummyOptions +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import get_act_fn +from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization.base_config import QuantizationConfig +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import ( + MultiModalEmbeddings, + SupportsMultiModal, +) +from vllm.model_executor.models.radio import RadioModel +from vllm.model_executor.models.whisper import WhisperAttention, 
WhisperCrossAttention +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.inputs import ( + MultiModalDataDict, + MultiModalFieldConfig, + MultiModalKwargsItems, +) +from vllm.multimodal.parse import MultiModalDataItems +from vllm.multimodal.processing import ( + BaseProcessingInfo, + EncDecMultiModalProcessor, + PromptReplacement, + PromptUpdate, +) +from vllm.multimodal.profiling import BaseDummyInputsBuilder +from vllm.transformers_utils.configs.radio import RadioConfig +from vllm.transformers_utils.tokenizer import AnyTokenizer +from vllm.utils.tensor_schema import TensorSchema, TensorShape + +logger = init_logger(__name__) +DEFAULT_FINAL_IMAGE_SIZE = (2048, 1648) + + +class BartScaledWordEmbedding(VocabParallelEmbedding): + """ + This module overrides VocabParallelEmbedding's + forward by multiplying with embeddings scale. + """ + + def __init__( + self, num_embeddings: int, embedding_dim: int, embed_scale: float = 1.0 + ): + super().__init__(num_embeddings, embedding_dim) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor) -> torch.Tensor: + return super().forward(input_ids) * self.embed_scale + + +class BartParallelLMHead(ParallelLMHead): + """ + This module overrides ParallelLMHead's + forward by dividing by embeddings scale, + yielding effectively the inverse of + BartScaledWordEmbedding + """ + + def __init__( + self, num_embeddings: int, embedding_dim: int, embed_scale: float = 1.0 + ): + super().__init__(num_embeddings, embedding_dim) + self.embed_scale = embed_scale + + def forward(self, input_ids: torch.Tensor) -> torch.Tensor: + return super().forward(input_ids) / self.embed_scale + + +class BartDecoderLayer(nn.Module): + def __init__( + self, + config: BartConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + self.embed_dim = config.d_model + + self.self_attn = WhisperAttention( + embed_dim=self.embed_dim, + num_heads=config.decoder_attention_heads, + attn_type=AttentionType.DECODER, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + self.activation_fn = get_act_fn(config.activation_function) + + self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim) + """ + afeldman-nm: personally I would call this "cross-attention", + however I left the name as "encoder_attn" to maintain consistency + with the name of the pretrained weights. + """ + self.encoder_attn = WhisperCrossAttention( + self.embed_dim, + config.decoder_attention_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.encoder_attn", + ) + self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim) + + ffn_hidden_size = self.embed_dim + ffn_intermediate_size = config.encoder_ffn_dim + ffn_has_bias = True + self.fc1 = ColumnParallelLinear( + ffn_hidden_size, + ffn_intermediate_size, + bias=ffn_has_bias, + quant_config=quant_config, + prefix=f"{prefix}.fc1", + ) + self.fc2 = RowParallelLinear( + ffn_intermediate_size, + ffn_hidden_size, + bias=ffn_has_bias, + quant_config=quant_config, + prefix=f"{prefix}.fc2", + ) + + self.final_layer_norm = nn.LayerNorm(self.embed_dim) + + def forward( + self, + decoder_hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor | None = None, + ) -> torch.Tensor: + r""" + Args: + decoder_hidden_states: torch.Tensor of *decoder* input embeddings. + encoder_hidden_states: torch.Tensor of *encoder* input embeddings. 
+ Returns: + Decoder layer output torch.Tensor + """ + residual = decoder_hidden_states + + # Self Attention + hidden_states = self.self_attn(hidden_states=decoder_hidden_states) + + hidden_states = residual + hidden_states + hidden_states = self.self_attn_layer_norm(hidden_states) + + # Cross-Attention Block + + residual = hidden_states + + hidden_states = self.encoder_attn( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = residual + hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + # Fully Connected + residual = hidden_states + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + hidden_states = self.final_layer_norm(hidden_states) + + return hidden_states + + +class MBartDecoderLayer(BartDecoderLayer): + def forward( + self, + decoder_hidden_states: torch.Tensor, + encoder_hidden_states: torch.Tensor | None = None, + ) -> torch.Tensor: + residual = decoder_hidden_states + hidden_states = self.self_attn_layer_norm(decoder_hidden_states) + + # Self Attention + hidden_states = self.self_attn(hidden_states=hidden_states) + + hidden_states = residual + hidden_states + + # Cross-Attention Block + + residual = hidden_states + hidden_states = self.encoder_attn_layer_norm(hidden_states) + + hidden_states = self.encoder_attn( + hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.final_layer_norm(hidden_states) + fc1_out, _ = self.fc1(hidden_states) + hidden_states = self.activation_fn(fc1_out) + + hidden_states, _ = self.fc2(hidden_states) + + hidden_states = residual + hidden_states + + return hidden_states + + +class MBartDecoderNoPos(nn.Module): + """ + Transformer decoder consisting of *config.decoder_layers* layers. + Each layer is a [`BartDecoderLayer`] + Args: + config: BartConfig + embed_tokens (nn.Embedding): output embedding + """ + + def __init__( + self, + config: BartConfig, + cache_config: CacheConfig | None = None, + quant_config: QuantizationConfig | None = None, + lora_config: LoRAConfig | None = None, + embed_tokens: nn.Embedding | None = None, + prefix: str = "", + ): + super().__init__() + self.cache_config = cache_config + self.quant_config = quant_config + self.lora_config = lora_config + embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0 + + self.embed_tokens = BartScaledWordEmbedding( + config.vocab_size, config.d_model, embed_scale=embed_scale + ) + + if embed_tokens is not None: + self.embed_tokens.weight = embed_tokens.weight + + self.layers = nn.ModuleList( + [ + MBartDecoderLayer( + config, + cache_config, + quant_config, + prefix=f"{prefix}.layers.{layer_idx}", + ) + for layer_idx in range(config.decoder_layers) + ] + ) + + self.layernorm_embedding = nn.LayerNorm(config.d_model) + self.layer_norm = nn.LayerNorm(config.d_model) + + def forward( + self, + decoder_input_ids: torch.Tensor, + *, + encoder_hidden_states: torch.Tensor | None, + inputs_embeds: torch.Tensor | None = None, + **kwargs, + ) -> torch.Tensor: + r""" + Args: + decoder_input_ids: Indices of *decoder* input sequence tokens in the + vocabulary. Padding will be ignored by default should you provide it. 
+ encoder_hidden_states: Tensor of encoder output embeddings + Returns: + Decoder output torch.Tensor + """ + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(decoder_input_ids) + + hidden_states = self.layernorm_embedding(inputs_embeds) + + # decoder layers + + for decoder_layer in self.layers: + hidden_states = decoder_layer( + decoder_hidden_states=hidden_states, + encoder_hidden_states=encoder_hidden_states, + ) + + hidden_states = self.layer_norm(hidden_states) + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + (".self_attn.qkv_proj", ".self_attn.q_proj", "q"), + (".self_attn.qkv_proj", ".self_attn.k_proj", "k"), + (".self_attn.qkv_proj", ".self_attn.v_proj", "v"), + (".encoder_attn.kv_proj", ".encoder_attn.k_proj", "k"), + (".encoder_attn.kv_proj", ".encoder_attn.v_proj", "v"), + ] + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + for name, loaded_weight in weights: + if name.startswith("embed_positions"): + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if weight_name not in name: + continue + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = param.weight_loader + weight_loader(param, loaded_weight, shard_id) + break + else: + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + + param = params_dict[name] + weight_loader = getattr(param, "weight_loader", default_weight_loader) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class NemotronParsePixelInputs(TensorSchema): + """ + Dimensions: + - b: Batch size + - c: Number of channels (3) + - h: Height + - w: Width + """ + + type: Literal["pixel_values"] + data: Annotated[torch.Tensor, TensorShape("b", 3, "h", "w")] + + +class NemotronParseImageProcessor: + """ + NemotronParse Image Processor + """ + + def __init__( + self, + final_size: tuple = DEFAULT_FINAL_IMAGE_SIZE, + **kwargs, + ): + # Ensure final_size is properly formatted + if isinstance(final_size, (list, tuple)) and len(final_size) >= 2: + self.final_size = (int(final_size[0]), int(final_size[1])) + elif isinstance(final_size, (int, float)): + self.final_size = (int(final_size), int(final_size)) + else: + self.final_size = DEFAULT_FINAL_IMAGE_SIZE # Default fallback + + self.norm_mean = torch.Tensor(OPENAI_CLIP_MEAN).reshape(1, 3, 1, 1) + self.norm_std = torch.Tensor(OPENAI_CLIP_STD).reshape(1, 3, 1, 1) + + # Create transforms + self._create_transforms() + + def _create_transforms(self): + """Create transform objects.""" + try: + import albumentations as A + except ImportError as err: + raise ImportError( + "The package `albumentations` is required to use " + "NemotronParse model. Please install it with `pip install " + "albumentations`." 
+ ) from err + + # Ensure final_size is a tuple of integers + if isinstance(self.final_size, (list, tuple)): + self.target_height, self.target_width = ( + int(self.final_size[0]), + int(self.final_size[1]), + ) + else: + self.target_height = self.target_width = int(self.final_size) + + self.transform = A.Compose( + [ + A.PadIfNeeded( + min_height=self.target_height, + min_width=self.target_width, + border_mode=cv2.BORDER_CONSTANT, + fill=[255, 255, 255], + p=1.0, + ), + ] + ) + + self.torch_transform = T.Compose( + [ + T.ToTensor(), + ] + ) + + def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray: + """Resize image maintaining aspect ratio (exact replica of original + LongestMaxSizeHW).""" + height, width = image.shape[:2] + max_size_height = self.target_height + max_size_width = self.target_width + + # Original LongestMaxSizeHW algorithm from custom_augmentations.py + aspect_ratio = width / height + new_height = height + new_width = width + + # If height too big then scale image down + if height > max_size_height: + new_height = max_size_height + new_width = int(new_height * aspect_ratio) + + # If width too big, scale image down further + if new_width > max_size_width: + new_width = max_size_width + new_height = int(new_width / aspect_ratio) + + # Use cv2.INTER_LINEAR like the original + return cv2.resize( + image, (new_width, new_height), interpolation=cv2.INTER_LINEAR + ) + + def _pad_to_size(self, image: np.ndarray) -> np.ndarray: + """Pad image to target size with white padding (matches A.PadIfNeeded + behavior).""" + h, w = image.shape[:2] + min_height, min_width = self.target_height, self.target_width + + # Only pad if image is smaller than target (matches A.PadIfNeeded logic) + pad_h = max(0, min_height - h) + pad_w = max(0, min_width - w) + + if pad_h == 0 and pad_w == 0: + return image + + # A.PadIfNeeded pads to bottom-right with constant value + if len(image.shape) == 3: + # Color image - pad bottom and right with white (255, 255, 255) + padded = np.pad( + image, + ((0, pad_h), (0, pad_w), (0, 0)), + mode="constant", + constant_values=255, + ) + else: + # Grayscale image - pad with white (255) + padded = np.pad( + image, ((0, pad_h), (0, pad_w)), mode="constant", constant_values=255 + ) + + return padded + + def preprocess( + self, + images: Image.Image | list[Image.Image], + **kwargs, + ) -> dict[str, torch.Tensor]: + """ + Preprocess an image or batch of images for the NemotronParse model. 
+ + Args: + images: Input image(s) + """ + # Ensure images is a list + if not isinstance(images, list): + images = [images] + + # Convert PIL images to numpy arrays if needed + processed_images = [] + for image in images: + if isinstance(image, Image.Image): + image = np.asarray(image) + processed_images.append(image) + + # Apply NemotronParse-specific transforms + pixel_values = [] + for image in processed_images: + # Manual resize with aspect ratio preservation + # (replaces LongestMaxSizeHW) + processed_image = self._resize_with_aspect_ratio(image) + + # Apply remaining albumentations transforms if available + if self.transform is not None: + transformed = self.transform(image=processed_image) + processed_image = transformed["image"] + else: + # Fallback: just pad to target size + processed_image = self._pad_to_size(processed_image) + + # Convert to tensor + pixel_values_tensor = self.torch_transform(processed_image) + + # Handle grayscale images + if pixel_values_tensor.shape[0] == 1: + pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1) + + pixel_values.append(pixel_values_tensor) + + # Stack into batch + pixel_values = torch.stack(pixel_values) + + # Normalize pixel values + normalized_values = (pixel_values - self.norm_mean) / self.norm_std + return {"pixel_values": normalized_values} + + def __call__( + self, images: Image.Image | list[Image.Image], **kwargs + ) -> dict[str, torch.Tensor]: + return self.preprocess(images, **kwargs) + + +class NemotronParseProcessor: + """ + NemotronParse Processor + """ + + def __init__( + self, + config: PretrainedConfig, + tokenizer: AnyTokenizer, + **kwargs, + ) -> None: + super().__init__() + + self.config = config + self.tokenizer = tokenizer + + self.image_processor = NemotronParseImageProcessor(final_size=config.image_size) + + def _make_batch_input(self, input_item=None): + if input_item is None: + input_item = [] + if not isinstance(input_item, list): + input_item = [input_item] + return input_item + + def __call__( + self, + text: str | None = None, + images: Image.Image | list[Image.Image] | None = None, + return_tensors: str | TensorType | None = None, + **kwargs, + ) -> BatchFeature: + text, images = [self._make_batch_input(x) for x in (text, images)] + image_inputs = {} if len(images) == 0 else self.image_processor(images) + + text_inputs = self.tokenizer(text, add_special_tokens=False, **kwargs) + combined_outputs = BatchFeature( + data={**text_inputs, **image_inputs}, + tensor_type=return_tensors, + ) + return combined_outputs + + +class NemotronParseProcessingInfo(BaseProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config() + + def get_hf_processor(self, **kwargs) -> NemotronParseProcessor: + return self.ctx.init_processor( + NemotronParseProcessor, + config=self.get_hf_config(), + tokenizer=self.get_tokenizer(), + **kwargs, + ) + + def get_supported_mm_limits(self) -> Mapping[str, int | None]: + return {"image": 1} + + def get_num_image_tokens(self) -> int: + config = self.get_hf_config() + final_size = config.image_size + patch_size = config.encoder.patch_size + + return (final_size[0] // patch_size) * ((final_size[1] // patch_size) // 4) + 1 + + def get_mm_max_tokens_per_item( + self, + seq_len: int, + mm_counts: Mapping[str, int], + ) -> Mapping[str, int] | None: + image_tokens = self.get_num_image_tokens() + return {"image": image_tokens} + + +class NemotronParseDummyInputsBuilder( + BaseDummyInputsBuilder[NemotronParseProcessingInfo] +): + def get_dummy_text(self, mm_counts: Mapping[str, int]) -> 
str: + return "" + + def get_dummy_mm_data( + self, + seq_len: int, + mm_counts: Mapping[str, int], + mm_options: Mapping[str, BaseDummyOptions] | None = None, + ) -> MultiModalDataDict: + num_images = mm_counts.get("image", 0) + + target_width, target_height = self.info.get_hf_config().image_size + + return { + "image": self._get_dummy_images( + width=target_width, height=target_height, num_images=num_images + ) + } + + +class NemotronParseMultiModalProcessor( + EncDecMultiModalProcessor[NemotronParseProcessingInfo] +): + def create_encoder_prompt( + self, + prompt: str | list[int], + mm_data: MultiModalDataDict, + ) -> str | list[int]: + return [0] + + @property + def pad_dummy_encoder_prompt(self) -> bool: + return True + + def _call_hf_processor( + self, + prompt: str, + mm_data: Mapping[str, object], + mm_kwargs: Mapping[str, object], + tok_kwargs: Mapping[str, object], + ) -> BatchFeature: + if mm_data: + processed_outputs = super()._call_hf_processor( + prompt, mm_data, mm_kwargs, tok_kwargs + ) + else: + hf_processor = self.info.get_hf_processor() + tokenizer = hf_processor.tokenizer + processed_outputs = tokenizer( + prompt, add_special_tokens=False, return_tensors="pt" + ) + return processed_outputs + + def _get_mm_fields_config( + self, + hf_inputs: BatchFeature, + hf_processor_mm_kwargs: Mapping[str, object], + ) -> Mapping[str, MultiModalFieldConfig]: + return dict(pixel_values=MultiModalFieldConfig.batched("image")) + + def _get_prompt_updates( + self, + mm_items: MultiModalDataItems, + hf_processor_mm_kwargs: Mapping[str, object], + out_mm_kwargs: MultiModalKwargsItems, + ) -> Sequence[PromptUpdate]: + num_image_tokens = self.info.get_num_image_tokens() + + return [ + PromptReplacement( + modality="image", + target=[0], + replacement=[0] * num_image_tokens, + ) + ] + + +class RadioWithNeck(nn.Module): + """Vision encoder using RADIO model with custom neck.""" + + def __init__( + self, + config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + prefix: str = "", + ): + super().__init__() + self.config = config.encoder + + self.model_encoder = self.get_vit_model_from_radio_config( + config, quant_config=quant_config + ) + + # Neck components + last_hidden_state = 1024 + self.conv1 = nn.Conv1d(1280, last_hidden_state, 1) + self.layer_norm1 = nn.LayerNorm( + last_hidden_state, eps=1e-06, elementwise_affine=True + ) + self.conv2 = nn.Conv2d( + last_hidden_state, + last_hidden_state, + kernel_size=(1, 4), + stride=(1, 4), + padding=0, + bias=False, + ) + self.layer_norm2 = nn.LayerNorm( + last_hidden_state, eps=1e-06, elementwise_affine=True + ) + self.sum_proj = ColumnParallelLinear( + 3840, + last_hidden_state, + quant_config=quant_config, + prefix=f"{prefix}.sum_proj", + ) + self.layer_norm3 = nn.LayerNorm( + last_hidden_state, eps=1e-06, elementwise_affine=True + ) + + def get_vit_model_from_radio_config( + self, + hf_config: PretrainedConfig, + quant_config: QuantizationConfig | None = None, + ) -> RadioModel: + hf_config_vision = hf_config.encoder + model_name = hf_config_vision.args.get("model") + if model_name is None: + raise ValueError(f"Unsupported vit model type: {model_name}") + + radio_config = RadioConfig( + model_name=model_name, + image_size=hf_config.image_size, + **hf_config_vision.args, + ) + + return RadioModel(config=radio_config, quant_config=quant_config) + + def forward(self, pixel_values: torch.Tensor, **kwargs) -> torch.Tensor: + summary, feature = self.model_encoder(pixel_values) + + output = self.conv1(feature.permute(0, 2, 
1)).permute(0, 2, 1) + output = self.layer_norm1(output) + + patch_size = self.config.patch_size + output = rearrange( + output, + "b (h w) d -> b d h w", + h=pixel_values.shape[-2] // patch_size, + w=pixel_values.shape[-1] // patch_size, + ) + + output = self.conv2(output) + output = rearrange(output, "b d h w -> b (h w) d") + output = self.layer_norm2(output) + summary = self.layer_norm3(self.sum_proj(summary)[0]) + output = torch.cat((output, summary.unsqueeze(1)), dim=1) + + return output + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + model_encoder_weights = [] + adaptor_dict = { + name: param + for name, param in dict(self.named_parameters()).items() + if not name.startswith("model_encoder") + } + for name, w in weights: + if name.startswith("model_encoder"): + model_encoder_weights.append((".".join(name.split(".")[1:]), w)) + else: + param = adaptor_dict[name] + with torch.no_grad(): + default_weight_loader(param, w) + + self.model_encoder.load_weights(model_encoder_weights) + + +@MULTIMODAL_REGISTRY.register_processor( + NemotronParseMultiModalProcessor, + info=NemotronParseProcessingInfo, + dummy_inputs=NemotronParseDummyInputsBuilder, +) +class NemotronParseForConditionalGeneration(nn.Module, SupportsMultiModal): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + config = vllm_config.model_config.hf_config + + self.config = config + self.vision_config = config.encoder + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.encoder = RadioWithNeck( + config=config, quant_config=quant_config, prefix=f"{prefix}.encoder" + ) + + self.decoder = MBartDecoderNoPos( + config.decoder, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.decoder", + ) + + self.vocab_size = config.decoder.vocab_size + self.lm_head = ParallelLMHead( + config.decoder.vocab_size, config.decoder.d_model, quant_config=quant_config + ) + self.logits_processor = LogitsProcessor( + self.vocab_size, config.decoder.vocab_size + ) + + @classmethod + def get_placeholder_str(cls, modality: str, i: int) -> str | None: + if modality.startswith("image"): + return None + + raise ValueError("Only image modality is supported") + + def _parse_and_validate_image_input( + self, **kwargs: object + ) -> NemotronParsePixelInputs | None: + pixel_values = kwargs.pop("pixel_values", None) + image_embeds = kwargs.pop("image_embeds", None) + + if pixel_values is None and image_embeds is None: + return None + + if pixel_values is not None and image_embeds is not None: + raise ValueError("Both pixel values and image embeds are provided.") + + if pixel_values is not None: + h, w = self.config.image_size + return NemotronParsePixelInputs( + type="pixel_values", + data=pixel_values, + resolve_bindings={ + "h": h, + "w": w, + }, + ) + + if image_embeds is not None: + raise NotImplementedError + + raise AssertionError("This line should be unreachable.") + + def _process_image_input( + self, image_input: NemotronParsePixelInputs + ) -> torch.Tensor: + assert image_input["type"] == "pixel_values" + pixel_values = image_input["data"] + dtype = next(self.encoder.parameters()).dtype + pixel_values = pixel_values.to(dtype) + return self.encoder(pixel_values) + + def get_language_model(self) -> torch.nn.Module: + return self.decoder + + def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings | None: + image_input = self._parse_and_validate_image_input(**kwargs) + if image_input is None: + return None + 
vision_embeddings = self._process_image_input(image_input) + return vision_embeddings + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + encoder_outputs: list[torch.Tensor] | None = None, + **kwargs, + ) -> torch.Tensor: + r""" + Args: + input_ids: torch.Tensor of *decoder* input token ids. + positions: torch.Tensor of *decoder* position indices. + encoder_outputs: List of encoder output tensors (vision embeddings). + During profiling, this may be None or empty. + Returns: + Output torch.Tensor + """ + inputs_embeds = None + if encoder_outputs: + inputs_embeds = torch.cat(encoder_outputs, dim=0) + hidden_states = self.decoder( + decoder_input_ids=input_ids, encoder_hidden_states=inputs_embeds + ) + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + return self.logits_processor(self.lm_head, hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + lm_head_dict = dict(self.lm_head.named_parameters()) + + def is_encoder(name: str) -> bool: + return name.startswith("encoder") + + def is_decoder(name: str) -> bool: + return name.startswith("decoder") + + def is_lm_head(name: str): + return name.startswith("lm_head") + + # Separate weights by component + encoder_weights = [] + decoder_weights = [] + + for name, w in weights: + if is_encoder(name): + encoder_weights.append((".".join(name.split(".")[1:]), w)) + elif is_decoder(name): + decoder_weights.append((".".join(name.split(".")[1:]), w)) + elif is_lm_head(name): + trimmed_name = ".".join(name.split(".")[1:]) + param = lm_head_dict[trimmed_name] + with torch.no_grad(): + default_weight_loader(param, w) + else: + logger.info("Found unexpected weight: %s", name) + + # Load encoder weights + self.encoder.load_weights(encoder_weights) + # Load decoder weights + self.decoder.load_weights(decoder_weights) diff --git a/vllm/model_executor/models/olmoe.py b/vllm/model_executor/models/olmoe.py index a5a926151c5c..3d7aa20003e8 100644 --- a/vllm/model_executor/models/olmoe.py +++ b/vllm/model_executor/models/olmoe.py @@ -86,7 +86,11 @@ def __init__( # Gate always runs at half / full precision for now. 
self.gate = ReplicatedLinear( - hidden_size, num_experts, bias=False, quant_config=None + hidden_size, + num_experts, + bias=False, + quant_config=None, + prefix=f"{prefix}.gate", ) self.experts = FusedMoE( diff --git a/vllm/model_executor/models/phi4mm.py b/vllm/model_executor/models/phi4mm.py index 179d5df869be..c58abefc8544 100644 --- a/vllm/model_executor/models/phi4mm.py +++ b/vllm/model_executor/models/phi4mm.py @@ -428,14 +428,13 @@ def forward( output_imgs.append(torch.cat([sub_img, self.glb_GN, glb_img], dim=1)) else: raise NotImplementedError( - f'hd_transform_order = {self.hd_transform_order}, "\ - "not implemented' + f"hd_transform_order = {self.hd_transform_order}, not implemented" ) # temp_len = int((h*w+1)*144 + 1 + (h+1)*12) assert temp_len == output_imgs[-1].shape[1], ( - f'temp_len: {temp_len}, output_imgs[-1].shape[1]: "\ - "{output_imgs[-1].shape[1]}' + f"temp_len: {temp_len}, output_imgs[-1].shape[1]: " + f"{output_imgs[-1].shape[1]}" ) output_len.append(temp_len) diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 14f73d0c6458..e2877206194f 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -272,6 +272,7 @@ def __init__( bias=False, params_dtype=params_dtype, quant_config=None, + prefix=f"{prefix}.gate", ) self.experts = FusedMoE( diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 61a6e67805d6..50b53a1ff039 100644 --- a/vllm/model_executor/models/qwen.py +++ b/vllm/model_executor/models/qwen.py @@ -56,13 +56,22 @@ def __init__( intermediate_size: int, hidden_act: str = "silu", quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.gate_up_proj = MergedColumnParallelLinear( - hidden_size, [intermediate_size] * 2, bias=False, quant_config=quant_config + hidden_size, + [intermediate_size] * 2, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.gate_up_proj", ) self.c_proj = RowParallelLinear( - intermediate_size, hidden_size, bias=False, quant_config=quant_config + intermediate_size, + hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) if hidden_act != "silu": raise ValueError( @@ -163,7 +172,10 @@ def __init__( self.ln_2 = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) self.mlp = QWenMLP( - config.hidden_size, config.intermediate_size // 2, quant_config=quant_config + config.hidden_size, + config.intermediate_size // 2, + quant_config=quant_config, + prefix=f"{prefix}.mlp", ) def forward( diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index f4c2d3cb75d2..a91aa2cdf78d 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -389,8 +389,6 @@ def __init__( else: self.embed_tokens = PPMissingLayer() - # Use the provided decoder layer type or default to Qwen2DecoderLayer - decoder_layer_type = decoder_layer_type or Qwen2DecoderLayer self.start_layer, self.end_layer, self.layers = make_layers( config.num_hidden_layers, lambda prefix: decoder_layer_type( diff --git a/vllm/model_executor/models/qwen2_5_omni_thinker.py b/vllm/model_executor/models/qwen2_5_omni_thinker.py index 94deeb867c9f..bc9c8b73cff3 100644 --- a/vllm/model_executor/models/qwen2_5_omni_thinker.py +++ b/vllm/model_executor/models/qwen2_5_omni_thinker.py @@ -1128,8 +1128,6 @@ def embed_multimodal(self, **kwargs: object) -> MultiModalEmbeddings: multimodal_embeddings += tuple(audio_embeddings) return 
multimodal_embeddings - # TODO (ywang96): support overlapping modality embeddings so that - # `use_audio_in_video` will work on V1. def embed_input_ids( self, input_ids: torch.Tensor, diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index ccf6cc6e5894..7137d3d8e31e 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -864,6 +864,7 @@ def __init__( intermediate_size=config.intermediate_size, hidden_act=config.hidden_act, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) self.input_layernorm = Qwen3NextRMSNorm( diff --git a/vllm/model_executor/models/qwen3_omni_moe_thinker.py b/vllm/model_executor/models/qwen3_omni_moe_thinker.py index 07c5499d0b19..de8027c434cc 100755 --- a/vllm/model_executor/models/qwen3_omni_moe_thinker.py +++ b/vllm/model_executor/models/qwen3_omni_moe_thinker.py @@ -750,15 +750,42 @@ def pad_to_hop_length(x: np.ndarray, hop_length: int) -> np.ndarray: # https://github.com/huggingface/transformers/pull/41473 mm_kwargs = dict(mm_kwargs) tok_kwargs = dict(tok_kwargs) + mm_kwargs["audio_kwargs"] = dict(mm_kwargs.get("audio_kwargs") or {}) + mm_kwargs["text_kwargs"] = dict(mm_kwargs.get("text_kwargs") or {}) if Version(TRANSFORMERS_VERSION) < Version("4.58.0"): + # Extract audio_sample_rate before restructuring + audio_sample_rate = mm_kwargs.pop("audio_sample_rate", None) + # move truncation to audio_kwargs level to avoid conflict # with tok_kwargs - mm_kwargs["audio_kwargs"] = { - "truncation": mm_kwargs.pop("truncation", False) - } - mm_kwargs["text_kwargs"] = { - "truncation": tok_kwargs.pop("truncation", False) - } + mm_kwargs["audio_kwargs"].setdefault( + "truncation", mm_kwargs.pop("truncation", False) + ) + mm_kwargs["text_kwargs"].setdefault( + "truncation", tok_kwargs.pop("truncation", False) + ) + + # Validate and conditionally pass audio_sample_rate + # WhisperFeatureExtractor has a fixed sampling rate, and vLLM's + # audio loader already resamples audio to the target rate. + # Only pass the value if it matches to avoid unexpected behavior. + if audio_sample_rate is not None: + expected_sr = feature_extractor.sampling_rate + if audio_sample_rate != expected_sr: + logger.warning( + "[%s] audio_sample_rate mismatch: user provided %dHz " + "but model expects %dHz. Ignoring user value. " + "vLLM's audio loader already resampled to %dHz.", + self.__class__.__name__, + audio_sample_rate, + expected_sr, + expected_sr, + ) + else: + # Sample rate matches, safe to pass + mm_kwargs["audio_kwargs"]["audio_sample_rate"] = ( + audio_sample_rate + ) hf_inputs = super()._call_hf_processor( prompt=prompt, @@ -1346,8 +1373,6 @@ def embed_input_ids( return inputs_embeds deepstack_input_embeds = None - # TODO (ywang96): support overlapping modalitiy embeddings so that - # `use_audio_in_video` will work on V1. 
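# --- illustrative sketch, not part of the patch ----------------------------
# The _call_hf_processor hunk above only forwards a user-supplied
# audio_sample_rate when it matches the feature extractor's fixed sampling
# rate, since vLLM's audio loader has already resampled the waveform; a
# mismatching value is logged and dropped. Below is a minimal sketch of that
# guard under those assumptions; the function and logger names here are
# illustrative, not vLLM APIs.
import logging

logger = logging.getLogger("audio_sample_rate_sketch")


def validated_sample_rate(user_rate: int | None, expected_rate: int) -> int | None:
    if user_rate is None or user_rate == expected_rate:
        return user_rate
    logger.warning(
        "audio_sample_rate mismatch: got %dHz but the model expects %dHz; "
        "ignoring the user value.",
        user_rate,
        expected_rate,
    )
    return None


assert validated_sample_rate(16000, 16000) == 16000
assert validated_sample_rate(44100, 16000) is None  # mismatch is dropped
# ---------------------------------------------------------------------------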
# split the feat dim to obtain multi-scale visual feature has_vision_embeddings = [ embeddings.shape[-1] != self.config.text_config.hidden_size diff --git a/vllm/model_executor/models/qwen_vl.py b/vllm/model_executor/models/qwen_vl.py index caac14716782..df0733de9803 100644 --- a/vllm/model_executor/models/qwen_vl.py +++ b/vllm/model_executor/models/qwen_vl.py @@ -109,6 +109,7 @@ def __init__( bias: bool = True, kdim: int | None = None, vdim: int | None = None, + prefix: str = "", ): super().__init__() self.embed_dim = embed_dim @@ -128,8 +129,12 @@ def __init__( assert self._qkv_same_embed_dim, ( "Visual Attention implementation only supports self-attention" ) - self.in_proj = ReplicatedLinear(embed_dim, 3 * embed_dim) - self.out_proj = ReplicatedLinear(embed_dim, embed_dim) + self.in_proj = ReplicatedLinear( + embed_dim, 3 * embed_dim, prefix=f"{prefix}.in_proj" + ) + self.out_proj = ReplicatedLinear( + embed_dim, embed_dim, prefix=f"{prefix}.out_proj" + ) self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) def forward( @@ -214,10 +219,15 @@ def __init__( hidden_size: int, intermediate_size: int, quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.c_fc = ColumnParallelLinear( - hidden_size, intermediate_size, bias=True, quant_config=quant_config + hidden_size, + intermediate_size, + bias=True, + quant_config=quant_config, + prefix=f"{prefix}.c_fc", ) self.act_fn = get_act_fn("gelu") self.c_proj = RowParallelLinear( @@ -225,6 +235,7 @@ def __init__( hidden_size, bias=True, quant_config=quant_config, + prefix=f"{prefix}.c_proj", ) def forward(self, x): @@ -242,17 +253,19 @@ def __init__( mlp_ratio: float = 4.0, norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.ln_1 = norm_layer(d_model) self.ln_2 = norm_layer(d_model) mlp_width = int(d_model * mlp_ratio) - self.attn = VisualAttention(d_model, n_head) + self.attn = VisualAttention(d_model, n_head, prefix=f"{prefix}.attn") self.mlp = QwenVLMLP( hidden_size=d_model, intermediate_size=mlp_width, quant_config=quant_config, + prefix=f"{prefix}.mlp", ) def attention( @@ -282,6 +295,7 @@ def __init__( mlp_ratio: float = 4.0, norm_layer: Callable[[int], nn.Module] = nn.LayerNorm, quant_config: QuantizationConfig | None = None, + prefix: str = "", ): super().__init__() self.width = width @@ -295,8 +309,9 @@ def __init__( mlp_ratio, norm_layer=norm_layer, quant_config=quant_config, + prefix=f"{prefix}.resblocks.{i}", ) - for _ in range(layers) + for i in range(layers) ] ) @@ -327,6 +342,7 @@ def __init__( output_dim: int = 512, image_start_id: int = 151857, quant_config: QuantizationConfig | None = None, + prefix: str = "", **kwargs, ): super().__init__() @@ -356,6 +372,7 @@ def __init__( mlp_ratio, norm_layer=norm_layer, quant_config=quant_config, + prefix=f"{prefix}.transformer", ) self.attn_pool = Resampler2( @@ -366,6 +383,7 @@ def __init__( norm_layer=norm_layer, adaptive=False, do_post_projection=False, + prefix=f"{prefix}.attn_pool", ).to( device=self.positional_embedding.device, dtype=self.positional_embedding.dtype, @@ -413,7 +431,9 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): config = vllm_config.model_config.hf_config quant_config = vllm_config.quant_config - self.visual = VisionTransformer(**config.visual, quant_config=quant_config) + self.visual = VisionTransformer( + **config.visual, quant_config=quant_config, prefix=f"{prefix}.visual" + ) 
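Most of the model hunks above (the phimoe gate, QWenMLP, VisualAttention, QwenVLMLP) do the same thing: thread a `prefix=f"{prefix}.<child>"` argument into each linear layer so the layer knows its fully qualified name, which per-layer quantization overrides can key off. The sketch below illustrates only that naming scheme; `FakeLinear`, `Block`, and `ignored_layers` are stand-ins for illustration, not vLLM classes.

```python
class FakeLinear:
    """Stand-in for a parallel linear layer that records its dotted name."""

    def __init__(self, prefix: str, ignored_layers: set[str]):
        self.prefix = prefix
        # A quantization config could skip layers by fully qualified name.
        self.quantized = prefix not in ignored_layers


class Block:
    def __init__(self, prefix: str, ignored_layers: set[str]):
        # Each child appends its own name to the parent's prefix.
        self.gate_up_proj = FakeLinear(f"{prefix}.gate_up_proj", ignored_layers)
        self.c_proj = FakeLinear(f"{prefix}.c_proj", ignored_layers)


# e.g. keep the first block's down-projection unquantized
ignored = {"model.layers.0.mlp.c_proj"}
blocks = [Block(f"model.layers.{i}.mlp", ignored) for i in range(2)]
assert blocks[0].c_proj.quantized is False
assert blocks[1].c_proj.quantized is True
```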
@lru_cache(maxsize=1) diff --git a/vllm/model_executor/models/radio.py b/vllm/model_executor/models/radio.py index 6a42564ac70a..ea0e7500f8a2 100644 --- a/vllm/model_executor/models/radio.py +++ b/vllm/model_executor/models/radio.py @@ -427,15 +427,17 @@ def __init__( to_2tuple(config.patch_size), config.image_size ) max_img_size = int( - round(config.max_img_size / config.patch_size) * config.patch_size + round(config.cpe_max_size / config.patch_size) * config.patch_size ) + unique_teachers = set(t["name"] for t in config.teachers) self.patch_generator = ViTPatchGenerator( config.patch_size, config.hidden_size, input_dims=self.img_size, max_input_dims=max_img_size, cls_token=True, - register_multiple=config.reg_tokens, + num_cls_tokens=len(unique_teachers) if config.cls_token_per_teacher else 1, + register_multiple=config.register_multiple, ) self.encoder = InternVisionEncoder( @@ -489,11 +491,20 @@ def __init__( prefix=prefix, ) + summary_idxs = None + if config.teachers: + summary_idxs = torch.tensor( + [i for i, t in enumerate(config.teachers) if t.get("use_summary", True)] + ) + if summary_idxs.numel() > 0: + self.register_buffer("summary_idxs", summary_idxs) + self.summary_idxs = summary_idxs + def forward( self, pixel_values: torch.Tensor | None = None, pixel_embeds: torch.Tensor | None = None, - ) -> torch.FloatTensor: + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: y = self.model(pixel_values) return self._extract_final(y) @@ -546,10 +557,17 @@ def load_weights(self, weights) -> set[str]: return loaded_params - def _extract_final(self, y: torch.Tensor): + def _extract_final( + self, y: torch.Tensor + ) -> tuple[torch.FloatTensor, torch.FloatTensor]: # Remove CLS + REGISTERS tokens patch_gen = getattr(self.model, "patch_generator", None) if patch_gen is not None: + all_summary = y[:, : patch_gen.num_cls_tokens] + if self.summary_idxs is not None: + bb_summary = all_summary[:, self.summary_idxs] + else: + bb_summary = all_summary all_feat = y[:, patch_gen.num_skip :] - return all_feat + return bb_summary.flatten(1), all_feat diff --git a/vllm/model_executor/models/registry.py b/vllm/model_executor/models/registry.py index e0e346fcd878..a25267fc2267 100644 --- a/vllm/model_executor/models/registry.py +++ b/vllm/model_executor/models/registry.py @@ -428,6 +428,10 @@ "VoxtralForConditionalGeneration": ("voxtral", "VoxtralForConditionalGeneration"), # noqa: E501 "VoxtralStreamingGeneration": ("voxtral_streaming", "VoxtralStreamingGeneration"), # noqa: E501 # [Encoder-decoder] + "NemotronParseForConditionalGeneration": ( + "nemotron_parse", + "NemotronParseForConditionalGeneration", + ), "WhisperForConditionalGeneration": ("whisper", "WhisperForConditionalGeneration"), # noqa: E501 } diff --git a/vllm/model_executor/models/zamba2.py b/vllm/model_executor/models/zamba2.py index fe157887eea9..b5132cd86024 100644 --- a/vllm/model_executor/models/zamba2.py +++ b/vllm/model_executor/models/zamba2.py @@ -86,7 +86,13 @@ def __init__( B_class = MergedColumnParallelLinear else: B_class = ColumnParallelLinear - self.B = B_class(rank, output_dim, bias=False, quant_config=quant_config) + self.B = B_class( + rank, + output_dim, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.B", + ) def forward( self, @@ -346,6 +352,7 @@ def __init__( config.adapter_rank, 2 * [self.intermediate_size], quant_config, + prefix=f"{prefix}.gate_up_proj_adapter_list.{block_idx}", ) else: gate_up_proj_adapter = nn.Identity() diff --git a/vllm/model_executor/utils.py b/vllm/model_executor/utils.py index 
b89371d98754..d4e87707c847 100644 --- a/vllm/model_executor/utils.py +++ b/vllm/model_executor/utils.py @@ -10,12 +10,6 @@ from vllm.utils.torch_utils import is_torch_equal_or_newer -def set_random_seed(seed: int | None) -> None: - from vllm.platforms import current_platform - - current_platform.seed_everything(seed) - - def set_weight_attrs( weight: torch.Tensor, weight_attrs: dict[str, Any] | None, diff --git a/vllm/multimodal/processing.py b/vllm/multimodal/processing.py index 3bbdab3b393c..dc5c4307ce48 100644 --- a/vllm/multimodal/processing.py +++ b/vllm/multimodal/processing.py @@ -1,9 +1,12 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +import contextvars +import threading import time from abc import ABC, abstractmethod from collections import defaultdict from collections.abc import Callable, Generator, ItemsView, Iterable, Mapping, Sequence +from contextlib import contextmanager from dataclasses import dataclass, field, replace from enum import Enum from functools import lru_cache @@ -53,7 +56,7 @@ from transformers.feature_extraction_utils import BatchFeature from transformers.processing_utils import ProcessorMixin - from vllm.config import ModelConfig + from vllm.config import ModelConfig, ObservabilityConfig from .cache import BaseMultiModalProcessorCache from .profiling import BaseDummyInputsBuilder @@ -63,6 +66,7 @@ ProcessorMixin = object ModelConfig = object + ObservabilityConfig = object BaseMultiModalProcessorCache = object @@ -70,6 +74,127 @@ _S = TypeVar("_S", str, list[int]) +_request_id_context: contextvars.ContextVar[str | None] = contextvars.ContextVar( + "_request_id_context", default=None +) + + +def get_current_request_id() -> str | None: + """Get the current request_id from the context, if available.""" + return _request_id_context.get() + + +@contextmanager +def set_request_id(request_id: str) -> Generator[None, None, None]: + """Context manager to set the request_id for the current context.""" + token = _request_id_context.set(request_id) + try: + yield + finally: + _request_id_context.reset(token) + + +@dataclass +class MultiModalProcessorTimingStats: + """Per-request timing statistics for multimodal processor stages.""" + + hf_processor_time: float = 0.0 + """Time spent in HuggingFace processor calls (seconds).""" + + hashing_time: float = 0.0 + """Time spent computing multimodal item hashes (seconds).""" + + cache_lookup_time: float = 0.0 + """Time spent in cache lookups and merges (seconds).""" + + prompt_update_time: float = 0.0 + """Time spent applying prompt updates and finding placeholders (seconds).""" + + total_time: float = 0.0 + """Total processing time (seconds).""" + + def to_dict(self) -> dict[str, float]: + """Convert stats to a dictionary for JSON serialization.""" + return { + "hf_processor_time": self.hf_processor_time, + "hashing_time": self.hashing_time, + "cache_lookup_time": self.cache_lookup_time, + "prompt_update_time": self.prompt_update_time, + "total_time": self.total_time, + } + + +def get_timing_stats_from_engine_client( + engine_client: Any, +) -> dict[str, dict[str, float]]: + """ + Get all timing stats from the context associated with the engine client. + + Args: + engine_client: The engine client that has input_processor. + + Returns: + A dictionary mapping request_id to stats dict. 
+ """ + try: + if not engine_client.vllm_config.observability_config.enable_mm_processor_stats: + return {} + except (AttributeError, RuntimeError): + return {} + + try: + input_processor = engine_client.input_processor + input_preprocessor = input_processor.input_preprocessor + + if hasattr(input_preprocessor, "_get_mm_processor"): + mm_processor = input_preprocessor._get_mm_processor() + if mm_processor is not None and hasattr(mm_processor, "info"): + ctx = mm_processor.info.ctx + return ctx.get_all_timing_stats() + except (AttributeError, RuntimeError): + pass + + return {} + + +@contextmanager +def _timed_operation(ctx: "InputProcessingContext", stage_name: str): + """ + Context manager to time an operation using the context's timing stats. + + The request_id is automatically retrieved from the context variable, + so it doesn't need to be passed as a parameter. + + Args: + ctx: The InputProcessingContext containing the timing stats registry. + stage_name: Name of the stage being timed. + """ + request_id = get_current_request_id() + if ctx is None or request_id is None: + yield + return + + stats = ctx.get_timing_stats(request_id) + if stats is None: + yield + return + + start_time = time.perf_counter() + try: + yield + finally: + elapsed = time.perf_counter() - start_time + if stage_name == "hf_processor": + stats.hf_processor_time += elapsed + elif stage_name == "hashing": + stats.hashing_time += elapsed + elif stage_name == "cache_lookup": + stats.cache_lookup_time += elapsed + elif stage_name == "prompt_update": + stats.prompt_update_time += elapsed + stats.total_time += elapsed + + PromptSeq: TypeAlias = str | list[int] """A token sequence (list of token IDs) or text.""" @@ -951,6 +1076,21 @@ class InputProcessingContext: tokenizer: TokenizerLike | None """The tokenizer used to tokenize the inputs.""" + observability_config: "ObservabilityConfig | None" = field( + default=None, compare=False, repr=False + ) + """Configuration for observability features.""" + + timing_stats_registry: dict[str, MultiModalProcessorTimingStats] = field( + default_factory=dict, compare=False, repr=False + ) + """Registry for storing timing stats keyed by request_id.""" + + _timing_stats_registry_lock: threading.Lock = field( + default_factory=threading.Lock, compare=False, repr=False + ) + """Lock for thread-safe access to timing_stats_registry.""" + def get_tokenizer(self) -> TokenizerLike: if self.tokenizer is None: raise ValueError( @@ -1159,6 +1299,71 @@ def call_hf_processor( return self._postprocess_output(output) + def get_timing_stats( + self, request_id: str + ) -> MultiModalProcessorTimingStats | None: + """ + Get timing stats for a request. + """ + if ( + self.observability_config is None + or not self.observability_config.enable_mm_processor_stats + ): + return None + with self._timing_stats_registry_lock: + return self.timing_stats_registry.get(request_id) + + def create_timing_stats(self, request_id: str) -> MultiModalProcessorTimingStats: + """ + Create and store timing stats in the registry for a request. + + This should be called at the start of processing for a request. + The stats object is created immediately and stored in the registry. 
+ """ + if ( + self.observability_config is None + or not self.observability_config.enable_mm_processor_stats + ): + return MultiModalProcessorTimingStats() + + with self._timing_stats_registry_lock: + if request_id in self.timing_stats_registry: + raise ValueError( + f"Timing stats already exist for request_id: {request_id}" + ) + stats = MultiModalProcessorTimingStats() + self.timing_stats_registry[request_id] = stats + return stats + + def clear_timing_stats_registry(self) -> int: + """ + Clear all stats from the registry. Returns the number of stats cleared. + """ + if ( + self.observability_config is None + or not self.observability_config.enable_mm_processor_stats + ): + return 0 + with self._timing_stats_registry_lock: + count = len(self.timing_stats_registry) + self.timing_stats_registry.clear() + return count + + def get_all_timing_stats(self) -> dict[str, dict[str, float]]: + """ + Get all timing stats as a dictionary for API endpoints. + """ + if ( + self.observability_config is None + or not self.observability_config.enable_mm_processor_stats + ): + return {} + with self._timing_stats_registry_lock: + return { + rid: stats.to_dict() + for rid, stats in self.timing_stats_registry.items() + } + class BaseProcessingInfo: """Base class to provide the information necessary for data processing.""" @@ -1502,11 +1707,12 @@ def _call_hf_processor( Call the HF processor on the prompt text and associated multi-modal data. """ - return self.info.ctx.call_hf_processor( - self.info.get_hf_processor(**mm_kwargs), - dict(text=prompt, **mm_data), - dict(**mm_kwargs, **tok_kwargs), - ) + with _timed_operation(self.info.ctx, "hf_processor"): + return self.info.ctx.call_hf_processor( + self.info.get_hf_processor(**mm_kwargs), + dict(text=prompt, **mm_data), + dict(**mm_kwargs, **tok_kwargs), + ) def _hf_processor_applies_updates( self, @@ -1854,12 +2060,13 @@ def _apply_hf_processor( ) # Use overrides if provided; fallback to data-dependent hashing. 
- mm_hashes = self._hash_mm_items( - mm_data_items, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids, - ) + with _timed_operation(self.info.ctx, "hashing"): + mm_hashes = self._hash_mm_items( + mm_data_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_uuids=mm_uuids, + ) mm_prompt_updates = self._get_mm_prompt_updates( mm_data_items, @@ -1900,18 +2107,20 @@ def _cached_apply_hf_processor( mm_uuids=mm_uuids, ) - mm_hashes = self._hash_mm_items( - mm_data_items, - hf_processor_mm_kwargs, - tokenization_kwargs, - mm_uuids=mm_uuids, - ) + with _timed_operation(self.info.ctx, "hashing"): + mm_hashes = self._hash_mm_items( + mm_data_items, + hf_processor_mm_kwargs, + tokenization_kwargs, + mm_uuids=mm_uuids, + ) - mm_is_cached, mm_missing_data_items = self._get_cache_missing_items( - cache=cache, - mm_data_items=mm_data_items, - mm_hashes=mm_hashes, - ) + with _timed_operation(self.info.ctx, "cache_lookup"): + mm_is_cached, mm_missing_data_items = self._get_cache_missing_items( + cache=cache, + mm_data_items=mm_data_items, + mm_hashes=mm_hashes, + ) # NOTE: `prompt` does not correspond to `mm_missing_data_items`, # so we can't apply prompt updates until the new multimodal @@ -1941,13 +2150,14 @@ def _cached_apply_hf_processor( mm_missing_kwargs, ) - mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs( - cache, - mm_hashes=mm_hashes, - mm_is_cached=mm_is_cached, - mm_missing_kwargs=mm_missing_kwargs, - mm_missing_prompt_updates=mm_missing_prompt_updates, - ) + with _timed_operation(self.info.ctx, "cache_lookup"): + mm_kwargs, mm_prompt_updates = self._merge_mm_kwargs( + cache, + mm_hashes=mm_hashes, + mm_is_cached=mm_is_cached, + mm_missing_kwargs=mm_missing_kwargs, + mm_missing_prompt_updates=mm_missing_prompt_updates, + ) mm_info = MultiModalProcessingInfo( kwargs=mm_kwargs, @@ -2129,6 +2339,10 @@ def apply( 3. Extract information about the placeholder tokens from the processed token IDs. 
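Downstream, `get_timing_stats_from_engine_client` and `get_all_timing_stats` expose the registry as plain `{request_id: {stage: seconds}}` dicts. A small consumer that renders those dicts for logging or an API response might look like this; the formatting helper and the sample numbers are hypothetical.

```python
def format_mm_processor_stats(all_stats: dict[str, dict[str, float]]) -> list[str]:
    """Turn {request_id: {stage: seconds}} into human-readable lines."""
    lines = []
    for request_id, stats in sorted(all_stats.items()):
        total = stats.get("total_time", 0.0)
        lines.append(f"{request_id}: total={total * 1e3:.1f}ms")
        for stage, seconds in stats.items():
            if stage != "total_time":
                lines.append(f"  {stage}={seconds * 1e3:.1f}ms")
    return lines


example = {
    "req-0": {"hf_processor_time": 0.012, "hashing_time": 0.001, "total_time": 0.013}
}
print("\n".join(format_mm_processor_stats(example)))
```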
""" + request_id = get_current_request_id() + if request_id is not None: + self.info.ctx.create_timing_stats(request_id) + mm_items = self._to_mm_items(mm_data) if tokenization_kwargs is None: @@ -2147,13 +2361,14 @@ def apply( ) # NOTE: tokenization_kwargs are not required to init processor - prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates( - mm_items=mm_items, - prompt_ids=prompt_ids, - mm_kwargs=mm_info.kwargs, - mm_prompt_updates=mm_info.prompt_updates, - is_update_applied=is_update_applied, - ) + with _timed_operation(self.info.ctx, "prompt_update"): + prompt_ids, mm_placeholders = self._maybe_apply_prompt_updates( + mm_items=mm_items, + prompt_ids=prompt_ids, + mm_kwargs=mm_info.kwargs, + mm_prompt_updates=mm_info.prompt_updates, + is_update_applied=is_update_applied, + ) mm_placeholder_ranges = { modality: [item.to_range() for item in placeholders] diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py index 1e7fe8648ab7..ed6a893288d3 100644 --- a/vllm/multimodal/registry.py +++ b/vllm/multimodal/registry.py @@ -5,6 +5,7 @@ from typing import TYPE_CHECKING, Generic, Protocol, TypeVar, cast from vllm.config.multimodal import BaseDummyOptions +from vllm.config.observability import ObservabilityConfig from vllm.logger import init_logger from vllm.tokenizers import TokenizerLike, cached_tokenizer_from_config @@ -22,7 +23,7 @@ ) if TYPE_CHECKING: - from vllm.config import ModelConfig + from vllm.config import ModelConfig, ObservabilityConfig from vllm.model_executor.models.interfaces import SupportsMultiModal logger = init_logger(__name__) @@ -148,6 +149,7 @@ def get_max_tokens_per_item_by_modality( *, cache: BaseMultiModalProcessorCache | None = None, profiler_limits: Mapping[str, int] | None = None, + observability_config: ObservabilityConfig | None = None, ) -> Mapping[str, int]: """ Get the maximum number of tokens per data item from each modality based @@ -156,7 +158,9 @@ def get_max_tokens_per_item_by_modality( if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor( + model_config, observability_config, cache=cache + ) profiler: MultiModalProfiler = MultiModalProfiler(processor) seq_len = model_config.max_model_len @@ -174,6 +178,7 @@ def get_mm_limits_per_prompt( model_config: "ModelConfig", *, cache: BaseMultiModalProcessorCache | None = None, + observability_config: ObservabilityConfig | None = None, ) -> Mapping[str, int]: """ Get the maximum number of multi-modal input instances for each modality @@ -182,7 +187,9 @@ def get_mm_limits_per_prompt( if not model_config.is_multimodal_model: return {} - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor( + model_config, observability_config, cache=cache + ) profiler: MultiModalProfiler = MultiModalProfiler(processor) return profiler.get_mm_limits() @@ -231,27 +238,32 @@ def _get_model_cls(self, model_config: "ModelConfig") -> "SupportsMultiModal": def _create_processing_ctx( self, model_config: "ModelConfig", + observability_config: "ObservabilityConfig | None" = None, tokenizer: TokenizerLike | None = None, ) -> InputProcessingContext: if tokenizer is None and not model_config.skip_tokenizer_init: tokenizer = cached_tokenizer_from_config(model_config) - return InputProcessingContext(model_config, tokenizer) + return InputProcessingContext( + model_config, tokenizer, observability_config=observability_config + ) def _create_processing_info( self, model_config: 
"ModelConfig", + observability_config: "ObservabilityConfig | None" = None, *, tokenizer: TokenizerLike | None = None, ) -> BaseProcessingInfo: model_cls = self._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = self._create_processing_ctx(model_config, tokenizer) + ctx = self._create_processing_ctx(model_config, observability_config, tokenizer) return factories.info(ctx) def create_processor( self, model_config: "ModelConfig", + observability_config: "ObservabilityConfig | None" = None, *, tokenizer: TokenizerLike | None = None, cache: BaseMultiModalProcessorCache | None = None, @@ -265,7 +277,7 @@ def create_processor( model_cls = self._get_model_cls(model_config) factories = model_cls._processor_factory - ctx = self._create_processing_ctx(model_config, tokenizer) + ctx = self._create_processing_ctx(model_config, observability_config, tokenizer) return factories.build_processor(ctx, cache=cache) @@ -276,13 +288,16 @@ def get_decoder_dummy_data( mm_counts: Mapping[str, int] | None = None, *, cache: BaseMultiModalProcessorCache | None = None, + observability_config: ObservabilityConfig | None = None, ) -> DummyDecoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by `model_config`. """ - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor( + model_config, observability_config, cache=cache + ) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. @@ -309,13 +324,16 @@ def get_encoder_dummy_data( mm_counts: Mapping[str, int] | None = None, *, cache: BaseMultiModalProcessorCache | None = None, + observability_config: ObservabilityConfig | None = None, ) -> DummyEncoderData: """ Create dummy data for profiling the memory usage of a model. The model is identified by `model_config`. """ - processor = self.create_processor(model_config, cache=cache) + processor = self.create_processor( + model_config, observability_config, cache=cache + ) profiler: MultiModalProfiler = MultiModalProfiler(processor) # Extract configurable options from multimodal config. @@ -349,8 +367,8 @@ def get_encdec_max_encoder_len(self, model_config: "ModelConfig") -> int: # than whisper. return 0 assert len(max_tokens) == 1, ( - "Encoder-decoder models are expected \ - to implement the multimodal interface with at most one modality." + "Encoder-decoder models are expected " + "to implement the multimodal interface with at most one modality." 
) first_modality = next(iter(max_tokens)) diff --git a/vllm/outputs.py b/vllm/outputs.py index cdfe06f1c7fa..74e534ef0c07 100644 --- a/vllm/outputs.py +++ b/vllm/outputs.py @@ -13,7 +13,6 @@ from vllm.logprobs import PromptLogprobs, SampleLogprobs from vllm.lora.request import LoRARequest from vllm.multimodal.inputs import MultiModalPlaceholderDict -from vllm.sequence import RequestMetrics from vllm.v1.metrics.stats import RequestStateStats logger = init_logger(__name__) @@ -113,7 +112,7 @@ def __init__( prompt_logprobs: PromptLogprobs | None, outputs: list[CompletionOutput], finished: bool, - metrics: RequestMetrics | RequestStateStats | None = None, + metrics: RequestStateStats | None = None, lora_request: LoRARequest | None = None, encoder_prompt: str | None = None, encoder_prompt_token_ids: list[int] | None = None, diff --git a/vllm/platforms/interface.py b/vllm/platforms/interface.py index d4b40045df38..3bea498f1b87 100644 --- a/vllm/platforms/interface.py +++ b/vllm/platforms/interface.py @@ -11,6 +11,7 @@ import numpy as np import torch +from typing_extensions import deprecated from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger @@ -365,6 +366,10 @@ def inference_mode(cls): return torch.inference_mode(mode=True) @classmethod + @deprecated( + "`seed_everything` is deprecated. It will be removed in v0.15.0 or later. " + "Please use `vllm.utils.torch_utils.set_random_seed` instead." + ) def seed_everything(cls, seed: int | None = None) -> None: """ Set the seed of each random module. @@ -689,6 +694,13 @@ def check_max_model_len(cls, max_model_len: int) -> int: """ return max_model_len + @classmethod + def set_additional_forward_context(cls, *args, **kwargs) -> dict[str, Any]: + """ + Set some additional forward context for the current platform if needs. 
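`Platform.seed_everything` is retained above but marked with `typing_extensions.deprecated`, steering callers toward the `set_random_seed` helper this diff adds to `vllm/utils/torch_utils.py`. A minimal sketch of that shim pattern follows, with a simplified `Platform` class standing in for the real interface and a simplified method body for illustration.

```python
import random

import numpy as np
import torch
from typing_extensions import deprecated


def set_random_seed(seed: int | None) -> None:
    """Seed Python, NumPy, and torch RNGs; no-op when seed is None."""
    if seed is not None:
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)


class Platform:
    @classmethod
    @deprecated("`seed_everything` is deprecated; use `set_random_seed` instead.")
    def seed_everything(cls, seed: int | None = None) -> None:
        # Simplified body: the old entry point only forwards to the new helper,
        # while @deprecated lets type checkers flag remaining call sites.
        set_random_seed(seed)


Platform.seed_everything(0)  # still works, but is reported as deprecated
```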
+ """ + return {} + class UnspecifiedPlatform(Platform): _enum = PlatformEnum.UNSPECIFIED diff --git a/vllm/platforms/rocm.py b/vllm/platforms/rocm.py index 785e457fc9f2..278be5a71a40 100644 --- a/vllm/platforms/rocm.py +++ b/vllm/platforms/rocm.py @@ -8,7 +8,6 @@ import torch import vllm.envs as envs -from vllm.attention.backends.abstract import AttentionType from vllm.attention.backends.registry import AttentionBackendEnum from vllm.logger import init_logger from vllm.utils.torch_utils import cuda_device_count_stateless @@ -289,14 +288,6 @@ def get_attn_backend_cls( logger.info("Using Aiter Flash Attention backend.") return AttentionBackendEnum.ROCM_AITER_FA.get_path() - # Priority 5: If model is Encoder-only self-attention type - if ( - attn_selector_config.attn_type is not None - and attn_selector_config.attn_type == AttentionType.ENCODER_ONLY - ): - logger.info("Using FlexAttention backend.") - return AttentionBackendEnum.FLEX_ATTENTION.get_path() - # Default: Triton Unified Attention logger.info("Using Triton Attention backend.") return AttentionBackendEnum.TRITON_ATTN.get_path() diff --git a/vllm/reasoning/deepseek_v3_reasoning_parser.py b/vllm/reasoning/deepseek_v3_reasoning_parser.py index 4e6758586bf4..efb080276e46 100644 --- a/vllm/reasoning/deepseek_v3_reasoning_parser.py +++ b/vllm/reasoning/deepseek_v3_reasoning_parser.py @@ -24,9 +24,9 @@ class DeepSeekV3ReasoningParser(ReasoningParser): def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): super().__init__(tokenizer, *args, **kwargs) - chat_kwargs = kwargs.pop("chat_template_kwargs", {}) or {} - thinking = bool(chat_kwargs.pop("thinking", False)) - enable_thinking = bool(chat_kwargs.pop("enable_thinking", False)) + chat_kwargs = kwargs.get("chat_template_kwargs", {}) or {} + thinking = bool(chat_kwargs.get("thinking", False)) + enable_thinking = bool(chat_kwargs.get("enable_thinking", False)) thinking = thinking or enable_thinking if thinking: diff --git a/vllm/reasoning/glm4_moe_reasoning_parser.py b/vllm/reasoning/glm4_moe_reasoning_parser.py index 1871adcd4321..466819f8b45b 100644 --- a/vllm/reasoning/glm4_moe_reasoning_parser.py +++ b/vllm/reasoning/glm4_moe_reasoning_parser.py @@ -1,171 +1,13 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project -from collections.abc import Sequence +from vllm.reasoning.holo2_reasoning_parser import Holo2ReasoningParser -from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import ChatCompletionRequest, DeltaMessage -from vllm.logger import init_logger -from vllm.reasoning import ReasoningParser - -logger = init_logger(__name__) - - -class Glm4MoeModelReasoningParser(ReasoningParser): +class Glm4MoeModelReasoningParser(Holo2ReasoningParser): """ - Reasoning parser for the Glm4MoeModel model. - - The Glm4MoeModel model uses ... tokens to denote reasoning - text within its output. The model provides a strict switch to disable - reasoning output via the 'enable_thinking=False' parameter. This parser - extracts the reasoning content enclosed by and tokens - from the model's output. + Reasoning parser for the Glm4MoeModel model,which inherits from + `Holo2ReasoningParser`. 
""" - def __init__(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs): - super().__init__(tokenizer, *args, **kwargs) - self.think_start_token = "" - self.think_end_token = "" - self.assistant_token = "<|assistant|>" - - if not self.model_tokenizer: - raise ValueError( - "The model tokenizer must be passed to the ReasoningParser " - "constructor during construction." - ) - - self.think_start_token_id = self.vocab.get(self.think_start_token) - self.think_end_token_id = self.vocab.get(self.think_end_token) - self.assistant_token_id = self.vocab.get(self.assistant_token) - if ( - self.think_start_token_id is None - or self.think_end_token_id is None - or self.assistant_token_id is None - ): - raise RuntimeError( - "Glm4MoeModel reasoning parser could not locate " - "think start/end or assistant tokens in the tokenizer!" - ) - - def is_reasoning_end(self, input_ids: list[int]) -> bool: - """ - GLM's chat template has tokens after every - <|assistant|> token. Thus, we need to check if is - after the most recent <|assistant|> token (if present). - """ - for token_id in input_ids[::-1]: - if token_id == self.think_end_token_id: - return True - elif token_id == self.assistant_token_id: - return False - return False - - def extract_content_ids(self, input_ids: list[int]) -> list[int]: - """ - Extract the content after the end tokens - """ - if self.think_end_token_id not in input_ids[:-1]: - return [] - else: - return input_ids[input_ids.index(self.think_end_token_id) + 1 :] - - def extract_reasoning_streaming( - self, - previous_text: str, - current_text: str, - delta_text: str, - previous_token_ids: Sequence[int], - current_token_ids: Sequence[int], - delta_token_ids: Sequence[int], - ) -> DeltaMessage | None: - """ - Extract reasoning content from a delta message. - Handles streaming output where previous + delta = current. - Uses token IDs for faster processing. 
- For text abcxyz: - - 'abc' goes to reasoning - - 'xyz' goes to content - """ - # Skip single special tokens - if len(delta_token_ids) == 1 and ( - delta_token_ids[0] in [self.think_start_token_id, self.think_end_token_id] - ): - return None - - if self.think_start_token_id in previous_token_ids: - if self.think_end_token_id in delta_token_ids: - # in previous, in delta, - # extract reasoning content - end_index = delta_text.find(self.think_end_token) - reasoning = delta_text[:end_index] - content = delta_text[end_index + len(self.think_end_token) :] - return DeltaMessage( - reasoning=reasoning, - content=content if content else None, - ) - elif self.think_end_token_id in previous_token_ids: - # in previous, in previous, - # reasoning content continues - return DeltaMessage(content=delta_text) - else: - # in previous, no in previous or delta, - # reasoning content continues - return DeltaMessage(reasoning=delta_text) - elif self.think_start_token_id in delta_token_ids: - if self.think_end_token_id in delta_token_ids: - # in delta, in delta, extract reasoning content - start_index = delta_text.find(self.think_start_token) - end_index = delta_text.find(self.think_end_token) - reasoning = delta_text[ - start_index + len(self.think_start_token) : end_index - ] - content = delta_text[end_index + len(self.think_end_token) :] - return DeltaMessage( - reasoning=reasoning, - content=content if content else None, - ) - else: - # in delta, no in delta, - # reasoning content continues - return DeltaMessage(reasoning=delta_text) - else: - # thinking is disabled, just content - return DeltaMessage(content=delta_text) - - def extract_reasoning( - self, model_output: str, request: ChatCompletionRequest - ) -> tuple[str | None, str | None]: - """ - Extract reasoning content from the model output. - - For text abcxyz: - - 'abc' goes to reasoning - - 'xyz' goes to content - - Returns: - tuple[Optional[str], Optional[str]]: reasoning content and content - """ - - # Check if the model output contains the and tokens. - if ( - self.think_start_token not in model_output - or self.think_end_token not in model_output - ): - return None, model_output - # Check if the is present in the model output, remove it - # if it is present. - model_output_parts = model_output.partition(self.think_start_token) - model_output = ( - model_output_parts[2] if model_output_parts[1] else model_output_parts[0] - ) - # Check if the model output contains the tokens. - # If the end token is not found, return the model output as is. - if self.think_end_token not in model_output: - return None, model_output - - # Extract reasoning content from the model output. - reasoning, _, content = model_output.partition(self.think_end_token) - - final_content = content or None - return reasoning, final_content + pass diff --git a/vllm/reasoning/holo2_reasoning_parser.py b/vllm/reasoning/holo2_reasoning_parser.py index f80190d28d6a..3b5bfd838017 100644 --- a/vllm/reasoning/holo2_reasoning_parser.py +++ b/vllm/reasoning/holo2_reasoning_parser.py @@ -46,9 +46,10 @@ def __init__(self, tokenizer: TokenizerLike, *args, **kwargs): # all requests in the structured output manager. So it is important that without # user specified chat template args, the default thinking is True. 
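The reasoning-parser changes in this stretch (the DeepSeek-V3 parser above and the Holo2 default handling just below) read the thinking switches from `chat_template_kwargs` with `.get()` rather than `.pop()`, so the kwargs survive for later consumers, and they accept both the `thinking` and `enable_thinking` spellings. A reduced sketch of the Holo2-style selection, using placeholder parser classes that are not the vLLM implementations:

```python
class DeepSeekR1Like:
    """Placeholder for the parser that strips <think>...</think> blocks."""


class IdentityLike:
    """Placeholder for the pass-through parser."""


def pick_parser(chat_template_kwargs: dict | None):
    chat_kwargs = chat_template_kwargs or {}
    # .get() (not .pop()) keeps chat_template_kwargs unmodified for other users.
    thinking = bool(chat_kwargs.get("thinking", True))
    enable_thinking = bool(chat_kwargs.get("enable_thinking", True))
    # Only disable reasoning extraction when a user explicitly turns it off.
    if thinking and enable_thinking:
        return DeepSeekR1Like()
    return IdentityLike()


assert isinstance(pick_parser(None), DeepSeekR1Like)
assert isinstance(pick_parser({"enable_thinking": False}), IdentityLike)
```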
- enable_thinking = bool(chat_kwargs.get("thinking", True)) - - if enable_thinking: + thinking = bool(chat_kwargs.get("thinking", True)) + enable_thinking = bool(chat_kwargs.get("enable_thinking", True)) + thinking = thinking and enable_thinking + if thinking: self._parser = DeepSeekR1ReasoningParser(tokenizer, *args, **kwargs) else: self._parser = IdentityReasoningParser(tokenizer, *args, **kwargs) diff --git a/vllm/sequence.py b/vllm/sequence.py index 6d20ca9aac22..3e12f148b22e 100644 --- a/vllm/sequence.py +++ b/vllm/sequence.py @@ -12,40 +12,6 @@ else: KVConnectorOutput = Any -VLLM_TOKEN_ID_ARRAY_TYPE = "l" - -VLLM_INVALID_TOKEN_ID = -1 - - -@dataclass -class RequestMetrics: - """Metrics associated with a request. - - Attributes: - arrival_time: The time when the request arrived. - first_scheduled_time: The time when the request was first scheduled. - first_token_time: The time when the first token was generated. - time_in_queue: The time the request spent in the queue. - finished_time: The time when the request was finished. - scheduler_time: The time spent in the scheduler when this request was - being considered by the scheduler. - model_forward_time: The time spent in the model forward pass when this - request was in the batch. - model_execute_time: The time spent in the model execute function. This - will include model forward, block/sync across - workers, cpu-gpu sync time and sampling time. - """ - - arrival_time: float - last_token_time: float - first_scheduled_time: float | None - first_token_time: float | None - time_in_queue: float | None - finished_time: float | None = None - scheduler_time: float | None = None - model_forward_time: float | None = None - model_execute_time: float | None = None - # cannot use msgspec.Struct here because Dynamo does not support it @dataclass diff --git a/vllm/tokenizers/deepseek_v32.py b/vllm/tokenizers/deepseek_v32.py index d519b61ddb76..4402054c9a5d 100644 --- a/vllm/tokenizers/deepseek_v32.py +++ b/vllm/tokenizers/deepseek_v32.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path -from typing import Any +from typing import Any, overload from transformers import BatchEncoding @@ -65,6 +65,7 @@ def apply_chat_template( drop_thinking = messages[-1]["role"] == "user" encode_config = dict(thinking_mode=thinking_mode, drop_thinking=drop_thinking) + prompt_str = encode_messages(messages, **encode_config) # type: ignore if kwargs.get("tokenize", True): @@ -161,6 +162,15 @@ def encode( add_special_tokens=add_special_tokens, ) + @overload + def convert_tokens_to_ids(self, tokens: str) -> int: ... + + @overload + def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ... 
+ + def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]: + return self.tokenizer.convert_tokens_to_ids(tokens) + def convert_tokens_to_string(self, tokens: list[str]) -> str: return self.tokenizer.convert_tokens_to_string(tokens) diff --git a/vllm/tokenizers/deepseek_v32_encoding.py b/vllm/tokenizers/deepseek_v32_encoding.py index 521bd9295931..0c42699e5703 100644 --- a/vllm/tokenizers/deepseek_v32_encoding.py +++ b/vllm/tokenizers/deepseek_v32_encoding.py @@ -169,6 +169,7 @@ def render_message( response_format = msg.get("response_format") tool_calls = msg.get("tool_calls") reasoning_content = msg.get("reasoning") or msg.get("reasoning_content") + is_prefix = msg.get("prefix", False) if tools: tools = tools_from_openai_format(tools) @@ -273,11 +274,14 @@ def render_message( + thinking_end_token ) - prompt += assistant_msg_template.format( - reasoning=thinking_part, - content=summary_content, - tool_calls=tool_calls_content, - ) + if not tool_calls and is_prefix: + prompt += summary_content + else: + prompt += assistant_msg_template.format( + reasoning=thinking_part, + content=summary_content, + tool_calls=tool_calls_content, + ) else: raise NotImplementedError(f"Unknown role: {role}") diff --git a/vllm/tokenizers/mistral.py b/vllm/tokenizers/mistral.py index 090286228dda..35a11e95b8bd 100644 --- a/vllm/tokenizers/mistral.py +++ b/vllm/tokenizers/mistral.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path -from typing import TYPE_CHECKING, Any, cast +from typing import TYPE_CHECKING, Any, cast, overload from mistral_common.protocol.instruct.request import ( ChatCompletionRequest as MistralChatCompletionRequest, @@ -441,6 +441,15 @@ def batch_decode( ids, skip_special_tokens=skip_special_tokens ) + @overload + def convert_tokens_to_ids(self, tokens: str) -> int: ... + + @overload + def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ... + + def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]: + return self.transformers_tokenizer.convert_tokens_to_ids(tokens) + def convert_tokens_to_string(self, tokens: list[str]) -> str: to_decode_special_tokens = {SpecialTokens.tool_calls} if self.is_tekken: diff --git a/vllm/tokenizers/protocol.py b/vllm/tokenizers/protocol.py index 28754f9e10d0..21e5b3a7bbdd 100644 --- a/vllm/tokenizers/protocol.py +++ b/vllm/tokenizers/protocol.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path -from typing import TYPE_CHECKING, Any, Protocol +from typing import TYPE_CHECKING, Any, Protocol, overload if TYPE_CHECKING: from transformers import BatchEncoding @@ -100,6 +100,15 @@ def apply_chat_template( ) -> str | list[int]: raise NotImplementedError + @overload + def convert_tokens_to_ids(self, tokens: str) -> int: ... + + @overload + def convert_tokens_to_ids(self, tokens: list[str]) -> list[int]: ... 
+ + def convert_tokens_to_ids(self, tokens: str | list[str]) -> int | list[int]: + raise NotImplementedError + def convert_tokens_to_string(self, tokens: list[str]) -> str: raise NotImplementedError diff --git a/vllm/tool_parsers/glm4_moe_tool_parser.py b/vllm/tool_parsers/glm4_moe_tool_parser.py index ebfd91297b41..6ad7d7cb460c 100644 --- a/vllm/tool_parsers/glm4_moe_tool_parser.py +++ b/vllm/tool_parsers/glm4_moe_tool_parser.py @@ -56,6 +56,20 @@ def __init__(self, tokenizer: TokenizerLike): self.tool_call_end_token_id = self.vocab.get(self.tool_call_end_token) self._buffer = "" + def adjust_request(self, request: ChatCompletionRequest) -> ChatCompletionRequest: + """ + Adjust request parameters to ensure tool call tokens are not skipped + during tokenizer decoding. + """ + request = super().adjust_request(request) + if request.tools and request.tool_choice != "none": + # Ensure tool call tokens (, ) are not skipped + # during decoding. Even though they are not marked as special tokens, + # setting skip_special_tokens=False ensures proper handling in + # transformers 5.x where decoding behavior may have changed. + request.skip_special_tokens = False + return request + def extract_tool_calls( self, model_output: str, diff --git a/vllm/transformers_utils/configs/radio.py b/vllm/transformers_utils/configs/radio.py index 2b6544fb273c..ddd72db1aedd 100644 --- a/vllm/transformers_utils/configs/radio.py +++ b/vllm/transformers_utils/configs/radio.py @@ -2,6 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project """Radio vision model configuration""" +from typing import Any + from transformers.configuration_utils import PretrainedConfig from transformers.utils import logging @@ -36,12 +38,15 @@ class RadioConfig(PretrainedConfig): layer_norm_eps: The epsilon used by the layer normalization layers. initializer_factor: A factor for initializing all weight matrices. hidden_act: The non-linear activation function in the encoder. - max_img_size: Maximum image size for position embeddings. + cpe_max_size: Maximum image size for position embeddings. norm_mean: Mean values for image normalization (RGB channels). Defaults to (0.48145466, 0.4578275, 0.40821073)). norm_std: Standard deviation values for image normalization (RGB channels). Defaults to (0.26862954, 0.26130258, 0.27577711)). - reg_tokens: Number of register tokens to use. + register_multiple: Number of register tokens to use. + teachers: A list of teacher model configurations. Each teacher configuration is + a dict with keys like "name" and some may have "use_summary". + cls_token_per_teacher: Whether to use a separate CLS token for each teacher. 
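`RadioConfig` now mirrors the upstream field names (`cpe_max_size`, `register_multiple`) and carries a `teachers` list plus a `cls_token_per_teacher` flag, which the radio.py hunk earlier in this diff uses to size the CLS tokens and pick summary indices. A small worked example of that sizing logic; the teacher entries are made up for illustration and do not come from a real checkpoint.

```python
radio_cfg = {
    "cpe_max_size": 2048,
    "register_multiple": 8,
    "cls_token_per_teacher": True,
    "teachers": [
        {"name": "clip", "use_summary": True},
        {"name": "clip", "use_summary": False},  # duplicate name counted once
        {"name": "dino", "use_summary": True},
    ],
}

# Mirrors the ViTPatchGenerator sizing in radio.py: one CLS token per unique
# teacher when cls_token_per_teacher is set, otherwise a single CLS token.
unique_teachers = {t["name"] for t in radio_cfg["teachers"]}
num_cls_tokens = len(unique_teachers) if radio_cfg["cls_token_per_teacher"] else 1

# Summary indices keep only teachers whose summary output is used.
summary_idxs = [
    i for i, t in enumerate(radio_cfg["teachers"]) if t.get("use_summary", True)
]
print(num_cls_tokens, summary_idxs)  # 2 [0, 2]
```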
""" model_type = "radio" @@ -57,10 +62,12 @@ def __init__( layer_norm_eps: float = 1e-6, initializer_factor: float = 1.0, hidden_act: str = "gelu", - max_img_size: int = 2048, + cpe_max_size: int = 2048, norm_mean: tuple[float, float, float] | list = OPENAI_CLIP_MEAN, norm_std: tuple[float, float, float] | list = OPENAI_CLIP_STD, - reg_tokens: int | None = None, + register_multiple: int | None = None, + teachers: list[dict[str, Any]] | None = None, + cls_token_per_teacher: bool = False, **kwargs, ): self.model_name = model_name @@ -78,12 +85,14 @@ def __init__( self.layer_norm_eps = layer_norm_eps self.initializer_factor = initializer_factor self.hidden_act = hidden_act - self.max_img_size = max_img_size + self.cpe_max_size = cpe_max_size self.norm_mean = ( list(norm_mean) if isinstance(norm_mean, (tuple, list)) else norm_mean ) self.norm_std = ( list(norm_std) if isinstance(norm_std, (tuple, list)) else norm_std ) - self.reg_tokens = reg_tokens + self.register_multiple = register_multiple + self.teachers = teachers if teachers is not None else [] + self.cls_token_per_teacher = cls_token_per_teacher super().__init__(**kwargs) diff --git a/vllm/transformers_utils/model_arch_config_convertor.py b/vllm/transformers_utils/model_arch_config_convertor.py new file mode 100644 index 000000000000..dc067a09419b --- /dev/null +++ b/vllm/transformers_utils/model_arch_config_convertor.py @@ -0,0 +1,402 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +from typing import final + +import torch +from safetensors.torch import _TYPES as _SAFETENSORS_TO_TORCH_DTYPE +from transformers import PretrainedConfig + +from vllm import envs +from vllm.config.model_arch import ( + ModelArchitectureConfig, +) +from vllm.config.utils import getattr_iter +from vllm.logger import init_logger +from vllm.transformers_utils.config import ( + try_get_safetensors_metadata, +) +from vllm.utils.torch_utils import common_broadcastable_dtype + +logger = init_logger(__name__) + + +class ModelArchConfigConvertorBase: + def __init__(self, hf_config: PretrainedConfig, hf_text_config: PretrainedConfig): + self.hf_config = hf_config + self.hf_text_config = hf_text_config + + def get_architectures(self) -> list[str]: + return getattr(self.hf_config, "architectures", []) + + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "num_hidden_layers", 0) + + def get_total_num_attention_heads(self) -> int: + return getattr(self.hf_text_config, "num_attention_heads", 0) + + def get_vocab_size(self) -> int: + return getattr(self.hf_text_config, "vocab_size", 0) + + def get_hidden_size(self) -> int: + return getattr(self.hf_text_config, "hidden_size", 0) + + def get_head_size(self) -> int: + if self.is_deepseek_mla(): + qk_rope_head_dim = getattr(self.hf_text_config, "qk_rope_head_dim", 0) + if not envs.VLLM_MLA_DISABLE: + return self.hf_text_config.kv_lora_rank + qk_rope_head_dim + else: + qk_nope_head_dim = getattr(self.hf_text_config, "qk_nope_head_dim", 0) + if qk_rope_head_dim and qk_nope_head_dim: + return qk_rope_head_dim + qk_nope_head_dim + + # NOTE: Some configs may set head_dim=None in the config + if getattr(self.hf_text_config, "head_dim", None) is not None: + return self.hf_text_config.head_dim + + # NOTE: Some models (such as PLaMo2.1) use `hidden_size_per_head` + if getattr(self.hf_text_config, "hidden_size_per_head", None) is not None: + return self.hf_text_config.hidden_size_per_head + + # FIXME(woosuk): This may not be true for all models. 
+ return ( + self.hf_text_config.hidden_size // self.hf_text_config.num_attention_heads + ) + + def get_total_num_kv_heads(self) -> int: + attributes = [ + # For Falcon: + "n_head_kv", + "num_kv_heads", + # For LLaMA-2: + "num_key_value_heads", + # For ChatGLM: + "multi_query_group_num", + ] + # For non-grouped-query attention models, the number of KV heads is + # equal to the number of attention heads. + default_factory = lambda: self.hf_text_config.num_attention_heads + return getattr_iter( + self.hf_text_config, attributes, default_factory=default_factory + ) + + def get_num_experts(self) -> int: + """Returns the number of experts in the model.""" + num_expert_names = [ + "num_experts", # Jamba + "moe_num_experts", # Dbrx + "n_routed_experts", # DeepSeek + "num_local_experts", # Mixtral + ] + num_experts = getattr_iter(self.hf_text_config, num_expert_names, 0) + if isinstance(num_experts, list): + # Ernie VL's remote code uses list[int]... + # The values are always the same so we just take the first one. + return num_experts[0] + # Coerce to 0 if explicitly set to None + return num_experts or 0 + + @final + @classmethod + def get_torch_dtype( + cls, hf_config: PretrainedConfig, model_id: str, revision: str | None + ): + # NOTE: getattr(config, "dtype", torch.float32) is not correct + # because config.dtype can be None. + config_dtype = getattr(hf_config, "dtype", None) + + # Fallbacks for multi-modal models if the root config + # does not define dtype + if config_dtype is None: + config_dtype = getattr(hf_config.get_text_config(), "dtype", None) + if config_dtype is None and hasattr(hf_config, "vision_config"): + config_dtype = getattr(hf_config.vision_config, "dtype", None) + if config_dtype is None and hasattr(hf_config, "encoder_config"): + config_dtype = getattr(hf_config.encoder_config, "dtype", None) + + # Try to read the dtype of the weights if they are in safetensors format + if config_dtype is None: + repo_mt = try_get_safetensors_metadata(model_id, revision=revision) + + if repo_mt and (files_mt := repo_mt.files_metadata): + param_dtypes: set[torch.dtype] = { + _SAFETENSORS_TO_TORCH_DTYPE[dtype_str] + for file_mt in files_mt.values() + for dtype_str in file_mt.parameter_count + if dtype_str in _SAFETENSORS_TO_TORCH_DTYPE + } + + if param_dtypes: + return common_broadcastable_dtype(param_dtypes) + + if config_dtype is None: + config_dtype = torch.float32 + + return config_dtype + + def _normalize_quantization_config(self, config: PretrainedConfig): + quant_cfg = getattr(config, "quantization_config", None) + if quant_cfg is None: + # compressed-tensors uses a "compression_config" key + quant_cfg = getattr(config, "compression_config", None) + + else: + # Set quant_method for ModelOpt models. 
+ producer_name = quant_cfg.get("producer", {}).get("name") + if producer_name == "modelopt": + quant_algo = quant_cfg.get("quantization", {}).get("quant_algo") + if quant_algo is not None: + quant_algo_upper = str(quant_algo).upper() + if quant_algo_upper in { + "FP8", + "FP8_PER_CHANNEL_PER_TOKEN", + "FP8_PB_WO", + }: + quant_cfg["quant_method"] = "modelopt" + elif quant_algo_upper == "NVFP4": + quant_cfg["quant_method"] = "modelopt_fp4" + else: + raise ValueError(f"Unknown ModelOpt quant algo: {quant_algo}") + + if quant_cfg is not None: + # Use the community standard 'quant_method' + quant_method = quant_cfg.get("quant_method", "").lower() + + # Normalize library names + quant_method = quant_method.replace( + "compressed_tensors", "compressed-tensors" + ) + + quant_cfg["quant_method"] = quant_method + + return quant_cfg + + def get_quantization_config(self): + quant_cfg = self._normalize_quantization_config(self.hf_config) + if quant_cfg is None and ( + text_config := getattr(self.hf_config, "text_config", None) + ): + # Check the text config as well for multi-modal models. + quant_cfg = self._normalize_quantization_config(text_config) + return quant_cfg + + def is_deepseek_mla(self) -> bool: + if not hasattr(self.hf_text_config, "model_type"): + return False + elif self.hf_text_config.model_type in ( + "deepseek_v2", + "deepseek_v3", + "deepseek_v32", + "deepseek_mtp", + "kimi_k2", + "kimi_linear", + "longcat_flash", + "pangu_ultra_moe", + "pangu_ultra_moe_mtp", + ): + return self.hf_text_config.kv_lora_rank is not None + elif self.hf_text_config.model_type == "eagle": + # if the model is an EAGLE module, check for the + # underlying architecture + return ( + self.hf_text_config.model.model_type + in ("deepseek_v2", "deepseek_v3", "deepseek_v32") + and self.hf_text_config.kv_lora_rank is not None + ) + return False + + def derive_max_model_len_and_key(self) -> tuple[float, str | None]: + derived_max_model_len = float("inf") + possible_keys = [ + # OPT + "max_position_embeddings", + # GPT-2 + "n_positions", + # MPT + "max_seq_len", + # ChatGLM2 + "seq_length", + # Command-R + "model_max_length", + # Whisper + "max_target_positions", + # Others + "max_sequence_length", + "max_seq_length", + "seq_len", + ] + # Choose the smallest "max_length" from the possible keys + max_len_key = None + for key in possible_keys: + max_len = getattr(self.hf_text_config, key, None) + if max_len is not None: + if max_len < derived_max_model_len: + max_len_key = key + derived_max_model_len = min(derived_max_model_len, max_len) + + # For Command-R / Cohere, Cohere2 / Aya Vision models + if tmp_max_len := getattr(self.hf_text_config, "model_max_length", None): + max_len_key = "model_max_length" + derived_max_model_len = tmp_max_len + return derived_max_model_len, max_len_key + + def convert(self) -> ModelArchitectureConfig: + model_arch_config = ModelArchitectureConfig( + architectures=self.get_architectures(), + model_type=self.hf_config.model_type, + text_model_type=getattr(self.hf_text_config, "model_type", None), + hidden_size=self.get_hidden_size(), + total_num_hidden_layers=self.get_num_hidden_layers(), + total_num_attention_heads=self.get_total_num_attention_heads(), + head_size=self.get_head_size(), + vocab_size=self.get_vocab_size(), + total_num_kv_heads=self.get_total_num_kv_heads(), + num_experts=self.get_num_experts(), + quantization_config=self.get_quantization_config(), + is_deepseek_mla=self.is_deepseek_mla(), + derived_max_model_len_and_key=self.derive_max_model_len_and_key(), + ) + + return 
model_arch_config + + +class MambaModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_head_size(self) -> int: + return 0 + + def get_total_num_kv_heads(self) -> int: + return 0 + + +class TerratorchModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_head_size(self) -> int: + return 0 + + def get_total_num_kv_heads(self) -> int: + return 0 + + +class MedusaModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_head_size(self) -> int: + return 0 + + def get_total_num_kv_heads(self) -> int: + return 0 + + +class Zamba2ModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_head_size(self) -> int: + return getattr(self.hf_text_config, "attention_head_dim", 0) + + +class FalconModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_total_num_kv_heads(self) -> int: + # NOTE: for falcon, when new_decoder_architecture is True, the + # multi_query flag is ignored and we use n_head_kv for the number of + # KV heads. + new_decoder_arch_falcon = getattr( + self.hf_text_config, "new_decoder_architecture", False + ) + + if not new_decoder_arch_falcon and getattr( + self.hf_text_config, "multi_query", False + ): + # Multi-query attention, only one KV head. + return 1 + + # Use the base implementation which checks n_head_kv, num_kv_heads, etc. + return super().get_total_num_kv_heads() + + +class MPTModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_total_num_kv_heads(self) -> int: + if "kv_n_heads" in self.hf_text_config.attn_config: + return self.hf_text_config.attn_config["kv_n_heads"] + return self.hf_text_config.num_attention_heads + + +class DbrxModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_total_num_kv_heads(self) -> int: + return getattr( + self.hf_text_config.attn_config, + "kv_n_heads", + self.hf_text_config.num_attention_heads, + ) + + +class NemotronNasModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_total_num_kv_heads(self) -> int: + for block in self.hf_text_config.block_configs: + if not block.attention.no_op: + return ( + self.hf_text_config.num_attention_heads + // block.attention.n_heads_in_group + ) + raise RuntimeError( + "Could not determine the number of key-value attention heads " + "from model configuration. " + f"Architecture: {self.get_architectures()}. " + "This usually indicates an unsupported model architecture or " + "missing configuration. 
" + "Please check if your model is supported at: " + "https://docs.vllm.ai/en/latest/models/supported_models.html" + ) + + +class DeepSeekMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "num_nextn_predict_layers", 0) + + +class MimoMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "num_nextn_predict_layers", 0) + + +class GLM4MoeMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "num_nextn_predict_layers", 0) + + +class ErnieMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "num_nextn_predict_layers", 0) + + +class Qwen3NextMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "num_nextn_predict_layers", 0) + + +class PanguUltraMoeMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "num_nextn_predict_layers", 0) + + +class LongCatFlashMTPModelArchConfigConvertor(ModelArchConfigConvertorBase): + def get_num_hidden_layers(self) -> int: + return getattr(self.hf_text_config, "num_nextn_predict_layers", 1) + + +# hf_config.model_type -> convertor class +MODEL_ARCH_CONFIG_CONVERTORS = { + "mamba": MambaModelArchConfigConvertor, + "falcon_mamba": MambaModelArchConfigConvertor, + "timm_wrapper": TerratorchModelArchConfigConvertor, + "medusa": MedusaModelArchConfigConvertor, + "zamba2": Zamba2ModelArchConfigConvertor, + "mpt": MPTModelArchConfigConvertor, + "dbrx": DbrxModelArchConfigConvertor, + "falcon": FalconModelArchConfigConvertor, + "RefinedWeb": FalconModelArchConfigConvertor, + "RefinedWebModel": FalconModelArchConfigConvertor, + "nemotron-nas": NemotronNasModelArchConfigConvertor, + "deepseek_mtp": DeepSeekMTPModelArchConfigConvertor, + "qwen3_next_mtp": Qwen3NextMTPModelArchConfigConvertor, + "mimo_mtp": MimoMTPModelArchConfigConvertor, + "glm4_moe_mtp": GLM4MoeMTPModelArchConfigConvertor, + "ernie_mtp": ErnieMTPModelArchConfigConvertor, + "pangu_ultra_moe_mtp": PanguUltraMoeMTPModelArchConfigConvertor, + "longcat_flash_mtp": LongCatFlashMTPModelArchConfigConvertor, +} diff --git a/vllm/utils/argparse_utils.py b/vllm/utils/argparse_utils.py index 87ee6f54c0c9..9c2cec876ee3 100644 --- a/vllm/utils/argparse_utils.py +++ b/vllm/utils/argparse_utils.py @@ -399,8 +399,7 @@ def _pull_args_from_config(self, args: list[str]) -> list[str]: index = args.index("--config") if index == len(args) - 1: raise ValueError( - "No config file specified! \ - Please check your command-line arguments." + "No config file specified! Please check your command-line arguments." 
) file_path = args[index + 1] diff --git a/vllm/utils/mem_utils.py b/vllm/utils/mem_utils.py index bf6d7846573b..dd91400f2b8a 100644 --- a/vllm/utils/mem_utils.py +++ b/vllm/utils/mem_utils.py @@ -22,7 +22,7 @@ def get_max_shared_memory_bytes(gpu: int = 0) -> int: max_shared_mem = ops.get_max_shared_memory_per_block_device_attribute(gpu) # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py # will fail - assert max_shared_mem > 0, "max_shared_mem can not be zero" + assert max_shared_mem > 0, "max_shared_mem cannot be zero" return int(max_shared_mem) @@ -154,12 +154,16 @@ class MemoryProfilingResult: non_kv_cache_memory: int = 0 torch_peak_increase: int = 0 non_torch_increase: int = 0 - weights_memory: float = 0 + weights_memory: int = 0 before_create: MemorySnapshot = field(default_factory=MemorySnapshot) - before_profile: MemorySnapshot = field(default_factory=MemorySnapshot) - after_profile: MemorySnapshot = field(default_factory=MemorySnapshot) profile_time: float = 0.0 + def __post_init__(self) -> None: + device = self.before_create.device_ + + self.before_profile = MemorySnapshot(device=device, auto_measure=False) + self.after_profile = MemorySnapshot(device=device, auto_measure=False) + def __repr__(self) -> str: return ( f"Memory profiling takes {self.profile_time:.2f} seconds. " @@ -175,9 +179,12 @@ def __repr__(self) -> str: @contextlib.contextmanager def memory_profiling( - baseline_snapshot: MemorySnapshot, weights_memory: int + baseline_snapshot: MemorySnapshot, + weights_memory: int = 0, ) -> Generator[MemoryProfilingResult, None, None]: - """Memory profiling context manager. + """ + Memory profiling context manager. + baseline_snapshot: the memory snapshot before the current vLLM instance. weights_memory: memory used by PyTorch when loading the model weights. Note that, before loading the model weights, we also initialize the device @@ -217,21 +224,24 @@ def memory_profiling( b. 2 GiB reserved for the peak activation tensors (category 2) c. 1 GiB used by non-torch components (category 3) - The memory used for loading weights (a.) is directly given from the argument `weights_memory`. + The memory used for loading weights (a.) is directly given from the + argument `weights_memory`. - The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` during profiling gives (b.). + The increase of `torch.cuda.memory_stats()["allocated_bytes.all.peak"]` + during profiling gives (b.). - The increase of `non_torch_memory` from creating the current vLLM instance until after profiling to get (c.). - """ # noqa + The increase of `non_torch_memory` from creating the current vLLM instance + until after profiling to get (c.). 
+ """ gc.collect() torch.cuda.empty_cache() - torch.cuda.reset_peak_memory_stats() - - result = MemoryProfilingResult() + torch.cuda.reset_peak_memory_stats(baseline_snapshot.device_) - result.before_create = baseline_snapshot - # the part of memory used for holding the model weights - result.weights_memory = weights_memory + result = MemoryProfilingResult( + before_create=baseline_snapshot, + # the part of memory used for holding the model weights + weights_memory=weights_memory, + ) result.before_profile.measure() @@ -252,4 +262,4 @@ def memory_profiling( peak_activation_memory = result.torch_peak_increase result.non_kv_cache_memory = ( non_torch_memory + peak_activation_memory + result.weights_memory - ) # noqa + ) diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index db596052a04d..ca0cecc4a0ad 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -3,6 +3,7 @@ import contextlib import importlib.metadata import os +import random import threading from collections.abc import Callable, Collection from functools import lru_cache @@ -278,6 +279,13 @@ def kv_cache_dtype_str_to_dtype( return STR_DTYPE_TO_TORCH_DTYPE[kv_cache_dtype] +def set_random_seed(seed: int | None) -> None: + if seed is not None: + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + + def create_kv_caches_with_random_flash( num_blocks: int, block_size: int, @@ -290,9 +298,7 @@ def create_kv_caches_with_random_flash( device: str | None = "cuda", cache_layout: str | None = "NHD", ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: - from vllm.platforms import current_platform - - current_platform.seed_everything(seed) + set_random_seed(seed) dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) generic_kv_cache_shape = (num_blocks, 2, block_size, num_heads, head_size) @@ -335,9 +341,8 @@ def create_kv_caches_with_random( raise ValueError( f"Does not support key cache of type fp8 with head_size {head_size}" ) - from vllm.platforms import current_platform - current_platform.seed_everything(seed) + set_random_seed(seed) dtype = get_kv_cache_torch_dtype(cache_dtype, model_dtype) diff --git a/vllm/v1/attention/backends/flashinfer.py b/vllm/v1/attention/backends/flashinfer.py index 623ae892ecda..7ef157384be1 100755 --- a/vllm/v1/attention/backends/flashinfer.py +++ b/vllm/v1/attention/backends/flashinfer.py @@ -870,7 +870,9 @@ def build( # Guard access to seq_lens_cpu, which may not always be needed # and can be expensive to retrieve in async mode. 
needs_seq_lens_cpu = self.use_dcp or use_cascade or not is_only_trtllm_decode - seq_lens_cpu = common_attn_metadata.seq_lens_cpu if needs_seq_lens_cpu else None + seq_lens_cpu = ( + common_attn_metadata.seq_lens.cpu() if needs_seq_lens_cpu else None + ) seq_lens_np = seq_lens_cpu.numpy() if seq_lens_cpu is not None else None num_blocks_np = ( (seq_lens_np + (page_size - 1)) // page_size diff --git a/vllm/v1/attention/backends/flex_attention.py b/vllm/v1/attention/backends/flex_attention.py index 8193c05c2b1a..a151a437a76a 100644 --- a/vllm/v1/attention/backends/flex_attention.py +++ b/vllm/v1/attention/backends/flex_attention.py @@ -727,9 +727,7 @@ def build( block_table_tensor, seq_lens, block_size, num_gpu_blocks ) - offset_tensor = common_attn_metadata.num_computed_tokens_cpu.to( - self.device, non_blocking=True - ) + offset_tensor = common_attn_metadata.compute_num_computed_tokens() out = FlexAttentionMetadata( causal=common_attn_metadata.causal, diff --git a/vllm/v1/attention/backends/mla/common.py b/vllm/v1/attention/backends/mla/common.py index e9ec96835f27..a47a2282fe49 100755 --- a/vllm/v1/attention/backends/mla/common.py +++ b/vllm/v1/attention/backends/mla/common.py @@ -791,7 +791,9 @@ def build( prefill_metadata = None if num_prefills > 0: - num_computed_tokens_cpu = common_attn_metadata.num_computed_tokens_cpu + num_computed_tokens_cpu = ( + common_attn_metadata.compute_num_computed_tokens().cpu() + ) reqs_start = num_decodes # prefill_start diff --git a/vllm/v1/attention/backends/mla/flashmla_sparse.py b/vllm/v1/attention/backends/mla/flashmla_sparse.py index 9dbb17b78a53..1122538969d6 100644 --- a/vllm/v1/attention/backends/mla/flashmla_sparse.py +++ b/vllm/v1/attention/backends/mla/flashmla_sparse.py @@ -511,7 +511,7 @@ def _build_fp8_separate_prefill_decode( # For pure decode batches, prefill_request_id will be None # For mixed batches, it will have -1 for decode and request_id for prefill if num_prefills > 0: - seq_lens_cpu = common_attn_metadata.seq_lens_cpu + seq_lens_cpu = common_attn_metadata.seq_lens.cpu() seq_lens = common_attn_metadata.seq_lens query_start_loc_cpu = common_attn_metadata.query_start_loc_cpu diff --git a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py index e8921f8a1c40..c6e3f92dc4a6 100644 --- a/vllm/v1/attention/backends/mla/rocm_aiter_mla.py +++ b/vllm/v1/attention/backends/mla/rocm_aiter_mla.py @@ -88,6 +88,13 @@ def __init__( # TODO: we can disambiguate between decode and mixed-prefill decode here # so we can only use the persistent buffer if a cudagraph is actually # being used. + + # paged_kv_last_page_len is always 1s (kernel block size is always 1), + # so we create it once and reuse slices in both eager and cudagraph modes. 
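+        # For example, a decode batch of N requests can simply take
+        # self.paged_kv_last_page_len[:N] as its last_page_len view,
+        # since every entry of the buffer is 1.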
+ self.paged_kv_last_page_len = torch.ones( + max_num_reqs, dtype=torch.int32, device=device + ) + if self.compilation_config.cudagraph_mode.has_full_cudagraphs(): self.paged_kv_indptr = torch.zeros( max_num_reqs + 1, dtype=torch.int32, device=device @@ -95,9 +102,6 @@ def __init__( self.paged_kv_indices = torch.zeros( max_num_pages, dtype=torch.int32, device=device ) - self.paged_kv_last_page_len = torch.zeros( - max_num_reqs, dtype=torch.int32, device=device - ) self.qo_indptr = torch.zeros( max_num_reqs + 1, dtype=torch.int32, device=device @@ -122,7 +126,9 @@ def _build_decode( ).unsqueeze(0) < seq_lens_device.unsqueeze(1) paged_kv_indices = block_table_tensor[mask] - paged_kv_last_page_len = torch.where(seq_lens_device == 0, 1, seq_lens_device) + # kernel block size is always 1, so each page has exactly 1 token. + # last_page_len is always 1 - just slice the pre-initialized buffer. + paged_kv_last_page_len = self.paged_kv_last_page_len[:num_reqs] paged_kv_indptr = torch.cat( [ @@ -148,11 +154,8 @@ def _build_decode( self.paged_kv_indptr[1 + num_reqs :].fill_(paged_kv_indptr[-1]) paged_kv_indptr = self.paged_kv_indptr[: 1 + num_reqs] - self.paged_kv_last_page_len[:num_reqs].copy_( - paged_kv_last_page_len, non_blocking=True - ) - self.paged_kv_last_page_len[num_reqs:].fill_(1) - paged_kv_last_page_len = self.paged_kv_last_page_len[:num_reqs] + # paged_kv_last_page_len already uses the pre-initialized buffer slice + # (set above), so no copy needed - buffer is always 1s. self.qo_indptr[: 1 + num_reqs].copy_( query_start_loc_device, non_blocking=True diff --git a/vllm/v1/attention/backends/triton_attn.py b/vllm/v1/attention/backends/triton_attn.py index ca7be990ca55..9bf440a04d06 100644 --- a/vllm/v1/attention/backends/triton_attn.py +++ b/vllm/v1/attention/backends/triton_attn.py @@ -13,6 +13,7 @@ AttentionType, MultipleOf, ) +from vllm.attention.ops.triton_prefill_attention import context_attention_fwd from vllm.attention.ops.triton_reshape_and_cache_flash import ( triton_reshape_and_cache_flash, ) @@ -220,7 +221,7 @@ def build( prefix_kv_lens = torch.tensor( [common_prefix_len], dtype=torch.int32, device=self.device ) - suffix_kv_lens = common_attn_metadata.seq_lens_cpu - common_prefix_len + suffix_kv_lens = common_attn_metadata.seq_lens.cpu() - common_prefix_len suffix_kv_lens = suffix_kv_lens.to(self.device) else: cu_prefix_query_lens = None @@ -289,6 +290,19 @@ def get_kv_cache_shape( raise ValueError("Block size must be a multiple of 16.") return (num_blocks, 2, block_size, num_kv_heads, head_size) + @staticmethod + def get_kv_cache_stride_order( + include_num_layers_dimension: bool = False, + ) -> tuple[int, ...]: + # `stride_order` indicates the permutation that gets + # us from `get_kv_cache_shape` to the actual memory layout we want. 
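+        # Reading of the permutation (illustrative): entry i names the logical
+        # dimension stored at physical position i, so (1, 0, 2, 3, 4, 5) places
+        # the num_layers dimension outermost in memory, while (0, 1, 2, 3, 4)
+        # is the identity and keeps the logical shape unchanged.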
+ if include_num_layers_dimension: + # (num_blocks, num_layers, 2, block_size, num_kv_heads, head_size) + return (1, 0, 2, 3, 4, 5) + + # (num_blocks, 2, block_size, num_kv_heads, head_size) + return (0, 1, 2, 3, 4) + @staticmethod def use_cascade_attention(*args, **kwargs) -> bool: return False @@ -309,6 +323,16 @@ def supports_mm_prefix(cls) -> bool: def supports_sink(cls) -> bool: return True + @classmethod + def supports_attn_type(cls, attn_type: str) -> bool: + """TritonAttention supports all attention types.""" + return attn_type in ( + AttentionType.DECODER, + AttentionType.ENCODER, + AttentionType.ENCODER_ONLY, + AttentionType.ENCODER_DECODER, + ) + @classmethod def supports_compute_capability(cls, capability: DeviceCapability) -> bool: return True @@ -341,6 +365,8 @@ def __init__( self.alibi_slopes = alibi_slopes if sliding_window is None: self.sliding_window = (-1, -1) + elif attn_type in (AttentionType.ENCODER, AttentionType.ENCODER_ONLY): + self.sliding_window = (sliding_window - 1, sliding_window - 1) else: self.sliding_window = (sliding_window - 1, 0) self.kv_cache_dtype = kv_cache_dtype @@ -352,10 +378,6 @@ def __init__( self.num_queries_per_kv = self.num_heads // self.num_kv_heads - if attn_type not in [AttentionType.DECODER, AttentionType.ENCODER_DECODER]: - raise NotImplementedError( - "Encoder self-attention is not implemented for TritonAttentionImpl" - ) self.attn_type = attn_type self.fp8_dtype = current_platform.fp8_dtype() @@ -417,6 +439,21 @@ def forward( # performance to make sure it does not introduce any overhead. num_actual_tokens = attn_metadata.num_actual_tokens + + # Handle encoder attention differently - no KV cache needed + if self.attn_type in (AttentionType.ENCODER_ONLY, AttentionType.ENCODER): + # For encoder attention, + # we use direct Q, K, V tensors without caching + return self._forward_encoder_attention( + query[:num_actual_tokens], + key[:num_actual_tokens], + value[:num_actual_tokens], + output[:num_actual_tokens], + attn_metadata, + layer, + ) + + # For decoder and cross-attention, use KV cache as before key_cache, value_cache = kv_cache.unbind(1) if ( @@ -495,3 +532,48 @@ def forward( ) return output + + def _forward_encoder_attention( + self, + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + output: torch.Tensor, + attn_metadata: TritonAttentionMetadata, + layer: torch.nn.Module, + ) -> torch.Tensor: + """Forward pass for encoder attention without KV cache. 
+
+        Args:
+            query: shape = [num_encoder_tokens, num_heads, head_size]
+            key: shape = [num_encoder_tokens, num_kv_heads, head_size]
+            value: shape = [num_encoder_tokens, num_kv_heads, head_size]
+            output: shape = [num_encoder_tokens, num_heads, head_size]
+            attn_metadata: Encoder attention metadata
+            layer: The attention layer
+        """
+        # FP8 KV cache quantization is not yet supported for encoder attention
+        if self.kv_cache_dtype.startswith("fp8"):
+            raise NotImplementedError(
+                "quantization is not supported for encoder attention"
+            )
+
+        # Use encoder-specific metadata for sequence information
+        query_start_loc = attn_metadata.query_start_loc
+        seq_lens = attn_metadata.seq_lens
+        max_query_len = attn_metadata.max_query_len
+
+        # Call the Triton prefill attention kernel directly on the Q, K, V tensors
+        context_attention_fwd(
+            q=query,
+            k=key,
+            v=value,
+            o=output,
+            b_start_loc=query_start_loc,
+            b_seq_len=seq_lens,
+            max_input_len=max_query_len,
+            is_causal=False,  # Encoder attention is bidirectional
+            sliding_window_q=self.sliding_window[0],
+            sliding_window_k=self.sliding_window[1],
+        )
+        return output
diff --git a/vllm/v1/attention/backends/utils.py b/vllm/v1/attention/backends/utils.py
index 6b94f786a26b..2c26d5bfa60d 100644
--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -4,7 +4,6 @@
 import enum
 import functools
 from abc import abstractmethod
-from collections.abc import Callable
 from dataclasses import dataclass, field, fields, make_dataclass
 from typing import (
     TYPE_CHECKING,
@@ -22,7 +21,6 @@
 from typing_extensions import deprecated, runtime_checkable
 
 from vllm.config import VllmConfig, get_layers_from_vllm_config
-from vllm.utils.math_utils import cdiv
 
 if TYPE_CHECKING:
     from vllm.v1.core.sched.output import SchedulerOutput
@@ -100,6 +98,8 @@ class CommonAttentionMetadata:
     _seq_lens_cpu: torch.Tensor | None = None
     _num_computed_tokens_cpu: torch.Tensor | None = None
 
+    _num_computed_tokens_cache: torch.Tensor | None = None
+
     @property
     @deprecated(
         """
@@ -130,6 +130,13 @@ def num_computed_tokens_cpu(self) -> torch.Tensor:
             self._num_computed_tokens_cpu = self.seq_lens_cpu - query_seq_lens
         return self._num_computed_tokens_cpu
 
+    def compute_num_computed_tokens(self) -> torch.Tensor:
+        """Compute num_computed_tokens on device (seq_lens - query_lens)."""
+        if self._num_computed_tokens_cache is None:
+            query_lens = self.query_start_loc[1:] - self.query_start_loc[:-1]
+            self._num_computed_tokens_cache = self.seq_lens - query_lens
+        return self._num_computed_tokens_cache
+
     # TODO(lucas): remove once we have FULL-CG spec-decode support
     def unpadded(
         self, num_actual_tokens: int, num_actual_reqs: int
@@ -161,6 +168,48 @@ def unpadded(
     )
+
+
+def make_lazy_sync_tensor_property(
+    obj: object,
+    prop_name: str,
+    tensor: torch.Tensor,
+    event: torch.cuda.Event,
+) -> None:
+    """
+    Make a tensor property lazily sync on first access.
+
+    After a non-blocking D2H copy, call this to wrap the tensor so that
+    accessing obj.prop_name will synchronize on the CUDA event exactly
+    once (on first access), then return the tensor directly thereafter.
+
+    This avoids blocking the CPU until the tensor is actually needed.
+ + Usage: + cpu_buf.copy_(gpu_tensor, non_blocking=True) + event = torch.cuda.Event() + event.record() + make_lazy_sync_tensor_property(metadata, "query_start_loc_cpu", cpu_buf, event) + """ + # Stash tensor and event under private names + val_slot = f"_lazy_{prop_name}_val" + ev_slot = f"_lazy_{prop_name}_ev" + setattr(obj, val_slot, tensor) + setattr(obj, ev_slot, event) + + def getter(self): + ev = getattr(self, ev_slot, None) + if ev is not None: + ev.synchronize() + setattr(self, ev_slot, None) + return getattr(self, val_slot) + + # Create one-off subclass with property override + obj.__class__ = type( + f"{obj.__class__.__name__}_LazySync", + (obj.__class__,), + {prop_name: property(getter)}, + ) + + def slice_query_start_locs( query_start_loc: torch.Tensor, request_slice: slice, @@ -566,201 +615,6 @@ def infer_global_hyperparameters( return global_params -# -# Take in `query_start_loc_np` and `seq_lens_np` and break the sequences into -# local attention blocks, where each block is passed to the attention kernel -# as an independent local ("virtual") batch item. -# -# For example, if are performing a chunked prefill a batch of 3 sequences: -# q_seqlens = [4, 10, 5] -# kv_seqlens = [6, 17, 9] -# Then normally for regular attention we would compute with an attention mask -# for batch idx 0 (q_seqlens = 4, kv_seqlens = 6) like: -# batch idx: 0 (q_seqlens = 4, kv_seqlens = 6) -# k_toks > 0 1 2 3 4 5 -# q_toks v _____________ -# 0 | 1 1 1 -# 1 | 1 1 1 1 -# 2 | 1 1 1 1 1 -# 3 | 1 1 1 1 1 1 -# -# for local attention (with attn_chunk_size = 4) we would compute with an -# attention mask like: -# batch idx: 0 (q_seqlens = 4, kv_seqlens = 6, attn_chunk_size = 4) -# k_toks > 0 1 2 3 4 5 -# q_toks v _____________ -# 0 | 1 1 1 -# 1 | 1 1 1 1 -# 2 | 1 -# 3 | 1 1 -# -# We can simulate this mask using standard flash-attention by breaking the -# sequences into local ("virtual") batches, where each local batch item is a -# local attention block, so in this case batch idx 0 would be broken up into: -# -# local-batch idx: 0 (q_seqlens = 2, kv_seqlens = 4) (batch 0) -# k_toks > 0 1 2 3 -# q_toks v _____________ -# 0 | 1 1 1 -# 1 | 1 1 1 1 -# local-batch idx: 1 (q_seqlens = 2, kv_seqlens = 2) (batch 0) -# k_toks > 4 5 -# q_toks v _____________ -# 2 | 1 -# 3 | 1 1 -# -# e.g. 
if we have: -# attn_chunk_size = 4 -# query_start_loc_np = [0, 4, 14, 19] (q_seqlens = [4, 10, 5]) -# Then this function would return: -# __b0__ ______b1______ __b2__ < orig batch indices -# q_seqlens_local = [ 2, 2, 1, 4, 4, 1, 4, 1] -# cu_seqlens_q_local = [0, 4, 6, 10, 14, 18, 19, 23, 24] -# seqlens_k_local = [ 4, 2, 4, 4, 4, 1, 4, 1] -# block_table_local : shape[local_virtual_batches, pages_per_local_batch] -def make_local_attention_virtual_batches( - attn_chunk_size: int, - common_attn_metadata: CommonAttentionMetadata, - block_size: int = 0, -) -> tuple[CommonAttentionMetadata, Callable[[torch.Tensor], torch.Tensor]]: - query_start_loc_np = common_attn_metadata.query_start_loc_cpu.numpy() - seq_lens_np = common_attn_metadata.seq_lens_cpu.numpy() - block_table = common_attn_metadata.block_table_tensor - device = common_attn_metadata.query_start_loc.device - - q_seqlens = query_start_loc_np[1:] - query_start_loc_np[:-1] - actual_batch_size = seq_lens_np.shape[0] - - # Handle if we are starting in the middle of a local attention block, - # we assume q_seqlens > 0 (for all elements), for each batch idx we compute - # the number of tokens that are not in the first local attention block and - # then we can simply use a cdiv for the rest. - # For example if we have: - # attn_chunk_size = 4 - # q_seqlens = [4, 10, 5] - # k_seqlens = [6, 17, 9] - # Then we would get: - # new_tokens_in_first_block = [2, 1, 4] - # local_blocks = [2, 4, 2] - q_tokens_in_first_block = np.minimum( - attn_chunk_size - ((seq_lens_np - q_seqlens) % attn_chunk_size), q_seqlens - ).astype(np.int32) - tokens_in_last_block = attn_chunk_size + (seq_lens_np % -attn_chunk_size) - local_blocks = 1 + cdiv(q_seqlens - q_tokens_in_first_block, attn_chunk_size) - - # Once we know the number of local blocks we can compute the request spans - # for each batch idx, we can figure out the number of "virtual" requests we - # have to make, - # For the above example we would get: - # seqlens_q_local = [2, 2, 1, 4, 4, 1, 4, 1] - # - # First Get batched arange. (E.g., [2, 4, 2] -> [0, 1, 0, 1, 2, 3, 0, 1]) - # (TODO: max a utility to share this code with _prepare_inputs) - # arange step 1. [2, 4, 2] -> [2, 6, 8] - cu_num_blocks = np.cumsum(local_blocks) - virtual_batches = cu_num_blocks[-1] - # arange step 2. [2, 6, 8] -> [0, 0, 2, 2, 2, 2, 6, 6] - block_offsets = np.repeat(cu_num_blocks - local_blocks, local_blocks) - # arange step 3. [0, 1, 0, 1, 2, 3, 0, 1] - arange = np.arange(virtual_batches, dtype=np.int32) - block_offsets - # also compute reverse arange (i.e. 
[1, 0, 3, 2, 1, 0, 1, 0]) - rarange = np.repeat(local_blocks, local_blocks) - arange - 1 - # Then we can compute the seqlens_q_local, handling the fact that the - # first and last blocks could be partial - seqlens_q_local = np.repeat(q_seqlens - q_tokens_in_first_block, local_blocks) - # set the first block since this may be a partial block - seqlens_q_local[arange == 0] = q_tokens_in_first_block - # set the remaining blocks - seqlens_q_local[arange > 0] = np.minimum( - seqlens_q_local - attn_chunk_size * (arange - 1), attn_chunk_size - )[arange > 0] - - # convert from q_seqlens to cu_seqlens_q - cu_seqlens_q_local = np.empty(virtual_batches + 1, dtype=np.int32) - np.cumsum(seqlens_q_local, out=cu_seqlens_q_local[1:]) - cu_seqlens_q_local[0] = 0 - - # compute the seqlens_k_local, - # basically a full local attention block for all but the last block in each - # batch - # For our example this will be: - # seqlens_k_local = [4, 2, 4, 4, 4, 1, 4, 1] - seqlens_k_local = np.full(cu_num_blocks[-1], attn_chunk_size, dtype=np.int32) - seqlens_k_local[cu_num_blocks - 1] = tokens_in_last_block - num_computed_tokens_local = seqlens_k_local - seqlens_q_local - - k_seqstarts_absolute = np.repeat(seq_lens_np, local_blocks) - ( - rarange * attn_chunk_size + np.repeat(tokens_in_last_block, local_blocks) - ) - # For the example the local attention blocks start at: - # _b0_ _____b1_____ _b2_ - # k_seqstarts_absolute = [0, 4, 4, 8, 12, 16, 4, 8] - block_starts = k_seqstarts_absolute // block_size - assert attn_chunk_size % block_size == 0, ( - f"attn_chunk_size {attn_chunk_size} is not divisible by block_size {block_size}" - ) - pages_per_local_batch = attn_chunk_size // block_size - - # Create a block_table for the local attention blocks - # For out example if we have a block-table like (assuming block_size=2): - # block_table = [ - # [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9], < batch 0 - # [10, 11, 12, 13, 14, 15, 16, 17, 18, 19], < batch 1 - # [20, 21, 22, 23, 24, 25, 26, 27, 28, 29], < batch 2 - # ] - # Then for the local batches we would want a block-table like - # block_table_local = [ - # [ 0, 1 ], < local-batch 0, (batch 0, starting from k[0]) - # [ 2, 3 ], < local-batch 1, (batch 0, starting from k[4]) - # [ 12, 13 ], < local-batch 2, (batch 1, starting from k[4]) - # [ 14, 15 ], < local-batch 3, (batch 1, starting from k[8]) - # [ 16, 17 ], < local-batch 4, (batch 1, starting from k[12]) - # [ 18, 19 ], < local-batch 5, (batch 1, starting from k[16]) - # [ 22, 23 ], < local-batch 6, (batch 2, starting from k[4]) - # [ 24, 25 ], < local-batch 7, (batch 2, starting from k[8]) - # ] - block_indices = block_starts[:, None] + np.arange( - pages_per_local_batch, dtype=np.int32 - ) - block_indices = block_indices.reshape(-1).clip(max=block_table.shape[1] - 1) - batch_indices = np.repeat( - np.arange(actual_batch_size, dtype=np.int32), - local_blocks * pages_per_local_batch, - ) - - # NOTE: https://github.com/pytorch/pytorch/pull/160256 causes performance - # regression when using numpy arrays (batch and block indices) to index into - # torch tensor (block_table). As a workaround, convert numpy arrays to torch - # tensor first, which recovers perf. 
- batch_indices_torch = torch.from_numpy(batch_indices) - block_indices_torch = torch.from_numpy(block_indices) - - # Save as a lambda so we can return this for update_block_table - make_block_table = lambda block_table: block_table[ - batch_indices_torch, block_indices_torch - ].view(virtual_batches, -1) - block_table_local = make_block_table(block_table) - - query_start_loc_cpu = torch.from_numpy(cu_seqlens_q_local) - seq_lens_cpu = torch.from_numpy(seqlens_k_local) - max_seq_len = int(seq_lens_cpu.max()) - - return CommonAttentionMetadata( - query_start_loc_cpu=query_start_loc_cpu, - query_start_loc=query_start_loc_cpu.to(device=device, non_blocking=True), - seq_lens=seq_lens_cpu.to(device=device, non_blocking=True), - num_reqs=len(seq_lens_cpu), - num_actual_tokens=common_attn_metadata.num_actual_tokens, - max_query_len=seqlens_q_local.max(), - max_seq_len=max_seq_len, - block_table_tensor=block_table_local, - slot_mapping=common_attn_metadata.slot_mapping, - causal=True, - _seq_lens_cpu=seq_lens_cpu, - _num_computed_tokens_cpu=torch.from_numpy(num_computed_tokens_local), - ), make_block_table - - def make_kv_sharing_fast_prefill_common_attn_metadata( common_attn_metadata: CommonAttentionMetadata, ) -> CommonAttentionMetadata: diff --git a/vllm/v1/core/sched/async_scheduler.py b/vllm/v1/core/sched/async_scheduler.py index df61eebb395e..a2e1b71e142b 100644 --- a/vllm/v1/core/sched/async_scheduler.py +++ b/vllm/v1/core/sched/async_scheduler.py @@ -10,10 +10,7 @@ class AsyncScheduler(Scheduler): - def _update_after_schedule( - self, - scheduler_output: SchedulerOutput, - ) -> None: + def _update_after_schedule(self, scheduler_output: SchedulerOutput) -> None: super()._update_after_schedule(scheduler_output) pending_structured_output_tokens = False spec_decode_tokens = scheduler_output.scheduled_spec_decode_tokens @@ -41,9 +38,7 @@ def _update_after_schedule( ) def _update_request_with_output( - self, - request: Request, - new_token_ids: list[int], + self, request: Request, new_token_ids: list[int] ) -> tuple[list[int], bool]: if request.discard_latest_async_tokens: # If the request is force preempted in reset_prefix_cache, we diff --git a/vllm/v1/core/sched/interface.py b/vllm/v1/core/sched/interface.py index 596ab05ad320..9255e6092d30 100644 --- a/vllm/v1/core/sched/interface.py +++ b/vllm/v1/core/sched/interface.py @@ -85,10 +85,7 @@ def update_from_output( raise NotImplementedError @abstractmethod - def update_draft_token_ids( - self, - draft_token_ids: "DraftTokenIds", - ) -> None: + def update_draft_token_ids(self, draft_token_ids: "DraftTokenIds") -> None: """Update the draft token ids for the scheduled requests.""" raise NotImplementedError diff --git a/vllm/v1/core/sched/output.py b/vllm/v1/core/sched/output.py index b69fa87ebddc..c9e6df996a6b 100644 --- a/vllm/v1/core/sched/output.py +++ b/vllm/v1/core/sched/output.py @@ -2,11 +2,8 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from dataclasses import dataclass -from functools import cached_property from typing import TYPE_CHECKING -from typing_extensions import deprecated - from vllm._bc_linter import bc_linter_include if TYPE_CHECKING: @@ -129,19 +126,6 @@ class CachedRequestData: def num_reqs(self) -> int: return len(self.req_ids) - @cached_property - @deprecated("This will be removed in v0.14, use `resumed_req_ids` instead.") - def resumed_from_preemption(self) -> list[bool]: - return [req_id in self.resumed_req_ids for req_id in self.req_ids] - - @cached_property - @deprecated("This will be removed 
in v0.14, use `all_token_ids` instead.") - def resumed_req_token_ids(self) -> list[list[int] | None]: - return [ - self.all_token_ids[req_id] if req_id in self.resumed_req_ids else None - for req_id in self.req_ids - ] - @classmethod def make_empty(cls) -> "CachedRequestData": return cls( diff --git a/vllm/v1/core/sched/scheduler.py b/vllm/v1/core/sched/scheduler.py index 3e6176289746..11d581642fb5 100644 --- a/vllm/v1/core/sched/scheduler.py +++ b/vllm/v1/core/sched/scheduler.py @@ -127,7 +127,7 @@ def __init__( self.kv_event_publisher = EventPublisherFactory.create( self.kv_events_config, - self.parallel_config.data_parallel_rank, + self.parallel_config.data_parallel_index, ) self.ec_connector = None if self.vllm_config.ec_transfer_config is not None: @@ -762,11 +762,7 @@ def schedule(self) -> SchedulerOutput: self._update_after_schedule(scheduler_output) return scheduler_output - def _preempt_request( - self, - request: Request, - timestamp: float, - ) -> None: + def _preempt_request(self, request: Request, timestamp: float) -> None: """Preempt a request and put it back to the waiting queue. NOTE: The request should be popped from the running queue outside of this @@ -786,10 +782,7 @@ def _preempt_request( # Put the request back to the waiting queue. self.waiting.prepend_request(request) - def _update_after_schedule( - self, - scheduler_output: SchedulerOutput, - ) -> None: + def _update_after_schedule(self, scheduler_output: SchedulerOutput) -> None: # Advance the number of computed tokens for the request AFTER # the request is scheduled. # 1. The scheduler_output of the current step has to include the @@ -1006,8 +999,7 @@ def _try_schedule_encoder_inputs( ) curr_embeds_start, curr_embeds_end = ( mm_feature.mm_position.get_embeds_indices_in_range( - start_idx_rel, - end_idx_rel, + start_idx_rel, end_idx_rel ) ) # There's no embeddings in the current range of encoder placeholder tokens @@ -1034,8 +1026,7 @@ def _try_schedule_encoder_inputs( ) def get_grammar_bitmask( - self, - scheduler_output: SchedulerOutput, + self, scheduler_output: SchedulerOutput ) -> GrammarOutput | None: # Collect list of scheduled request ids that use structured output. # The corresponding rows of the bitmask will be in this order. @@ -1285,9 +1276,7 @@ def update_from_output( return engine_core_outputs def _update_request_with_output( - self, - request: Request, - new_token_ids: list[int], + self, request: Request, new_token_ids: list[int] ) -> tuple[list[int], bool]: # Append generated tokens and check for stop. Note that if # a request is still being prefilled, we expect the model runner @@ -1328,10 +1317,7 @@ def _free_encoder_inputs(self, request: Request) -> None: # in the decoder's KV cache. self.encoder_cache_manager.free_encoder_input(request, input_id) - def update_draft_token_ids( - self, - draft_token_ids: DraftTokenIds, - ) -> None: + def update_draft_token_ids(self, draft_token_ids: DraftTokenIds) -> None: for req_id, spec_token_ids in zip( draft_token_ids.req_ids, draft_token_ids.draft_token_ids, @@ -1361,9 +1347,7 @@ def add_request(self, request: Request) -> None: request.record_event(EngineCoreEventType.QUEUED) def finish_requests( - self, - request_ids: str | Iterable[str], - finished_status: RequestStatus, + self, request_ids: str | Iterable[str], finished_status: RequestStatus ) -> None: """Handles the finish signal from outside the scheduler. 
diff --git a/vllm/v1/engine/coordinator.py b/vllm/v1/engine/coordinator.py index 953342cdd5d0..c2a9fe7c046a 100644 --- a/vllm/v1/engine/coordinator.py +++ b/vllm/v1/engine/coordinator.py @@ -55,7 +55,9 @@ class DPCoordinator: request wave / running state changes. """ - def __init__(self, parallel_config: ParallelConfig): + def __init__( + self, parallel_config: ParallelConfig, enable_wave_coordination: bool = True + ): dp_size = parallel_config.data_parallel_size assert dp_size > 1, "Coordinator only used for data parallel" @@ -83,6 +85,7 @@ def __init__(self, parallel_config: ParallelConfig): "front_publish_address": front_publish_address, "back_output_address": back_output_address, "back_publish_address": back_publish_address, + "enable_wave_coordination": enable_wave_coordination, }, daemon=True, ) @@ -110,13 +113,19 @@ def __init__(self): class DPCoordinatorProc: - def __init__(self, engine_count: int, min_stats_update_interval_ms: int = 100): + def __init__( + self, + engine_count: int, + min_stats_update_interval_ms: int = 100, + enable_wave_coordination: bool = True, + ): set_process_title("DPCoordinator") self.ctx = zmq.Context() self.engines = [EngineState() for _ in range(engine_count)] self.stats_update_interval_ms = min_stats_update_interval_ms + self.enable_wave_coordination = enable_wave_coordination @staticmethod def run_coordinator( @@ -125,10 +134,12 @@ def run_coordinator( back_output_address: str, back_publish_address: str, min_stats_update_interval_ms: int = 100, + enable_wave_coordination: bool = True, ): coordinator = DPCoordinatorProc( engine_count=engine_count, min_stats_update_interval_ms=min_stats_update_interval_ms, + enable_wave_coordination=enable_wave_coordination, ) try: coordinator.process_input_socket( @@ -265,22 +276,25 @@ def process_input_socket( ) continue # Skip normal engine notification processing - # We received a message on the front-end XPUB socket, - # from an API server sending a new request while the - # engines are paused, so that we can wake the other - # engines. - engine_to_exclude, wave = decoded - if not engines_running: - if wave < current_wave: - # If the wave number is stale, ensure the message - # is handled by all the engines. - engine_to_exclude = None - - engines_running = True - wave_state_changed = True - self._send_start_wave( - publish_back, current_wave, engine_to_exclude - ) + # Wave coordination: handle new-request messages from front-end. + # Only process these when wave coordination is enabled + if self.enable_wave_coordination: + # We received a message on the front-end XPUB socket, + # from an API server sending a new request while the + # engines are paused, so that we can wake the other + # engines. + engine_to_exclude, wave = decoded + if not engines_running: + if wave < current_wave: + # If the wave number is stale, ensure the message + # is handled by all the engines. + engine_to_exclude = None + + engines_running = True + wave_state_changed = True + self._send_start_wave( + publish_back, current_wave, engine_to_exclude + ) if output_back in events: # We received a message from one of the engines. @@ -325,34 +339,39 @@ def process_input_socket( stats[1] = scheduler_stats.num_running_reqs stats_changed = True - if (wave := outputs.wave_complete) is not None: - # 2. Notification from rank 0 engine that we've - # moved into the global paused state - # (engines_running==False). 
- if current_wave <= wave: - new_wave = wave + 1 + # Wave coordination: handle wave completion and start notifications + # Only process these when wave coordination is enabled + if self.enable_wave_coordination: + if (wave := outputs.wave_complete) is not None: + # 2. Notification from rank 0 engine that we've + # moved into the global paused state + # (engines_running==False). + if current_wave <= wave: + new_wave = wave + 1 + logger.debug( + "Moving DP wave from %d to %d.", + current_wave, + new_wave, + ) + current_wave = new_wave + engines_running = False + wave_state_changed = True + elif (wave := outputs.start_wave) is not None and ( + wave > current_wave + or (wave == current_wave and not engines_running) + ): + # 3. The engine received request for a non-current wave + # so we must ensure that other engines progress to the + # next wave (race condition handling). logger.debug( - "Moving DP wave from %d to %d.", current_wave, new_wave + "Starting wave %d after notification of " + "stale wave request from engine.", + wave, ) - current_wave = new_wave - engines_running = False + current_wave = wave + engines_running = True wave_state_changed = True - elif (wave := outputs.start_wave) is not None and ( - wave > current_wave - or (wave == current_wave and not engines_running) - ): - # 3. The engine received request for a non-current wave - # so we must ensure that other engines progress to the - # next wave (race condition handling). - logger.debug( - "Starting wave %d after notification of " - "stale wave request from engine.", - wave, - ) - current_wave = wave - engines_running = True - wave_state_changed = True - self._send_start_wave(publish_back, wave, eng_index) + self._send_start_wave(publish_back, wave, eng_index) if wave_state_changed: message = (None, current_wave, engines_running) diff --git a/vllm/v1/engine/core.py b/vllm/v1/engine/core.py index 7fbce2da8587..d5c08c851db3 100644 --- a/vllm/v1/engine/core.py +++ b/vllm/v1/engine/core.py @@ -84,6 +84,7 @@ def __init__( executor_class: type[Executor], log_stats: bool, executor_fail_callback: Callable | None = None, + include_finished_set: bool = False, ): # plugins need to be loaded at the engine/scheduler level too from vllm.plugins import load_general_plugins @@ -91,7 +92,7 @@ def __init__( load_general_plugins() self.vllm_config = vllm_config - if vllm_config.parallel_config.data_parallel_rank == 0: + if not vllm_config.parallel_config.data_parallel_rank_local: logger.info( "Initializing a V1 LLM engine (v%s) with config: %s", VLLM_VERSION, @@ -138,7 +139,7 @@ def __init__( vllm_config=vllm_config, kv_cache_config=kv_cache_config, structured_output_manager=self.structured_output_manager, - include_finished_set=vllm_config.parallel_config.data_parallel_size > 1, + include_finished_set=include_finished_set, log_stats=self.log_stats, block_size=scheduler_block_size, ) @@ -178,7 +179,7 @@ def __init__( # to eliminate pipeline bubbles. 
self.batch_queue_size = self.model_executor.max_concurrent_batches self.batch_queue: ( - deque[tuple[Future[ModelRunnerOutput], SchedulerOutput]] | None + deque[tuple[Future[ModelRunnerOutput], SchedulerOutput, Future[Any]]] | None ) = None if self.batch_queue_size > 1: logger.info("Batch queue is enabled with size %d", self.batch_queue_size) @@ -336,16 +337,6 @@ def log_error_detail(self, scheduler_output: SchedulerOutput): ) raise err - def _log_err_callback(self, scheduler_output: SchedulerOutput): - """Log error details of a future that's not expected to return a result.""" - - def callback(f, sched_output=scheduler_output): - with self.log_error_detail(sched_output): - result = f.result() - assert result is None - - return callback - def step(self) -> tuple[dict[int, EngineCoreOutputs], bool]: """Schedule, execute, and make output. @@ -422,8 +413,6 @@ def step_with_batch_queue( # No sampling required (no requests scheduled). future = cast(Future[ModelRunnerOutput], exec_future) else: - exec_future.add_done_callback(self._log_err_callback(scheduler_output)) - if not scheduler_output.pending_structured_output_tokens: # We aren't waiting for any tokens, get any grammar output # and sample immediately. @@ -440,7 +429,7 @@ def step_with_batch_queue( if not deferred_scheduler_output: # Add this step's future to the queue. - batch_queue.appendleft((future, scheduler_output)) + batch_queue.appendleft((future, scheduler_output, exec_future)) if ( model_executed and len(batch_queue) < self.batch_queue_size @@ -457,9 +446,14 @@ def step_with_batch_queue( return None, False # Block until the next result is available. - future, scheduler_output = batch_queue.pop() + future, scheduler_output, exec_model_fut = batch_queue.pop() with self.log_error_detail(scheduler_output): model_output = future.result() + if model_output is None: + # None from sample_tokens() implies that the original execute_model() + # call failed - raise that exception. + exec_model_fut.result() + raise RuntimeError("unexpected error") # Before processing the model output, process any aborts that happened # during the model execution. @@ -478,7 +472,7 @@ def step_with_batch_queue( deferred_scheduler_output ) future = self.model_executor.sample_tokens(grammar_output, non_block=True) - batch_queue.appendleft((future, deferred_scheduler_output)) + batch_queue.appendleft((future, deferred_scheduler_output, exec_future)) return engine_core_outputs, model_executed @@ -605,6 +599,7 @@ def __init__( executor_class: type[Executor], log_stats: bool, client_handshake_address: str | None = None, + *, engine_index: int = 0, ): self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]() @@ -636,17 +631,22 @@ def __init__( self.has_coordinator, self.frontend_stats_publish_address, ) - # Only publish request queue stats to coordinator for "internal" - # and "hybrid" LB modes . - self.publish_dp_lb_stats = ( + internal_dp_balancing = ( self.has_coordinator and not vllm_config.parallel_config.data_parallel_external_lb ) + # Only publish request queue stats to coordinator for "internal" + # and "hybrid" LB modes. + self.publish_dp_lb_stats = internal_dp_balancing self._init_data_parallel(vllm_config) super().__init__( - vllm_config, executor_class, log_stats, executor_fail_callback + vllm_config, + executor_class, + log_stats, + executor_fail_callback, + internal_dp_balancing, ) # Background Threads and Queues for IO. 
These enable us to @@ -854,18 +854,29 @@ def signal_handler(signum, frame): engine_core: EngineCoreProc | None = None try: - parallel_config: ParallelConfig = kwargs["vllm_config"].parallel_config - if parallel_config.data_parallel_size > 1 or dp_rank > 0: + vllm_config: VllmConfig = kwargs["vllm_config"] + parallel_config: ParallelConfig = vllm_config.parallel_config + data_parallel = parallel_config.data_parallel_size > 1 or dp_rank > 0 + if data_parallel: + parallel_config.data_parallel_rank_local = local_dp_rank set_process_title("EngineCore", f"DP{dp_rank}") - decorate_logs() + else: + set_process_title("EngineCore") + decorate_logs() + + parallel_config.data_parallel_index = dp_rank + if data_parallel and vllm_config.model_config.is_moe: # Set data parallel rank for this engine process. parallel_config.data_parallel_rank = dp_rank - parallel_config.data_parallel_rank_local = local_dp_rank engine_core = DPEngineCoreProc(*args, **kwargs) else: - set_process_title("EngineCore") - decorate_logs() - engine_core = EngineCoreProc(*args, **kwargs) + # Non-MoE DP ranks are completely independent, so treat like DP=1. + # Note that parallel_config.data_parallel_index will still reflect + # the original DP rank. + parallel_config.data_parallel_size = 1 + parallel_config.data_parallel_size_local = 1 + parallel_config.data_parallel_rank = 0 + engine_core = EngineCoreProc(*args, engine_index=dp_rank, **kwargs) engine_core.run_busy_loop() @@ -1195,6 +1206,10 @@ def __init__( log_stats: bool, client_handshake_address: str | None = None, ): + assert vllm_config.model_config.is_moe, ( + "DPEngineCoreProc should only be used for MoE models" + ) + # Counts forward-passes of the model so that we can synchronize # finished with DP peers every N steps. self.step_counter = 0 @@ -1210,7 +1225,7 @@ def __init__( executor_class, log_stats, client_handshake_address, - dp_rank, + engine_index=dp_rank, ) def _init_data_parallel(self, vllm_config: VllmConfig): @@ -1391,7 +1406,7 @@ def reinitialize_distributed( ) -class DPEngineCoreActor(DPEngineCoreProc): +class EngineCoreActorMixin: """ Ray actor for running EngineCore in a data parallel context """ @@ -1399,15 +1414,12 @@ class DPEngineCoreActor(DPEngineCoreProc): def __init__( self, vllm_config: VllmConfig, - local_client: bool, addresses: EngineZmqAddresses, - executor_class: type[Executor], - log_stats: bool, dp_rank: int = 0, local_dp_rank: int = 0, ): self.addresses = addresses - vllm_config.parallel_config.data_parallel_rank = dp_rank + vllm_config.parallel_config.data_parallel_index = dp_rank vllm_config.parallel_config.data_parallel_rank_local = local_dp_rank # Set CUDA_VISIBLE_DEVICES as early as possible in actor life cycle @@ -1429,8 +1441,6 @@ def __init__( # of ray. self._set_visible_devices(vllm_config, local_dp_rank) - super().__init__(vllm_config, local_client, "", executor_class, log_stats) - def _set_visible_devices(self, vllm_config: VllmConfig, local_dp_rank: int): from vllm.platforms import current_platform @@ -1491,7 +1501,7 @@ def run(self): Run the engine core busy loop. 
""" try: - self.run_busy_loop() + self.run_busy_loop() # type: ignore[attr-defined] except SystemExit: logger.debug("EngineCore exiting.") raise @@ -1499,4 +1509,58 @@ def run(self): logger.exception("EngineCore encountered a fatal error.") raise finally: - self.shutdown() + self.shutdown() # type: ignore[attr-defined] + + +class DPMoEEngineCoreActor(EngineCoreActorMixin, DPEngineCoreProc): + """Used for MoE model data parallel cases.""" + + def __init__( + self, + vllm_config: VllmConfig, + local_client: bool, + addresses: EngineZmqAddresses, + executor_class: type[Executor], + log_stats: bool, + dp_rank: int = 0, + local_dp_rank: int = 0, + ): + vllm_config.parallel_config.data_parallel_rank = dp_rank + + EngineCoreActorMixin.__init__( + self, vllm_config, addresses, dp_rank, local_dp_rank + ) + DPEngineCoreProc.__init__( + self, vllm_config, local_client, "", executor_class, log_stats + ) + + +class EngineCoreActor(EngineCoreActorMixin, EngineCoreProc): + """Used for non-MoE and/or non-DP cases.""" + + def __init__( + self, + vllm_config: VllmConfig, + local_client: bool, + addresses: EngineZmqAddresses, + executor_class: type[Executor], + log_stats: bool, + dp_rank: int = 0, + local_dp_rank: int = 0, + ): + vllm_config.parallel_config.data_parallel_size = 1 + vllm_config.parallel_config.data_parallel_size_local = 1 + vllm_config.parallel_config.data_parallel_rank = 0 + + EngineCoreActorMixin.__init__( + self, vllm_config, addresses, dp_rank, local_dp_rank + ) + EngineCoreProc.__init__( + self, + vllm_config, + local_client, + "", + executor_class, + log_stats, + engine_index=dp_rank, + ) diff --git a/vllm/v1/engine/core_client.py b/vllm/v1/engine/core_client.py index cacbc805e84f..f74e90abc906 100644 --- a/vllm/v1/engine/core_client.py +++ b/vllm/v1/engine/core_client.py @@ -502,7 +502,7 @@ def __init__( parallel_config = vllm_config.parallel_config dp_size = parallel_config.data_parallel_size - dp_rank = parallel_config.data_parallel_rank + dp_rank = parallel_config.data_parallel_index dp_local_size = parallel_config.data_parallel_size_local offline_mode = parallel_config.data_parallel_rank_local is not None # Client manages local+remote EngineCores in pure internal LB case. diff --git a/vllm/v1/engine/input_processor.py b/vllm/v1/engine/input_processor.py index 7cee1ead7c0b..c14a7cb45d0f 100644 --- a/vllm/v1/engine/input_processor.py +++ b/vllm/v1/engine/input_processor.py @@ -15,7 +15,7 @@ from vllm.multimodal.cache import processor_cache_from_config from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict from vllm.multimodal.parse import MultiModalDataParser -from vllm.multimodal.processing import EncDecMultiModalProcessor +from vllm.multimodal.processing import EncDecMultiModalProcessor, set_request_id from vllm.multimodal.utils import argsort_mm_positions from vllm.pooling_params import PoolingParams from vllm.sampling_params import SamplingParams @@ -60,6 +60,7 @@ def __init__( self.input_preprocessor = InputPreprocessor( self.model_config, tokenizer, + self.vllm_config.observability_config, mm_registry, mm_processor_cache=self.mm_processor_cache, ) @@ -493,11 +494,13 @@ def process_inputs( # 1. Tokenize text prompt, with LoRA request if one exists. # 2. For multimodal models with a merged preprocessor, preprocess # multimodal data and expand prompt token ids accordingly. 
- processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( - prompt, - tokenization_kwargs=tokenization_kwargs, - mm_uuids=mm_uuids, - ) + with set_request_id(request_id): + processed_inputs: ProcessorInputs = self.input_preprocessor.preprocess( + prompt, + tokenization_kwargs=tokenization_kwargs, + mm_uuids=mm_uuids, + ) + from vllm.platforms import current_platform current_platform.validate_request( @@ -641,6 +644,7 @@ def _validate_model_input( mm_registry = self.input_preprocessor.mm_registry mm_processor = mm_registry.create_processor( model_config, + self.vllm_config.observability_config, tokenizer=tokenizer, ) assert isinstance(mm_processor, EncDecMultiModalProcessor) diff --git a/vllm/v1/engine/llm_engine.py b/vllm/v1/engine/llm_engine.py index 7c3f9a0e868b..c02143c7295e 100644 --- a/vllm/v1/engine/llm_engine.py +++ b/vllm/v1/engine/llm_engine.py @@ -65,8 +65,9 @@ def __init__( self.log_stats = log_stats - executor_backend = self.vllm_config.parallel_config.distributed_executor_backend parallel_config = vllm_config.parallel_config + executor_backend = parallel_config.distributed_executor_backend + self.external_launcher_dp = ( parallel_config.data_parallel_size > 1 and executor_backend == "external_launcher" diff --git a/vllm/v1/engine/logprobs.py b/vllm/v1/engine/logprobs.py index 599725b6de91..64ac32312633 100644 --- a/vllm/v1/engine/logprobs.py +++ b/vllm/v1/engine/logprobs.py @@ -2,6 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import itertools +from collections.abc import Iterable from dataclasses import dataclass from vllm.logger import init_logger @@ -88,11 +89,16 @@ def _update_sample_logprobs(self, logprobs_lists: LogprobsLists) -> None: logprobs = logprobs_np.tolist() token_ids = token_ids_np.tolist() # Detokenize (non-incrementally). - decoded_tokens = ( - NONES - if self.tokenizer is None - else (convert_ids_list_to_tokens(self.tokenizer, token_ids)) - ) + decoded_tokens: list[str] | Iterable[None] + if self.tokenizer is None: + decoded_tokens = NONES + else: + decoded_tokens_list = convert_ids_list_to_tokens( + self.tokenizer, token_ids + ) + decoded_tokens = self._verify_tokens( + decoded_tokens_list=decoded_tokens_list, tokens=token_ids + ) # Sampler puts the sampled logprob in first. sampled_token_logprob = logprobs[0] @@ -126,37 +132,45 @@ def _update_prompt_logprobs( token_ids, logprobs, ranks = prompt_logprobs_tensors + # Recover shapes. + num_prompt_tokens, num_logprobs = logprobs.shape + # Detokenize non-incrementally. # Output is flat: [num_tok, num_lps] -> [num_tok * num_lps] - decoded_tokens = ( + all_decoded_tokens: list[str] | None = ( None if self.tokenizer is None - else ( - convert_ids_list_to_tokens(self.tokenizer, token_ids.flatten().tolist()) + else convert_ids_list_to_tokens( + self.tokenizer, token_ids.flatten().tolist() ) ) - # Recover shapes. - num_prompt_tokens, num_logprobs = logprobs.shape - # Pythonize the torch tensors. prompt_token_ranks = ranks.tolist() prompt_logprobs = logprobs.tolist() - token_ids = token_ids.tolist() + token_ids_list = token_ids.tolist() # Make Logprob for each position. for pos in range(num_prompt_tokens): - # Handle flattening. 
+ # Handle flattening and UTF-8 correction per position offset = pos * num_logprobs offset_end = offset + num_logprobs - decoded_tokens_for_pos = ( - NONES if decoded_tokens is None else decoded_tokens[offset:offset_end] - ) + + decoded_tokens_for_pos: list[str] | Iterable[None] + if all_decoded_tokens is None: + decoded_tokens_for_pos = NONES + else: + # Extract decoded tokens for this position + decoded_tokens_slice = all_decoded_tokens[offset:offset_end] + # Apply UTF-8 correction within this position's token boundaries + decoded_tokens_for_pos = self._verify_tokens( + decoded_tokens_list=decoded_tokens_slice, tokens=token_ids_list[pos] + ) # Update with the Logprob container for this pos. append_logprobs_for_next_position( self.prompt_logprobs, - token_ids[pos], + token_ids_list[pos], prompt_logprobs[pos], decoded_tokens_for_pos, prompt_token_ranks[pos], @@ -182,6 +196,48 @@ def pop_prompt_logprobs(self) -> PromptLogprobs | None: self.prompt_logprobs = [] return plp + def _correct_decoded_token(self, idx: int, tokens: list[int]) -> str: + assert self.tokenizer is not None, "self.tokenizer should not be None" + + # try with prev token id in same list + if idx > 0: + possible_decoded_token = self.tokenizer.decode(tokens[idx - 1 : idx + 1]) + if not possible_decoded_token.endswith("�"): + return possible_decoded_token + # try with previous logprob token id + if self.logprobs: + latest_token_id = next(iter(self.logprobs[-1])) + + decode_ids = [latest_token_id] + if idx > 0: + decode_ids.extend(tokens[idx - 1 : idx + 1]) + else: + decode_ids.extend(tokens[idx : idx + 1]) + + possible_decoded_token = self.tokenizer.decode(decode_ids) + if not possible_decoded_token.endswith("�"): + return possible_decoded_token + + # by default return empty string + return "" + + def _verify_tokens( + self, decoded_tokens_list: list[str], tokens: list[int] + ) -> list[str]: + corrected_decoded_token_map = dict() + for idx, text in enumerate(decoded_tokens_list): + if text.endswith("�"): + # utf-8 char at the end means it's a potential unfinished byte sequence + # from byte fallback tokenization. 
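+                # For example, a multi-byte character split across two token
+                # ids decodes to "�" on its own; re-decoding it together with
+                # a neighbouring token id (see _correct_decoded_token) can
+                # recover the full character.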
+ corrected_decoded_token_map[idx] = self._correct_decoded_token( + idx, tokens + ) + + for idx, text in corrected_decoded_token_map.items(): + decoded_tokens_list[idx] = text + + return decoded_tokens_list + def update_from_output(self, output: EngineCoreOutput) -> None: if output.new_logprobs is not None: self._update_sample_logprobs(output.new_logprobs) diff --git a/vllm/v1/engine/utils.py b/vllm/v1/engine/utils.py index 24bf66c42f31..66212ed7cd5e 100644 --- a/vllm/v1/engine/utils.py +++ b/vllm/v1/engine/utils.py @@ -75,7 +75,6 @@ class EngineHandshakeMetadata: addresses: EngineZmqAddresses parallel_config: dict[str, int | str | list[int]] - parallel_config_hash: str | None = None class CoreEngineProcManager: @@ -249,12 +248,19 @@ def __init__( from ray.runtime_env import RuntimeEnv from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - from vllm.v1.engine.core import DPEngineCoreActor + from vllm.v1.engine.core import DPMoEEngineCoreActor, EngineCoreActor + + dp_size = vllm_config.parallel_config.data_parallel_size + actor_class = ( + DPMoEEngineCoreActor + if dp_size > 1 and vllm_config.model_config.is_moe + else EngineCoreActor + ) self.local_engine_actors: list[ray.ActorHandle] = [] self.remote_engine_actors: list[ray.ActorHandle] = [] - env_vars_list = get_env_vars_to_copy(destination="DPEngineCoreActor") + env_vars_list = get_env_vars_to_copy(destination=actor_class.__name__) self.env_vars_dict = { name: os.environ[name] for name in env_vars_list if name in os.environ } @@ -263,7 +269,6 @@ def __init__( self.addresses = addresses self.executor_class = executor_class self.log_stats = log_stats - dp_size = vllm_config.parallel_config.data_parallel_size local_engine_count = vllm_config.parallel_config.data_parallel_size_local world_size = vllm_config.parallel_config.world_size @@ -314,7 +319,7 @@ def __init__( runtime_env = RuntimeEnv(env_vars=actor_env_vars) actor = ( - ray.remote(DPEngineCoreActor) + ray.remote(actor_class) .options( scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=pg, @@ -624,7 +629,13 @@ def scale_up_elastic_ep( from ray.runtime_env import RuntimeEnv from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy - from vllm.v1.engine.core import DPEngineCoreActor + from vllm.v1.engine.core import DPMoEEngineCoreActor, EngineCoreActor + + actor_class = ( + DPMoEEngineCoreActor + if cur_vllm_config.model_config.is_moe + else EngineCoreActor + ) cur_data_parallel_size = len(self.local_engine_actors) + len( self.remote_engine_actors @@ -667,7 +678,7 @@ def scale_up_elastic_ep( ) actor = ( - ray.remote(DPEngineCoreActor) + ray.remote(actor_class) .options( scheduling_strategy=PlacementGroupSchedulingStrategy( placement_group=pg, @@ -804,12 +815,19 @@ def launch_core_engines( ], ) - # Run the DP Coordinator process with rank 0 when in - # online DP mode. - run_coordinator = dp_size > 1 and not offline_mode and dp_rank == 0 + # Run the DP Coordinator process with rank 0 when in online DP mode. + # The coordinator is needed for: + # 1. Internal/hybrid LB: collecting and publishing queue stats for load balancing + # 2. 
MoE models: wave coordination in addition to stats + run_coordinator = ( + vllm_config.needs_dp_coordinator and not offline_mode and dp_rank == 0 + ) if run_coordinator: - coordinator = DPCoordinator(parallel_config) + coordinator = DPCoordinator( + parallel_config, + enable_wave_coordination=vllm_config.model_config.is_moe, + ) addresses.coordinator_input, addresses.coordinator_output = ( coordinator.get_engine_socket_addresses() @@ -905,6 +923,7 @@ def launch_core_engines( addresses, engines_to_handshake, parallel_config, + dp_size > 1 and vllm_config.model_config.is_moe, vllm_config.cache_config, local_engine_manager, coordinator.proc if coordinator else None, @@ -916,6 +935,7 @@ def wait_for_engine_startup( addresses: EngineZmqAddresses, core_engines: list[CoreEngine], parallel_config: ParallelConfig, + coordinated_dp: bool, cache_config: CacheConfig, proc_manager: CoreEngineProcManager | None, coord_process: Process | None, @@ -997,8 +1017,7 @@ def wait_for_engine_startup( ) if status == "HELLO" and engine.state == CoreEngineState.NEW: - # Send init message with DP config info and config hash. - # The config hash ensures all DP workers have compatible configs. + # Send init message with DP config info. init_message = msgspec.msgpack.encode( EngineHandshakeMetadata( addresses=addresses, @@ -1010,10 +1029,9 @@ def wait_for_engine_startup( "_data_parallel_master_port_list", "data_parallel_size", ) - }, - parallel_config_hash=parallel_config.compute_hash() - if parallel_config.data_parallel_size > 1 - else None, + } + if coordinated_dp + else {}, ) ) handshake_socket.send_multipart((eng_identity, init_message), copy=False) @@ -1034,8 +1052,8 @@ def wait_for_engine_startup( if addresses.frontend_stats_publish_address is None: addresses.frontend_stats_publish_address = msg.get("dp_stats_address") - # Validate config hash consistency across DP workers - if parallel_config.data_parallel_size > 1: + # Validate config hash consistency across DP workers for MoE models. + if coordinated_dp: worker_config_hash = msg.get("parallel_config_hash") expected_hash = parallel_config.compute_hash() if worker_config_hash != expected_hash: diff --git a/vllm/v1/spec_decode/eagle.py b/vllm/v1/spec_decode/eagle.py index 66697132b365..4c5e32340b9b 100644 --- a/vllm/v1/spec_decode/eagle.py +++ b/vllm/v1/spec_decode/eagle.py @@ -27,7 +27,6 @@ from vllm.platforms import current_platform from vllm.triton_utils import triton from vllm.utils.platform_utils import is_pin_memory_available -from vllm.v1.attention.backends.flash_attn import FlashAttentionMetadata from vllm.v1.attention.backends.tree_attn import ( TreeAttentionMetadata, TreeAttentionMetadataBuilder, @@ -71,7 +70,6 @@ def __init__( self.device = device self.dtype = vllm_config.model_config.dtype self.max_model_len = vllm_config.model_config.max_model_len - self.block_size = vllm_config.cache_config.block_size self.dp_rank = vllm_config.parallel_config.data_parallel_rank self.num_speculative_tokens = self.speculative_config.num_speculative_tokens self.max_num_tokens = vllm_config.scheduler_config.max_num_batched_tokens @@ -168,7 +166,12 @@ def __init__( # Determine allowed attention backends once during initialization. 
self.allowed_attn_types: tuple | None = None if current_platform.is_rocm(): - rocm_types = [TritonAttentionMetadata, FlashAttentionMetadata] + from vllm.v1.attention.backends.rocm_attn import RocmAttentionMetadata + + rocm_types = [ + TritonAttentionMetadata, + RocmAttentionMetadata, + ] # ROCM_AITER_FA is an optional backend if find_spec( AttentionBackendEnum.ROCM_AITER_FA.get_path(include_classname=False) @@ -205,10 +208,7 @@ def __init__( ) # Precompute draft position offsets in flattened tree. self.tree_draft_pos_offsets = torch.arange( - 1, - len(self.tree_choices) + 1, - device=device, - dtype=torch.int32, + 1, len(self.tree_choices) + 1, device=device, dtype=torch.int32 ).repeat(max_batch_size, 1) def _get_positions(self, num_tokens: int): @@ -288,8 +288,7 @@ def propose( per_layer_attn_metadata[layer_name] = draft_indexer_metadata num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp( - num_tokens_unpadded=num_tokens, - num_tokens_padded=num_tokens, + num_tokens_unpadded=num_tokens, num_tokens_padded=num_tokens ) cudagraph_runtime_mode = CUDAGraphMode.NONE @@ -392,8 +391,7 @@ def propose( draft_token_ids_list = [draft_token_ids] batch_size_dp_padded, batch_size_across_dp = self._pad_batch_across_dp( - num_tokens_unpadded=batch_size, - num_tokens_padded=batch_size, + num_tokens_unpadded=batch_size, num_tokens_padded=batch_size ) if ( @@ -470,22 +468,23 @@ def propose( common_attn_metadata._num_computed_tokens_cpu += 1 # Compute the slot mapping. + block_size = attn_metadata_builder.kv_cache_spec.block_size if self.uses_mrope: # all dimensions of positions are the same - block_numbers = clamped_positions[0] // self.block_size + block_numbers = clamped_positions[0] // block_size else: - block_numbers = clamped_positions // self.block_size + block_numbers = clamped_positions // block_size block_ids = common_attn_metadata.block_table_tensor.gather( dim=1, index=block_numbers.view(-1, 1) ) block_ids = block_ids.view(-1) if self.uses_mrope: common_attn_metadata.slot_mapping = ( - block_ids * self.block_size + clamped_positions[0] % self.block_size + block_ids * block_size + clamped_positions[0] % block_size ) else: common_attn_metadata.slot_mapping = ( - block_ids * self.block_size + clamped_positions % self.block_size + block_ids * block_size + clamped_positions % block_size ) # Mask out the slot mappings that exceed the max model length. # Otherwise, the KV cache will be inadvertently updated with the @@ -610,10 +609,8 @@ def prepare_next_token_ids_padded( assert discard_request_mask.dtype == torch.bool assert backup_tokens_gpu.dtype == torch.int32 - next_token_ids = torch.empty((batch_size,), dtype=torch.int32, device=device) - valid_sampled_tokens_count = torch.empty( - (batch_size,), dtype=torch.int32, device=device - ) + next_token_ids = torch.empty(batch_size, dtype=torch.int32, device=device) + valid_sampled_tokens_count = next_token_ids.new_empty(batch_size) # Kernel grid: one program per request (row) grid = (batch_size,) @@ -782,8 +779,7 @@ def propose_tree( max_query_len=query_len, ) attn_metadata = tree_attn_metadata_builder.build_for_drafting( - common_attn_metadata=common_attn_metadata, - draft_index=level + 1, + common_attn_metadata=common_attn_metadata, draft_index=level + 1 ) # Apply new attention metadata to all layers. @@ -800,12 +796,11 @@ def propose_tree( attn_metadata.seq_lens.masked_fill_(exceeds_max_model_len, 1) # Compute the slot mapping. 
+ block_size = tree_attn_metadata_builder.kv_cache_spec.block_size query_positions = flattened_draft_positions[:, level : level + query_len] - block_numbers = query_positions // self.block_size + block_numbers = query_positions // block_size block_ids = attn_metadata.block_table.gather(dim=1, index=block_numbers) - slot_mapping = ( - block_ids * self.block_size + query_positions % self.block_size - ) + slot_mapping = block_ids * block_size + query_positions % block_size # Mask out the slot mappings that exceed the max model length. # Otherwise, the KV cache will be inadvertently updated with the # padding tokens. @@ -1162,8 +1157,8 @@ def load_model(self, target_model: nn.Module) -> None: def dummy_run( self, num_tokens: int, - use_cudagraphs=True, - is_graph_capturing=False, + use_cudagraphs: bool = True, + is_graph_capturing: bool = False, ) -> None: # Determine if CUDA graphs should be used for this run. cudagraphs_enabled = use_cudagraphs and self.use_cuda_graph @@ -1175,8 +1170,7 @@ def dummy_run( ): if fwd_idx <= 1: num_tokens_dp_padded, num_tokens_across_dp = self._pad_batch_across_dp( - num_tokens_unpadded=num_tokens, - num_tokens_padded=num_tokens, + num_tokens_unpadded=num_tokens, num_tokens_padded=num_tokens ) if ( cudagraphs_enabled @@ -1343,9 +1337,5 @@ def compute_probs_and_sample_next_token( next_token_ids = probs.div(q).argmax(dim=-1).view(-1) if not sampling_metadata.all_random: greedy_token_ids = probs.argmax(dim=-1) - next_token_ids = torch.where( - is_greedy, - greedy_token_ids, - next_token_ids, - ) + next_token_ids = torch.where(is_greedy, greedy_token_ids, next_token_ids) return next_token_ids, probs diff --git a/vllm/v1/structured_output/__init__.py b/vllm/v1/structured_output/__init__.py index 7fab7050cde3..4c1d38110d7e 100644 --- a/vllm/v1/structured_output/__init__.py +++ b/vllm/v1/structured_output/__init__.py @@ -28,8 +28,6 @@ else: torch = LazyLoader("torch", globals(), "torch") - ReasoningParser = object - Request = object logger = init_logger(__name__) @@ -98,7 +96,7 @@ def __init__(self, vllm_config: VllmConfig): self.vllm_config.structured_outputs_config.enable_in_reasoning ) - def grammar_init(self, request: Request) -> None: + def grammar_init(self, request: "Request") -> None: if request.structured_output_request is None: return @@ -156,10 +154,7 @@ def grammar_init(self, request: Request) -> None: grammar = self._create_grammar(request) # type: ignore[assignment] request.structured_output_request.grammar = grammar # type: ignore[assignment] - def _create_grammar( - self, - request: Request, - ) -> StructuredOutputGrammar: + def _create_grammar(self, request: "Request") -> StructuredOutputGrammar: key = request.structured_output_request.structured_output_key # type: ignore[union-attr] # Note that the request was validated in the engine core client, @@ -173,8 +168,7 @@ def _create_grammar( return self.backend.compile_grammar(request_type, grammar_spec) def _fill_bitmasks( - self, - batch: Iterable[tuple[StructuredOutputGrammar, int, bool]], + self, batch: Iterable[tuple[StructuredOutputGrammar, int, bool]] ) -> None: assert self._grammar_bitmask is not None for grammar, index, apply_bitmask in batch: @@ -187,14 +181,13 @@ def _fill_bitmasks( self._grammar_bitmask[index].fill_(self._full_mask) def _async_submit_fill_bitmask( - self, - batch: list[tuple[StructuredOutputGrammar, int, bool]], + self, batch: list[tuple[StructuredOutputGrammar, int, bool]] ) -> Future: return self.executor_for_fillmask.submit(self._fill_bitmasks, batch) def grammar_bitmask( 
self, - requests: dict[str, Request], + requests: dict[str, "Request"], structured_output_request_ids: list[str], scheduled_spec_decode_tokens: dict[str, list[int]], ) -> "npt.NDArray[np.int32] | None": @@ -239,11 +232,10 @@ def grammar_bitmask( if TYPE_CHECKING: assert structured_output_request is not None assert structured_output_request.grammar is not None + grammar = structured_output_request.grammar apply_bitmask = self.should_fill_bitmask(request) - batch.append( - (structured_output_request.grammar, cumulative_index, apply_bitmask) - ) + batch.append((grammar, cumulative_index, apply_bitmask)) if len(batch) == self.fill_bitmask_parallel_batch_size: promises.append(self._async_submit_fill_bitmask(batch)) batch = [] @@ -264,34 +256,23 @@ def grammar_bitmask( if TYPE_CHECKING: assert structured_output_request is not None assert structured_output_request.grammar is not None + grammar = structured_output_request.grammar apply_bitmask = self.should_fill_bitmask(request) state_advancements = 0 req_tokens = scheduled_spec_decode_tokens.get(req_id, ()) - for token in itertools.chain(req_tokens, (None,)): - self._fill_bitmasks( - ( - ( - structured_output_request.grammar, - cumulative_index, - apply_bitmask, - ), - ) - ) - - if ( - apply_bitmask - and token is not None - and not structured_output_request.grammar.is_terminated() - ): - accepted = structured_output_request.grammar.accept_tokens( - req_id, [token] - ) + for token in itertools.chain(req_tokens, (-1,)): + self._fill_bitmasks(((grammar, cumulative_index, apply_bitmask),)) + if token == -1: + # Stop advancing the grammar once we hit a padding token. + apply_bitmask = False + if apply_bitmask and not grammar.is_terminated(): + accepted = grammar.accept_tokens(req_id, [token]) assert accepted, (token, req_id, scheduled_spec_decode_tokens) state_advancements += 1 cumulative_index += 1 if state_advancements > 0: - structured_output_request.grammar.rollback(state_advancements) + grammar.rollback(state_advancements) bitmask_tensor = self._grammar_bitmask if cumulative_index < bitmask_tensor.shape[0]: @@ -302,7 +283,7 @@ def grammar_bitmask( # and deserialization when sending this to the GPU workers. return bitmask_tensor.numpy() - def should_fill_bitmask(self, request: Request) -> bool: + def should_fill_bitmask(self, request: "Request") -> bool: # NOTE (Hanchen) if enable_in_reasoning is True, it means that # the model needs to be constrained in reasoning. So we should always # enable the bitmask filling. @@ -318,7 +299,7 @@ def should_fill_bitmask(self, request: Request) -> bool: return request.structured_output_request.reasoning_ended return True - def should_advance(self, request: Request) -> bool: + def should_advance(self, request: "Request") -> bool: if not request.use_structured_output: return False diff --git a/vllm/v1/structured_output/utils.py b/vllm/v1/structured_output/utils.py index 74df0fa06767..3c98538f8d73 100644 --- a/vllm/v1/structured_output/utils.py +++ b/vllm/v1/structured_output/utils.py @@ -5,6 +5,7 @@ import hashlib import importlib.metadata import os +import tempfile from typing import TYPE_CHECKING import numpy as np @@ -34,9 +35,6 @@ "convert_slow_tokenizer", globals(), "transformers.convert_slow_tokenizer" ) - TokenizerLike = object - SchedulerOutput = object - InputBatch = object logger = init_logger(__name__) @@ -72,13 +70,12 @@ def apply_grammar_bitmask( # request in the batch, as the logit indices are offset by this amount. 
struct_out_req_batch_indices: dict[str, int] = {} cumulative_offset = 0 - seq = sorted(input_batch.req_id_to_index.items(), key=lambda x: x[1]) - for req_id, batch_index in seq: + spec_tokens = scheduler_output.scheduled_spec_decode_tokens + struct_out_req_ids = set(grammar_output.structured_output_request_ids) + for batch_index, req_id in enumerate(input_batch.req_ids): logit_index = batch_index + cumulative_offset - cumulative_offset += len( - scheduler_output.scheduled_spec_decode_tokens.get(req_id, []) - ) - if req_id in grammar_output.structured_output_request_ids: + cumulative_offset += len(spec_tokens.get(req_id, ())) + if req_id in struct_out_req_ids: struct_out_req_batch_indices[req_id] = logit_index out_indices = [] @@ -91,14 +88,12 @@ def apply_grammar_bitmask( ) cumulative_index = 0 for req_id in grammar_output.structured_output_request_ids: - num_spec_tokens = len( - scheduler_output.scheduled_spec_decode_tokens.get(req_id, []) - ) - if req_id in struct_out_req_batch_indices: - logit_index = struct_out_req_batch_indices[req_id] + num_spec_tokens = len(spec_tokens.get(req_id, ())) + if (logit_idx := struct_out_req_batch_indices.get(req_id)) is not None: for i in range(1 + num_spec_tokens): - sorted_bitmask[logit_index + i] = grammar_bitmask[cumulative_index + i] - out_indices.append(logit_index + i) + bitmask_index = logit_idx + i + sorted_bitmask[bitmask_index] = grammar_bitmask[cumulative_index + i] + out_indices.append(bitmask_index) cumulative_index += 1 + num_spec_tokens # Copy async to device as tensor. @@ -149,21 +144,19 @@ def get_outlines_cache_path() -> str: if outlines_cache_dir: # OUTLINES_CACHE_DIR takes precedence return outlines_cache_dir - elif xdg_cache_home: + if xdg_cache_home: return os.path.join(xdg_cache_home, ".cache", "outlines") # If homedir is "/", we may be inside a container, and thus writing to # root would be problematic, so we fall back to using a tempfile. # Also validate the path exists, since os.path.expanduser does # not guarantee existence. - elif os.path.isdir(home_dir) and home_dir != "/": + if os.path.isdir(home_dir) and home_dir != "/": # Default Unix fallback: ~/.cache/outlines return os.path.join(home_dir, ".cache", "outlines") - else: - import tempfile - # home_dir may be / inside a docker container without existing user - tempdir = tempfile.gettempdir() - return os.path.join(tempdir, ".cache", "outlines") + # home_dir may be / inside a docker container without existing user + tempdir = tempfile.gettempdir() + return os.path.join(tempdir, ".cache", "outlines") def get_outlines_cache(): @@ -184,8 +177,8 @@ def get_outlines_cache(): cache.clear() cache.set("__version__", outlines_version) return cache - else: - return LRUCache(maxsize=128) + + return LRUCache(maxsize=128) re_llama_byte_token = re.compile(r"^<0x[0-9A-F]{2}>$") @@ -193,8 +186,7 @@ def get_outlines_cache(): def _reduced_vocabulary( - tokenizer: TokenizerLike, - eos_token_id: int, + tokenizer: TokenizerLike, eos_token_id: int ) -> dict[bytes, list[int]]: """Create a map from vocabulary tokens to lists of equivalent token ids. 
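Aside (not part of the patch): the reworked offset loop in apply_grammar_bitmask above places each request's bitmask rows next to its logit rows, where a request occupies 1 + num_spec_tokens rows. The standalone sketch below reproduces only that index arithmetic so the cumulative_offset bookkeeping is easier to follow; the request ids and token counts are made up for illustration and none of these names are vLLM APIs.

def first_logit_index_per_request(
    req_ids: list[str],                 # requests in persistent-batch order
    spec_tokens: dict[str, list[int]],  # scheduled speculative tokens per request
    structured_req_ids: set[str],       # requests using structured output
) -> dict[str, int]:
    # Each request contributes 1 + len(spec_tokens) logit rows, so a request's
    # first row is its batch index plus the speculative tokens of all earlier
    # requests -- the same cumulative_offset bookkeeping as the hunk above.
    indices: dict[str, int] = {}
    cumulative_offset = 0
    for batch_index, req_id in enumerate(req_ids):
        logit_index = batch_index + cumulative_offset
        cumulative_offset += len(spec_tokens.get(req_id, ()))
        if req_id in structured_req_ids:
            indices[req_id] = logit_index
    return indices

# "b" is not structured but schedules two spec tokens, so "c" starts at row 4.
assert first_logit_index_per_request(
    ["a", "b", "c"], {"b": [7, 8]}, {"a", "c"}
) == {"a": 0, "c": 4}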
@@ -267,17 +259,13 @@ def get_outlines_vocabulary(tokenizer: TokenizerLike) -> oc.Vocabulary: return tokenizer._outlines_vocabulary # type: ignore try: - if ( - hasattr( - tokenizer, - "eos_token_id", - ) - and tokenizer.eos_token_id is not None - ): + if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id is not None: eos_token_id = tokenizer.eos_token_id else: raise ValueError( - f"Error during structured outputs setup for outlines: Tokenizer ({type(tokenizer)}) has no `eos_token_id` property, but `eos_token_id` is required for structured outputs to work properly." # noqa: E501 + "Error during structured outputs setup for outlines: Tokenizer " + f"({type(tokenizer)}) has no `eos_token_id` property, but " + "`eos_token_id` is required for structured outputs to work properly." ) reduced_vocab = _reduced_vocabulary( @@ -290,7 +278,7 @@ def get_outlines_vocabulary(tokenizer: TokenizerLike) -> oc.Vocabulary: return vocabulary except AttributeError as e: raise ValueError( - f"Cannot get the vocabulary of the tokenizer " + "Cannot get the vocabulary of the tokenizer " f"({type(tokenizer)}). The tokenizer should have a " "get_vocab method." ) from e diff --git a/vllm/v1/worker/cpu_worker.py b/vllm/v1/worker/cpu_worker.py index e54b995ab908..654f58834a15 100644 --- a/vllm/v1/worker/cpu_worker.py +++ b/vllm/v1/worker/cpu_worker.py @@ -10,10 +10,10 @@ from vllm import envs from vllm.config import VllmConfig from vllm.logger import init_logger -from vllm.model_executor.utils import set_random_seed from vllm.platforms import CpuArchEnum, current_platform from vllm.platforms.cpu import CpuPlatform, LogicalCPUInfo from vllm.profiler.wrapper import TorchProfilerWrapper +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.cpu_model_runner import CPUModelRunner from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index 06b4aed56c91..204635635544 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -98,9 +98,6 @@ def __init__( self.max_num_reqs = self.scheduler_config.max_num_seqs self.inputs_embeds_size = self.model_config.get_inputs_embeds_size() - self.dp_size = self.parallel_config.data_parallel_size - self.dp_rank = self.parallel_config.data_parallel_rank - self.use_async_scheduling = self.scheduler_config.async_scheduling self.output_copy_stream = torch.cuda.Stream(self.device) self.output_copy_event = torch.cuda.Event() @@ -268,7 +265,8 @@ def _dummy_run( if not skip_attn: self.prepare_dummy_attn_metadata(input_batch) - num_tokens_across_dp = make_num_tokens_across_dp(self.dp_size, num_tokens) + dp_size = self.parallel_config.data_parallel_size + num_tokens_across_dp = make_num_tokens_across_dp(dp_size, num_tokens) num_sampled_tokens = np.ones(input_batch.num_reqs, dtype=np.int32) with ( self.maybe_dummy_run_with_lora( @@ -312,7 +310,7 @@ def profile_run(self) -> None: self._dummy_sampler_run(sample_hidden_states) if self.do_spec_decode: num_tokens_across_dp = make_num_tokens_across_dp( - self.dp_size, self.max_num_tokens + self.parallel_config.data_parallel_size, self.max_num_tokens ) self.speculator.run_model( self.max_num_tokens, @@ -807,7 +805,8 @@ def get_cudagraph_and_dp_padding( scheduler_output: SchedulerOutput, ) -> tuple[CUDAGraphMode, int, torch.Tensor | None]: total_num_scheduled_tokens = scheduler_output.total_num_scheduled_tokens - if self.dp_size == 1: + dp_size = self.parallel_config.data_parallel_size + 
if dp_size == 1: # No DP. Only consider CUDA graphs. if total_num_scheduled_tokens == 0: # Special case: no tokens to run. @@ -835,11 +834,12 @@ def get_cudagraph_and_dp_padding( cudagraph_size_before_dp = -1 assert cudagraph_size_before_dp is not None + dp_rank = self.parallel_config.data_parallel_rank num_tokens_across_dp, cudagraph_size_across_dp = get_batch_metadata_across_dp( total_num_scheduled_tokens, cudagraph_size_before_dp, - self.dp_size, - self.dp_rank, + dp_size, + dp_rank, ) if all(cudagraph_size_across_dp >= 0): # If all ranks can use CUDA graph, pad to the maximum number of tokens @@ -850,7 +850,7 @@ def get_cudagraph_and_dp_padding( # If any of the ranks cannot use CUDA graph, use eager mode for all ranks. # No padding is needed except for ranks that have no tokens to run. num_tokens_across_dp = torch.clamp(num_tokens_across_dp, min=1) - num_tokens_after_padding = num_tokens_across_dp[self.dp_rank] + num_tokens_after_padding = num_tokens_across_dp[dp_rank] cudagraph_mode = CUDAGraphMode.NONE return cudagraph_mode, num_tokens_after_padding, num_tokens_across_dp diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index de8e168e45de..74006dfb7655 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -263,7 +263,6 @@ def __init__( async_output_copy_stream: torch.cuda.Stream, ): self._model_runner_output = model_runner_output - self._finished_mask = finished_mask # Event on the copy stream so we can synchronize the non-blocking copy. self.async_copy_ready_event = torch.Event() @@ -276,11 +275,15 @@ def __init__( default_stream = torch.cuda.current_stream() with torch.cuda.stream(async_output_copy_stream): async_output_copy_stream.wait_stream(default_stream) - self._raw_pooler_output_cpu = json_map_leaves( + raw_pooler_output_cpu = json_map_leaves( lambda x: None if x is None else x.to("cpu", non_blocking=True), self._raw_pooler_output, ) self.async_copy_ready_event.record() + self._model_runner_output.pooler_output = [ + out if include else None + for out, include in zip(raw_pooler_output_cpu, finished_mask) + ] def get_output(self) -> ModelRunnerOutput: """Copy the device tensors to the host and return a ModelRunnerOutput. @@ -290,11 +293,6 @@ def get_output(self) -> ModelRunnerOutput: # Release the device tensors once the copy has completed. 
del self._raw_pooler_output - - self._model_runner_output.pooler_output = [ - out if include else None - for out, include in zip(self._raw_pooler_output_cpu, self._finished_mask) - ] return self._model_runner_output @@ -2537,8 +2535,7 @@ def _pool( model = cast(VllmModelForPooling, self.model) raw_pooler_output: PoolerOutput = model.pooler( - hidden_states=hidden_states, - pooling_metadata=pooling_metadata, + hidden_states=hidden_states, pooling_metadata=pooling_metadata ) finished_mask = [ @@ -2568,12 +2565,12 @@ def _pool( lambda x: None if x is None else x.to("cpu", non_blocking=True), raw_pooler_output, ) - self._sync_device() - model_runner_output.pooler_output = [ out if include else None for out, include in zip(raw_pooler_output, finished_mask) ] + self._sync_device() + return model_runner_output def _pad_for_sequence_parallelism(self, num_scheduled_tokens: int) -> int: @@ -3567,14 +3564,13 @@ def _copy_valid_sampled_token_count( def _get_valid_sampled_token_count(self) -> list[int]: # Wait until valid_sampled_tokens_count is copied to cpu, prev_sampled_token_ids = self.input_batch.prev_sampled_token_ids - if ( - self.valid_sampled_token_count_event is None - or prev_sampled_token_ids is None - ): + sampled_count_event = self.valid_sampled_token_count_event + if sampled_count_event is None or prev_sampled_token_ids is None: return [] counts_cpu = self.valid_sampled_token_count_cpu - self.valid_sampled_token_count_event.synchronize() + assert counts_cpu is not None + sampled_count_event.synchronize() return counts_cpu[: prev_sampled_token_ids.shape[0]].tolist() def propose_draft_token_ids( diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 2416c8ebafa5..fd4ee596c30e 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -34,7 +34,6 @@ ) from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed from vllm.model_executor.models.interfaces import is_mixture_of_experts from vllm.model_executor.warmup.kernel_warmup import kernel_warmup from vllm.platforms import current_platform @@ -43,6 +42,7 @@ from vllm.tasks import SupportedTask from vllm.utils.mem_constants import GiB_bytes from vllm.utils.mem_utils import MemorySnapshot, memory_profiling +from vllm.utils.torch_utils import set_random_seed from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.engine import ReconfigureDistributedRequest, ReconfigureRankType from vllm.v1.kv_cache_interface import KVCacheConfig, KVCacheSpec @@ -84,7 +84,7 @@ def __init__( # configure float32 matmul precision according to vLLM env. precision = envs.VLLM_FLOAT32_MATMUL_PRECISION - torch.backends.cuda.matmul.fp32_precision = precision + torch.set_float32_matmul_precision(precision) if self.model_config.trust_remote_code: # note: lazy import to avoid importing torch before initializing @@ -179,22 +179,20 @@ def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None: self.cache_config.num_cpu_blocks = num_cpu_blocks def init_device(self): - device = self.device_config.device - if isinstance(device, torch.device) and device.type == "cuda": + if self.device_config.device_type == "cuda": # This env var set by Ray causes exceptions with graph building. 
os.environ.pop("NCCL_ASYNC_ERROR_HANDLING", None) + parallel_config = self.parallel_config if ( - self.parallel_config.data_parallel_size > 1 - and self.parallel_config.data_parallel_size_local > 0 - and self.parallel_config.distributed_executor_backend - not in ["ray", "external_launcher"] - and self.vllm_config.parallel_config.data_parallel_backend != "ray" - and self.vllm_config.parallel_config.nnodes_within_dp == 1 + parallel_config.distributed_executor_backend + not in ("ray", "external_launcher") + and parallel_config.data_parallel_backend != "ray" + and parallel_config.nnodes_within_dp == 1 ): # Use local DP rank if available, otherwise use global DP rank. dp_local_rank = self.parallel_config.data_parallel_rank_local if dp_local_rank is None: - dp_local_rank = self.parallel_config.data_parallel_rank + dp_local_rank = self.parallel_config.data_parallel_index tp_pp_world_size = ( self.parallel_config.pipeline_parallel_size diff --git a/vllm/v1/worker/tpu_worker.py b/vllm/v1/worker/tpu_worker.py index ab22d0af63a5..3ece4c58214a 100644 --- a/vllm/v1/worker/tpu_worker.py +++ b/vllm/v1/worker/tpu_worker.py @@ -20,12 +20,11 @@ ) from vllm.logger import init_logger from vllm.lora.request import LoRARequest -from vllm.model_executor import set_random_seed from vllm.platforms import current_platform from vllm.platforms.tpu import USE_TPU_INFERENCE from vllm.tasks import SupportedTask from vllm.utils.math_utils import cdiv -from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE +from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE, set_random_seed from vllm.v1.core.sched.output import GrammarOutput, SchedulerOutput from vllm.v1.kv_cache_interface import AttentionSpec, KVCacheConfig, KVCacheSpec from vllm.v1.outputs import ModelRunnerOutput diff --git a/vllm/v1/worker/xpu_worker.py b/vllm/v1/worker/xpu_worker.py index 1faa1a24ff0e..fe0850771dd0 100644 --- a/vllm/v1/worker/xpu_worker.py +++ b/vllm/v1/worker/xpu_worker.py @@ -9,9 +9,9 @@ from vllm.config import VllmConfig from vllm.distributed import get_world_group from vllm.logger import init_logger -from vllm.model_executor import set_random_seed from vllm.platforms import current_platform from vllm.profiler.wrapper import TorchProfilerWrapper +from vllm.utils.torch_utils import set_random_seed from vllm.v1.worker.gpu_worker import Worker, init_worker_distributed_environment from vllm.v1.worker.xpu_model_runner import XPUModelRunner