diff --git a/docker/Dockerfile b/docker/Dockerfile
index 839ac501dbaf..2e272cbca841 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.12"
+ARG FLASHINFER_GIT_REF="v0.2.14.post1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 #   docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
diff --git a/setup.py b/setup.py
index ca6e0a8592cc..ffe8ec4e79af 100644
--- a/setup.py
+++ b/setup.py
@@ -694,7 +694,7 @@ def _read_requirements(filename: str) -> list[str]:
                   "mistral_common[audio]"],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.12"],
+        "flashinfer": ["flashinfer-python==0.2.14.post1"],
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },
diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 6ae50245ed3a..c44ac8e0aa7e 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -465,7 +465,8 @@ def call_trtllm_fused_allreduce_norm(
             quant_out=quant_out,
             scale_out=scale_out,
             # in vllm we only support swizzled layout
-            layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED,
+            layout_code=flashinfer_comm.QuantizationSFLayout.
+            SWIZZLED_128x4,
             scale_factor=scale_factor,
         )
     else:
diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 6a190ebbc063..df96e5d8c413 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -6,6 +6,7 @@
 from torch.nn.parameter import Parameter
 
 from vllm import envs
+from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                   FusedMoEMethodBase)
@@ -113,6 +114,8 @@ def __init__(self, moe: FusedMoEConfig):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
+        self.max_capture_size = get_current_vllm_config(
+        ).compilation_config.max_capture_size
 
         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(
@@ -520,7 +523,8 @@ def apply(
                 x_scale = None
             else:
                 x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                    *x.shape[:-1], -1)
             trtllm_gen_output = trtllm_fp4_block_scale_moe(
                 router_logits.to(torch.bfloat16),
                 None,  # routing_bias
@@ -549,6 +553,7 @@ def apply(
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
+                tune_max_num_tokens=self.max_capture_size,
             )[0]
             return trtllm_gen_output
         else:
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index f83a4f4faeb5..0c9c745dab9d 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -311,6 +311,10 @@ def compile_or_warm_up_model(self) -> None:
                 logger.info("Compile and warming up model for size %d", size)
                 self.model_runner._dummy_run(size, skip_eplb=True)
 
+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)
+
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
 
@@ -335,9 +339,6 @@ def compile_or_warm_up_model(self) -> None:
                 self.model_runner._dummy_sampler_run(
                     hidden_states=last_hidden_states)
 
-        # Warmup kernels used during model execution
-        kernel_warmup(self)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
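
Note on the `x_scale` reshape in `mxfp4.py`: the updated call preserves the leading (token) dimensions of `x` instead of flattening the scale factors into one flat vector. A minimal sketch of the difference, using plain PyTorch and hypothetical shapes (illustrative only, not part of the patch):

```python
# Illustrative only: hypothetical shapes, no FlashInfer dependency.
import torch

x = torch.randn(4, 128)      # pretend activations: 4 tokens, hidden size 128
x_scale = torch.randn(4, 4)  # pretend per-token block scale factors

old_layout = x_scale.reshape(-1)                 # flat vector, shape (16,)
new_layout = x_scale.reshape(*x.shape[:-1], -1)  # one row per token, shape (4, 4)

print(old_layout.shape, new_layout.shape)  # torch.Size([16]) torch.Size([4, 4])
```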