docker/Dockerfile (2 changes: 1 addition & 1 deletion)
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.12"
+ARG FLASHINFER_GIT_REF="v0.2.13"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
setup.py (2 changes: 1 addition & 1 deletion)
@@ -694,7 +694,7 @@ def _read_requirements(filename: str) -> list[str]:
"mistral_common[audio]"], # Required for audio processing
"video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
-"flashinfer": ["flashinfer-python==0.2.12"],
+"flashinfer": ["flashinfer-python==0.2.13"],
},
cmdclass=cmdclass,
package_data=package_data,
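
The Dockerfile ref and the setup.py extra are meant to move together (both hunks say so in their comments). As a quick local sanity check that the installed wheel matches the new pin, a minimal sketch in Python (assumes flashinfer-python is already installed in the environment; not part of the diff):

import importlib.metadata

# The distribution name comes from the setup.py pin above.
installed = importlib.metadata.version("flashinfer-python")
assert installed == "0.2.13", f"expected 0.2.13, found {installed}"
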
vllm/model_executor/layers/quantization/mxfp4.py (7 changes: 6 additions & 1 deletion)
@@ -6,6 +6,7 @@
from torch.nn.parameter import Parameter

from vllm import envs
+from vllm.config import get_current_vllm_config
from vllm.logger import init_logger
from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
FusedMoEMethodBase)
@@ -113,6 +114,8 @@ def __init__(self, moe: FusedMoEConfig):
self.topk_indices_dtype = None
self.moe = moe
self.use_marlin = self._should_use_marlin()
+self.max_capture_size = get_current_vllm_config(
+).compilation_config.max_capture_size

if current_platform.is_device_capability(100) and not has_flashinfer():
logger.warning_once(
@@ -520,7 +523,8 @@ def apply(
x_scale = None
else:
x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8
-x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+    *x.shape[:-1], -1)
trtllm_gen_output = trtllm_fp4_block_scale_moe(
router_logits.to(torch.bfloat16),
None, # routing_bias
@@ -549,6 +553,7 @@ def apply(
self._get_tile_tokens_dim(x, top_k),
1 if renormalize else 0, # routing_method_type, renormalize
True, # do finalize
+tune_max_num_tokens=self.max_capture_size,
)[0]
return trtllm_gen_output
else:
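
A minimal shape sketch of the x_scale change, using synthetic tensors (it assumes one raw scale byte per 32-element block along the hidden dimension and a PyTorch build with float8 dtypes; FlashInfer's mxfp8_quantize itself is not reproduced here): the old reshape(-1) collapsed the scales into one flat vector, while the new reshape keeps the leading token dimensions of x so each row of activations keeps its own row of block scales.

import torch

# Synthetic stand-ins: x is (num_tokens, hidden); raw_scales holds one byte
# per 32-element block of the hidden dimension (assumption for this sketch).
x = torch.randn(6, 128)
raw_scales = torch.randint(0, 255, (6, 128 // 32), dtype=torch.uint8)

flat    = raw_scales.view(torch.float8_e4m3fn).reshape(-1)                 # old: torch.Size([24])
per_row = raw_scales.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1)  # new: torch.Size([6, 4])

print(flat.shape, per_row.shape)
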
vllm/v1/worker/gpu_worker.py (7 changes: 4 additions & 3 deletions)
@@ -312,6 +312,10 @@ def compile_or_warm_up_model(self) -> None:
logger.info("Compile and warming up model for size %d", size)
self.model_runner._dummy_run(size, skip_eplb=True)

+# Warmup and tune the kernels used during model execution before
+# cuda graph capture.
+kernel_warmup(self)

if not self.model_config.enforce_eager:
self.model_runner.capture_model()

@@ -336,9 +340,6 @@ def compile_or_warm_up_model(self) -> None:
self.model_runner._dummy_sampler_run(
hidden_states=last_hidden_states)

-# Warmup kernels used during model execution
-kernel_warmup(self)

# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
set_random_seed(self.model_config.seed)
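
The gpu_worker.py change is purely a move: kernel warm-up now runs after the dummy compile runs but before CUDA graph capture rather than after it, presumably so tuned settings such as tune_max_num_tokens above are already in place when the graphs are recorded. A stubbed ordering sketch (not vLLM's actual classes or signatures):

def _dummy_run(size: int) -> None: ...        # stand-in for model_runner._dummy_run
def kernel_warmup(worker=None) -> None: ...   # stand-in for vLLM's kernel_warmup
def capture_model() -> None: ...              # stand-in for model_runner.capture_model

def compile_or_warm_up_model(enforce_eager: bool = False) -> None:
    for size in (8, 4, 2, 1):   # illustrative warm-up sizes
        _dummy_run(size)
    kernel_warmup()             # moved here: warm up / tune kernels first ...
    if not enforce_eager:
        capture_model()         # ... then capture CUDA graphs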