2 changes: 1 addition & 1 deletion docker/Dockerfile
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.2.12"
ARG FLASHINFER_GIT_REF="v0.2.13"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
2 changes: 1 addition & 1 deletion setup.py
@@ -694,7 +694,7 @@ def _read_requirements(filename: str) -> list[str]:
"mistral_common[audio]"], # Required for audio processing
"video": [], # Kept for backwards compatibility
# FlashInfer should be updated together with the Dockerfile
"flashinfer": ["flashinfer-python==0.2.12"],
"flashinfer": ["flashinfer-python==0.2.13"],
},
cmdclass=cmdclass,
package_data=package_data,
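As the Dockerfile comment notes, the FlashInfer pin must stay in sync between `FLASHINFER_GIT_REF` in docker/Dockerfile and the `"flashinfer"` extra in setup.py. A minimal sketch of a consistency check, not part of this PR, assuming the repository-root paths and the quoting style shown in the hunks above:

```python
import re
from pathlib import Path

# Hypothetical consistency check; paths assume the vLLM repository root.
dockerfile_ref = re.search(
    r'FLASHINFER_GIT_REF="v([\d.]+)"',
    Path("docker/Dockerfile").read_text(),
).group(1)
setup_pin = re.search(
    r"flashinfer-python==([\d.]+)",
    Path("setup.py").read_text(),
).group(1)

assert dockerfile_ref == setup_pin, (
    f"FlashInfer out of sync: Dockerfile v{dockerfile_ref} vs setup.py {setup_pin}"
)
```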
5 changes: 4 additions & 1 deletion vllm/model_executor/layers/quantization/mxfp4.py
@@ -520,7 +520,8 @@ def apply(
x_scale = None
else:
x_quant, x_scale = mxfp8_quantize(x, False) # to mxfp8
-x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+    *x.shape[:-1], -1)
trtllm_gen_output = trtllm_fp4_block_scale_moe(
router_logits.to(torch.bfloat16),
None, # routing_bias
@@ -549,6 +550,8 @@ def apply(
self._get_tile_tokens_dim(x, top_k),
1 if renormalize else 0, # routing_method_type, renormalize
True, # do finalize
+# TODO: use the maximum number in the cudagraph_batch_sizes
+tune_max_num_tokens=8192,
)[0]
return trtllm_gen_output
else:
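In the mxfp4 path, the MXFP8 activation scales are now reshaped to keep the batch/token dimensions instead of being flattened to 1-D before the `trtllm_fp4_block_scale_moe` call, and `tune_max_num_tokens=8192` is passed for autotuning. A minimal sketch of the layout difference, with hypothetical sizes and `torch.zeros` standing in for the real `mxfp8_quantize` output:

```python
import torch

# Hypothetical sizes; one scale byte per 32-element block of the hidden dim.
num_tokens, hidden, block = 4, 128, 32
x = torch.randn(num_tokens, hidden, dtype=torch.bfloat16)
x_scale = torch.zeros(num_tokens * hidden // block, dtype=torch.uint8)

# Old layout: everything collapsed into one dimension.
flat = x_scale.view(torch.float8_e4m3fn).reshape(-1)
print(flat.shape)        # torch.Size([16])

# New layout: batch/token dims preserved, only the per-block axis flattened.
per_token = x_scale.view(torch.float8_e4m3fn).reshape(*x.shape[:-1], -1)
print(per_token.shape)   # torch.Size([4, 4])
```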
7 changes: 4 additions & 3 deletions vllm/v1/worker/gpu_worker.py
@@ -312,6 +312,10 @@ def compile_or_warm_up_model(self) -> None:
logger.info("Compile and warming up model for size %d", size)
self.model_runner._dummy_run(size, skip_eplb=True)

+# Warm up and tune the kernels used during model execution before
+# CUDA graph capture.
+kernel_warmup(self)

if not self.model_config.enforce_eager:
self.model_runner.capture_model()

@@ -336,9 +340,6 @@ def compile_or_warm_up_model(self) -> None:
self.model_runner._dummy_sampler_run(
hidden_states=last_hidden_states)

-# Warmup kernels used during model execution
-kernel_warmup(self)

# Reset the seed to ensure that the random state is not affected by
# the model initialization and profiling.
set_random_seed(self.model_config.seed)
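With this change, `kernel_warmup` runs before CUDA graph capture rather than after the sampler warmup, so any kernel autotuning happens before the graphs are recorded. A simplified, self-contained sketch of the new ordering in `compile_or_warm_up_model` (all classes and sizes below are stubs for illustration, not the real vLLM objects):

```python
class _Runner:
    """Stub for the model runner; only the call order matters here."""

    def _dummy_run(self, size: int, skip_eplb: bool = True) -> None:
        print(f"dummy run for size {size}")

    def capture_model(self) -> None:
        print("capturing CUDA graphs")


def kernel_warmup(worker: "Worker") -> None:
    print("warming up / autotuning kernels")


class Worker:
    def __init__(self, enforce_eager: bool = False) -> None:
        self.model_runner = _Runner()
        self.enforce_eager = enforce_eager

    def compile_or_warm_up_model(self) -> None:
        for size in (1, 8, 64):                  # placeholder warmup sizes
            self.model_runner._dummy_run(size, skip_eplb=True)

        kernel_warmup(self)                      # tune/warm kernels first...

        if not self.enforce_eager:
            self.model_runner.capture_model()    # ...then capture CUDA graphs


Worker().compile_or_warm_up_model()
```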