
Commit f5bc341

weireweire, IwakuraRein, mgoin, and gemini-code-assist[bot] authored and committed
Update Flashinfer to 0.2.14.post1 (vllm-project#23537)
Signed-off-by: Siyuan Fu <[email protected]>
Signed-off-by: siyuanf <[email protected]>
Signed-off-by: Weiliang Liu <[email protected]>
Signed-off-by: Michael Goin <[email protected]>
Co-authored-by: Siyuan Fu <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: tc-mb <[email protected]>
1 parent ec1705c commit f5bc341

File tree

5 files changed: +14 / -7 lines


docker/Dockerfile

Lines changed: 1 addition & 1 deletion
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.12"
+ARG FLASHINFER_GIT_REF="v0.2.14.post1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 #   docker build --build-arg FLASHINFER_AOT_COMPILE=true ...

setup.py

Lines changed: 1 addition & 1 deletion
@@ -694,7 +694,7 @@ def _read_requirements(filename: str) -> list[str]:
               "mistral_common[audio]"],  # Required for audio processing
     "video": [],  # Kept for backwards compatibility
     # FlashInfer should be updated together with the Dockerfile
-    "flashinfer": ["flashinfer-python==0.2.12"],
+    "flashinfer": ["flashinfer-python==0.2.14.post1"],
     # Optional deps for AMD FP4 quantization support
     "petit-kernel": ["petit-kernel"],
 },
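
The FlashInfer version is pinned in two places, the Dockerfile ARG above and this "flashinfer" extra, which is why both comments ask to keep them in sync. Below is a minimal sketch, not part of the commit, of how the pin could be checked against the installed package at runtime; the package name flashinfer-python comes from setup.py, while the function name and messages are illustrative assumptions.

# Sketch only: compare the installed flashinfer-python against the pinned version.
from importlib.metadata import PackageNotFoundError, version

EXPECTED_FLASHINFER = "0.2.14.post1"  # keep in sync with setup.py and the Dockerfile

def check_flashinfer_pin() -> None:
    """Print a warning when the installed flashinfer-python drifts from the pin."""
    try:
        installed = version("flashinfer-python")
    except PackageNotFoundError:
        print("flashinfer-python not installed; the 'flashinfer' extra is optional")
        return
    if installed != EXPECTED_FLASHINFER:
        print(f"flashinfer-python {installed} installed, expected {EXPECTED_FLASHINFER}")

check_flashinfer_pin()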

vllm/compilation/collective_fusion.py

Lines changed: 2 additions & 1 deletion
@@ -465,7 +465,8 @@ def call_trtllm_fused_allreduce_norm(
             quant_out=quant_out,
             scale_out=scale_out,
             # in vllm we only support swizzled layout
-            layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED,
+            layout_code=flashinfer_comm.QuantizationSFLayout.
+            SWIZZLED_128x4,
             scale_factor=scale_factor,
         )
     else:
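
FlashInfer 0.2.14.post1 renamed the scale-factor layout enum used at this call site, so the argument switches from FP4QuantizationSFLayout.SWIZZLED to QuantizationSFLayout.SWIZZLED_128x4. The sketch below is an assumption rather than code from this commit: it takes flashinfer_comm to be the usual alias for flashinfer.comm and shows how a caller that must support both FlashInfer versions could resolve the enum defensively.

# Sketch only: pick whichever swizzled-layout enum the installed FlashInfer exposes.
import flashinfer.comm as flashinfer_comm

def swizzled_layout_code():
    """Return the swizzled scale-factor layout for old or new FlashInfer."""
    new_enum = getattr(flashinfer_comm, "QuantizationSFLayout", None)
    if new_enum is not None:
        return new_enum.SWIZZLED_128x4  # FlashInfer >= 0.2.14.post1
    # Older releases (e.g. 0.2.12) used the FP4-prefixed enum name.
    return flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED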

vllm/model_executor/layers/quantization/mxfp4.py

Lines changed: 6 additions & 1 deletion
@@ -6,6 +6,7 @@
 from torch.nn.parameter import Parameter

 from vllm import envs
+from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                   FusedMoEMethodBase)
@@ -113,6 +114,8 @@ def __init__(self, moe: FusedMoEConfig):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
+        self.max_capture_size = get_current_vllm_config(
+        ).compilation_config.max_capture_size

         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(
@@ -520,7 +523,8 @@ def apply(
                 x_scale = None
             else:
                 x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                    *x.shape[:-1], -1)
             trtllm_gen_output = trtllm_fp4_block_scale_moe(
                 router_logits.to(torch.bfloat16),
                 None,  # routing_bias
@@ -549,6 +553,7 @@ def apply(
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
+                tune_max_num_tokens=self.max_capture_size,
             )[0]
             return trtllm_gen_output
         else:
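
The x_scale change keeps the MXFP8 scale factors grouped per token instead of flattening them into a 1-D tensor. The snippet below only illustrates the shape difference; the 32-element block size and the integer stand-in for the scale tensor are assumptions, not values taken from mxfp8_quantize.

# Sketch only: old vs. new scale-factor layout for a (batch, hidden) activation.
import torch

batch, hidden, block = 4, 128, 32  # block size assumed for illustration
x = torch.randn(batch, hidden, dtype=torch.bfloat16)
# Stand-in for the per-block scale factors produced by quantization.
x_scale = torch.randint(0, 255, (batch, hidden // block), dtype=torch.uint8)

old_layout = x_scale.reshape(-1)                 # flat: (batch * hidden // block,)
new_layout = x_scale.reshape(*x.shape[:-1], -1)  # per token: (batch, hidden // block)
print(old_layout.shape, new_layout.shape)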

vllm/v1/worker/gpu_worker.py

Lines changed: 4 additions & 3 deletions
@@ -310,6 +310,10 @@ def compile_or_warm_up_model(self) -> None:
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size, skip_eplb=True)

+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)
+
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()

@@ -334,9 +338,6 @@ def compile_or_warm_up_model(self) -> None:
             self.model_runner._dummy_sampler_run(
                 hidden_states=last_hidden_states)

-        # Warmup kernels used during model execution
-        kernel_warmup(self)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
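
Moving kernel_warmup ahead of capture_model means kernel tuning (including the tune_max_num_tokens path added above) runs before CUDA graphs are recorded, so the captured graphs replay the tuned kernels. The following is a minimal ordering sketch; the callables are placeholders, not vLLM APIs.

# Sketch only: warm up and tune kernels before capturing CUDA graphs.
def compile_or_warm_up(sizes, enforce_eager, dummy_run, kernel_warmup, capture_model):
    """Illustrative ordering of warmup, tuning, and graph capture."""
    for size in sizes:
        dummy_run(size)      # compile / warm up each batch size
    kernel_warmup()          # tune kernels now, so capture sees the tuned paths
    if not enforce_eager:
        capture_model()      # CUDA graph capture records the tuned kernels

compile_or_warm_up(
    sizes=[1, 2, 4],
    enforce_eager=False,
    dummy_run=lambda s: print(f"dummy run for size {s}"),
    kernel_warmup=lambda: print("kernel warmup / tuning"),
    capture_model=lambda: print("capturing cuda graphs"),
)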
