From 82de5e09ee38510e267a0a15578b21edb5b707d0 Mon Sep 17 00:00:00 2001
From: Siyuan Fu <siyuanf@nvidia.com>
Date: Tue, 19 Aug 2025 14:45:03 -0700
Subject: [PATCH 01/13] fix after flashinfer autotuner

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 9 ++++++++-
 vllm/v1/worker/gpu_worker.py                     | 6 +++---
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 6a190ebbc063..07ed8e27a37a 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -113,6 +113,8 @@ def __init__(self, moe: FusedMoEConfig):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
+        self.device_support_pdl = current_platform.is_cuda(
+        ) and current_platform.has_device_capability(90)
 
         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(
@@ -520,7 +522,8 @@ def apply(
                 x_scale = None
             else:
                 x_quant, x_scale = mxfp8_quantize(x, False)  # to mxfp8
-                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(-1)
+                x_scale = x_scale.view(torch.float8_e4m3fn).reshape(
+                    *x.shape[:-1], -1)
             trtllm_gen_output = trtllm_fp4_block_scale_moe(
                 router_logits.to(torch.bfloat16),
                 None,  # routing_bias
@@ -549,6 +552,10 @@ def apply(
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
+                self.device_support_pdl,
+                None,  # output
+                # TODO: use the maximum number in the cudagraph_batch_sizes
+                8192,  # tune_max_num_tokens.
             )[0]
             return trtllm_gen_output
         else:
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index f83a4f4faeb5..e30b6e86842e 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -311,6 +311,9 @@ def compile_or_warm_up_model(self) -> None:
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size, skip_eplb=True)
 
+        # run autotuner before cuda graph capture.
+        kernel_warmup(self)
+
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
 
@@ -335,9 +338,6 @@ def compile_or_warm_up_model(self) -> None:
                 self.model_runner._dummy_sampler_run(
                     hidden_states=last_hidden_states)
 
-        # Warmup kernels used during model execution
-        kernel_warmup(self)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)

From bfc1f205da39beecbade860a651909f717f82305 Mon Sep 17 00:00:00 2001
From: Siyuan Fu <siyuanf@nvidia.com>
Date: Tue, 19 Aug 2025 15:41:06 -0700
Subject: [PATCH 02/13] add warmup

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 vllm/model_executor/warmup/kernel_warmup.py | 9 +++++----
 vllm/v1/worker/gpu_worker.py                | 5 ++++-
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index 761172e4d361..a30c469589a9 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -20,7 +20,7 @@
     from vllm.v1.worker.gpu_worker import Worker
 
 
-def kernel_warmup(worker: "Worker"):
+def kernel_warmup(worker: "Worker", do_autotune: bool = False):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM
                            and is_deep_gemm_supported()
@@ -32,10 +32,11 @@ def kernel_warmup(worker: "Worker"):
 
     # FlashInfer autotune for Blackwell (SM 10.0) GPUs
     if has_flashinfer() and current_platform.is_device_capability(100):
-        flashinfer_autotune(worker.model_runner)
+        flashinfer_autotune(worker.model_runner, do_autotune)
 
 
-def flashinfer_autotune(runner: "GPUModelRunner") -> None:
+def flashinfer_autotune(runner: "GPUModelRunner",
+                        do_autotune: bool = True) -> None:
     """
     Autotune FlashInfer operations.
     FlashInfer have many implementations for the same operation,
@@ -47,7 +48,7 @@ def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     """
     from vllm.utils.flashinfer import autotune
 
-    with torch.inference_mode(), autotune():
+    with torch.inference_mode(), autotune(do_autotune):
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index e30b6e86842e..119c5d09938c 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -312,7 +312,7 @@ def compile_or_warm_up_model(self) -> None:
             self.model_runner._dummy_run(size, skip_eplb=True)
 
         # run autotuner before cuda graph capture.
-        kernel_warmup(self)
+        kernel_warmup(self, do_autotune=True)
 
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
@@ -338,6 +338,9 @@ def compile_or_warm_up_model(self) -> None:
                 self.model_runner._dummy_sampler_run(
                     hidden_states=last_hidden_states)
 
+        # Warmup kernels used during model execution
+        kernel_warmup(self, do_autotune=False)
+
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)

From 4cd9551e3520063689f25e42655bfda2e68362ea Mon Sep 17 00:00:00 2001
From: Siyuan Fu <siyuanf@nvidia.com>
Date: Wed, 20 Aug 2025 09:44:27 -0700
Subject: [PATCH 03/13] address comment

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 vllm/model_executor/warmup/kernel_warmup.py | 9 ++++-----
 vllm/v1/worker/gpu_worker.py                | 8 +++-----
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/vllm/model_executor/warmup/kernel_warmup.py b/vllm/model_executor/warmup/kernel_warmup.py
index a30c469589a9..761172e4d361 100644
--- a/vllm/model_executor/warmup/kernel_warmup.py
+++ b/vllm/model_executor/warmup/kernel_warmup.py
@@ -20,7 +20,7 @@
     from vllm.v1.worker.gpu_worker import Worker
 
 
-def kernel_warmup(worker: "Worker", do_autotune: bool = False):
+def kernel_warmup(worker: "Worker"):
     # Deep GEMM warmup
     do_deep_gemm_warmup = (envs.VLLM_USE_DEEP_GEMM
                            and is_deep_gemm_supported()
@@ -32,11 +32,10 @@ def kernel_warmup(worker: "Worker", do_autotune: bool = False):
 
     # FlashInfer autotune for Blackwell (SM 10.0) GPUs
     if has_flashinfer() and current_platform.is_device_capability(100):
-        flashinfer_autotune(worker.model_runner, do_autotune)
+        flashinfer_autotune(worker.model_runner)
 
 
-def flashinfer_autotune(runner: "GPUModelRunner",
-                        do_autotune: bool = True) -> None:
+def flashinfer_autotune(runner: "GPUModelRunner") -> None:
     """
     Autotune FlashInfer operations.
     FlashInfer have many implementations for the same operation,
@@ -48,7 +47,7 @@ def flashinfer_autotune(runner: "GPUModelRunner",
     """
     from vllm.utils.flashinfer import autotune
 
-    with torch.inference_mode(), autotune(do_autotune):
+    with torch.inference_mode(), autotune():
         # We skip EPLB here since we don't want to record dummy metrics
         # When autotuning with number of tokens m, flashinfer will autotune
         # operations for all number of tokens up to m.
diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py
index 119c5d09938c..0c9c745dab9d 100644
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -311,8 +311,9 @@ def compile_or_warm_up_model(self) -> None:
             logger.info("Compile and warming up model for size %d", size)
             self.model_runner._dummy_run(size, skip_eplb=True)
 
-        # run autotuner before cuda graph capture.
-        kernel_warmup(self, do_autotune=True)
+        # Warmup and tune the kernels used during model execution before
+        # cuda graph capture.
+        kernel_warmup(self)
 
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
@@ -338,9 +339,6 @@ def compile_or_warm_up_model(self) -> None:
                 self.model_runner._dummy_sampler_run(
                     hidden_states=last_hidden_states)
 
-        # Warmup kernels used during model execution
-        kernel_warmup(self, do_autotune=False)
-
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)

From 57e4e21b11b6bad62f53020a1a6e1cbb54f82036 Mon Sep 17 00:00:00 2001
From: siyuanf <siyuanf@nvidia.com>
Date: Wed, 20 Aug 2025 23:39:09 -0700
Subject: [PATCH 04/13] Update flashinfer tag

Signed-off-by: siyuanf <siyuanf@nvidia.com>
---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ca6e0a8592cc..1cfca906da48 100644
--- a/setup.py
+++ b/setup.py
@@ -694,7 +694,7 @@ def _read_requirements(filename: str) -> list[str]:
                   "mistral_common[audio]"],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.12"],
+        "flashinfer": ["flashinfer-python==0.2.13"],
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },

From 8764384d4a7232df3405e64cd674585734def0fc Mon Sep 17 00:00:00 2001
From: Siyuan Fu <siyuanf@nvidia.com>
Date: Thu, 21 Aug 2025 11:24:49 -0700
Subject: [PATCH 05/13] address comment

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 07ed8e27a37a..036127a4b0de 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -552,10 +552,8 @@ def apply(
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
-                self.device_support_pdl,
-                None,  # output
                 # TODO: use the maximum number in the cudagraph_batch_sizes
-                8192,  # tune_max_num_tokens.
+                tune_max_num_tokens=8192,
             )[0]
             return trtllm_gen_output
         else:

From e9f6e126dfd262f9cc916d078923f61ce40fa4d9 Mon Sep 17 00:00:00 2001
From: Siyuan Fu <siyuanf@nvidia.com>
Date: Thu, 21 Aug 2025 11:26:42 -0700
Subject: [PATCH 06/13] address comment

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 036127a4b0de..90839ec9ccf4 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -113,8 +113,6 @@ def __init__(self, moe: FusedMoEConfig):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
-        self.device_support_pdl = current_platform.is_cuda(
-        ) and current_platform.has_device_capability(90)
 
         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(

From 31bac6a84995f3c8ca2a5b37a341f4c9197940ea Mon Sep 17 00:00:00 2001
From: Siyuan Fu <siyuanf@nvidia.com>
Date: Thu, 21 Aug 2025 14:29:55 -0700
Subject: [PATCH 07/13] update dockerfile

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 docker/Dockerfile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 839ac501dbaf..57100fadab74 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.12"
+ARG FLASHINFER_GIT_REF="v0.2.13"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...

From 74c5e7bf6abae7a3698254cc1a8628bda2d2cb3c Mon Sep 17 00:00:00 2001
From: Siyuan Fu <siyuanf@nvidia.com>
Date: Fri, 22 Aug 2025 09:22:38 -0700
Subject: [PATCH 08/13] address todo

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 90839ec9ccf4..f719af76cc69 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -6,6 +6,7 @@
 from torch.nn.parameter import Parameter
 
 from vllm import envs
+from vllm.config import get_current_vllm_config
 from vllm.logger import init_logger
 from vllm.model_executor.layers.fused_moe import (FusedMoE, FusedMoEConfig,
                                                   FusedMoEMethodBase)
@@ -113,6 +114,8 @@ def __init__(self, moe: FusedMoEConfig):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
+        self.max_captute_size = get_current_vllm_config(
+        ).compilation_config.max_capture_size
 
         if current_platform.is_device_capability(100) and not has_flashinfer():
             logger.warning_once(
@@ -551,7 +554,7 @@ def apply(
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
                 # TODO: use the maximum number in the cudagraph_batch_sizes
-                tune_max_num_tokens=8192,
+                tune_max_num_tokens=self.max_captute_size,
             )[0]
             return trtllm_gen_output
         else:

From e635283df40d1e557a0c6b84129b97780e940375 Mon Sep 17 00:00:00 2001
From: Siyuan Fu <siyuanf@nvidia.com>
Date: Fri, 22 Aug 2025 09:24:00 -0700
Subject: [PATCH 09/13] address todo

Signed-off-by: Siyuan Fu <siyuanf@nvidia.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index f719af76cc69..354c715c2bec 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -553,7 +553,6 @@ def apply(
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
-                # TODO: use the maximum number in the cudagraph_batch_sizes
                 tune_max_num_tokens=self.max_captute_size,
             )[0]
             return trtllm_gen_output

From c0a5e4b76a56876c2f768fcb764bfc0feae98c15 Mon Sep 17 00:00:00 2001
From: Weiliang Liu <weiliangl@nvidia.com>
Date: Mon, 25 Aug 2025 06:10:07 +0000
Subject: [PATCH 10/13] Fix flashinfer swizzle enum name.

Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
---
 vllm/compilation/collective_fusion.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/compilation/collective_fusion.py b/vllm/compilation/collective_fusion.py
index 6ae50245ed3a..c44ac8e0aa7e 100644
--- a/vllm/compilation/collective_fusion.py
+++ b/vllm/compilation/collective_fusion.py
@@ -465,7 +465,8 @@ def call_trtllm_fused_allreduce_norm(
                 quant_out=quant_out,
                 scale_out=scale_out,
                 # in vllm we only support swizzled layout
-                layout_code=flashinfer_comm.FP4QuantizationSFLayout.SWIZZLED,
+                layout_code=flashinfer_comm.QuantizationSFLayout.
+                SWIZZLED_128x4,
                 scale_factor=scale_factor,
             )
         else:

From d522d9ad597a3e306dadfe8cdbe4ee7d52a106b0 Mon Sep 17 00:00:00 2001
From: Weiliang Liu <weiliangl@nvidia.com>
Date: Mon, 25 Aug 2025 06:30:23 +0000
Subject: [PATCH 11/13] update flashinfer to 0.2.14.post1

Signed-off-by: Weiliang Liu <weiliangl@nvidia.com>
---
 docker/Dockerfile | 2 +-
 setup.py          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docker/Dockerfile b/docker/Dockerfile
index 57100fadab74..2e272cbca841 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -373,7 +373,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 # Install FlashInfer from source
 ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
 # Keep this in sync with "flashinfer" extra in setup.py
-ARG FLASHINFER_GIT_REF="v0.2.13"
+ARG FLASHINFER_GIT_REF="v0.2.14.post1"
 # Flag to control whether to compile FlashInfer AOT kernels
 # Set to "true" to enable AOT compilation:
 # docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
diff --git a/setup.py b/setup.py
index 1cfca906da48..ffe8ec4e79af 100644
--- a/setup.py
+++ b/setup.py
@@ -694,7 +694,7 @@ def _read_requirements(filename: str) -> list[str]:
                   "mistral_common[audio]"],  # Required for audio processing
         "video": [],  # Kept for backwards compatibility
         # FlashInfer should be updated together with the Dockerfile
-        "flashinfer": ["flashinfer-python==0.2.13"],
+        "flashinfer": ["flashinfer-python==0.2.14.post1"],
         # Optional deps for AMD FP4 quantization support
         "petit-kernel": ["petit-kernel"],
     },

From 7313a92c9a950060c2018732aae65aef56d728d3 Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Mon, 25 Aug 2025 11:08:37 -0400
Subject: [PATCH 12/13] Apply suggestion from @gemini-code-assist[bot]

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index 354c715c2bec..ced1f7c01d8b 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -553,7 +553,7 @@ def apply(
                 self._get_tile_tokens_dim(x, top_k),
                 1 if renormalize else 0,  # routing_method_type, renormalize
                 True,  # do finalize
-                tune_max_num_tokens=self.max_captute_size,
+                tune_max_num_tokens=self.max_capture_size,
             )[0]
             return trtllm_gen_output
         else:

From 8548148a42aad36ef17e717b9d66369fa562f60e Mon Sep 17 00:00:00 2001
From: Michael Goin <mgoin64@gmail.com>
Date: Mon, 25 Aug 2025 11:08:45 -0400
Subject: [PATCH 13/13] Apply suggestion from @gemini-code-assist[bot]

Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
Signed-off-by: Michael Goin <mgoin64@gmail.com>
---
 vllm/model_executor/layers/quantization/mxfp4.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/layers/quantization/mxfp4.py b/vllm/model_executor/layers/quantization/mxfp4.py
index ced1f7c01d8b..df96e5d8c413 100644
--- a/vllm/model_executor/layers/quantization/mxfp4.py
+++ b/vllm/model_executor/layers/quantization/mxfp4.py
@@ -114,7 +114,7 @@ def __init__(self, moe: FusedMoEConfig):
         self.topk_indices_dtype = None
         self.moe = moe
         self.use_marlin = self._should_use_marlin()
-        self.max_captute_size = get_current_vllm_config(
+        self.max_capture_size = get_current_vllm_config(
         ).compilation_config.max_capture_size
 
         if current_platform.is_device_capability(100) and not has_flashinfer():