
Commit 24d0c9e

[NVIDIA][torch.compile] Support Flashinfer TRTLLM FP8-q/kv NVFP4-out Attention Kernel (#22703)

Signed-off-by: elvischenv <[email protected]>
Co-authored-by: Luka Govedič <[email protected]>

1 parent cc7ae5e · commit 24d0c9e

27 files changed: +598 / -202 lines

benchmarks/kernels/benchmark_trtllm_decode_attention.py
Lines changed: 37 additions & 13 deletions

@@ -9,8 +9,11 @@
 import flashinfer
 import torch
 
+from vllm.utils import round_up
+
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
+FP4_DTYPE = torch.uint8
 
 
 def to_float8(x, dtype=torch.float8_e4m3fn):
@@ -61,28 +64,27 @@ def benchmark_decode(
     else:
         raise ValueError(f"Invalid kv_layout: {kv_layout}")
 
-    query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
+    # Always using 1.0 scale to reflect the real perf in benchmarking
+    q_scale = 1.0
+    ref_query = torch.randn(batch_size, num_qo_heads, head_size, dtype=dtype)
     if q_quant_dtype == FP8_DTYPE:
-        query, q_scale = to_float8(query)
-        ref_query = query.to(dtype) * q_scale
+        query, _ = to_float8(ref_query)
     else:
-        q_scale = 1.0
-        ref_query = query
+        query = ref_query
 
     kv_lens = torch.randint(1, max_seq_len, (batch_size,), dtype=torch.int32)
     kv_lens[-1] = max_seq_len
 
     seq_lens = kv_lens
     max_seq_len = torch.max(seq_lens).item()
 
-    kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
+    # Always using 1.0 scale to reflect the real perf in benchmarking
+    k_scale = v_scale = 1.0
+    ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
     if kv_quant_dtype == FP8_DTYPE:
-        kv_cache, kv_scale = to_float8(kv_cache)
-        ref_kv_cache = kv_cache.to(dtype) * kv_scale
+        kv_cache, _ = to_float8(ref_kv_cache)
     else:
-        kv_scale = 1.0
-        ref_kv_cache = kv_cache
-    k_scale = v_scale = kv_scale
+        kv_cache = ref_kv_cache
 
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
     block_tables = torch.randint(
@@ -142,11 +144,31 @@ def time_fn(fn, warmup=10, trials=20):
         return sum(times) / len(times), torch.std(torch.tensor(times))
 
     o_scale = 1.0
+    o_sf_scale = None
     output_baseline = torch.empty(ref_query.shape, dtype=dtype)
-    output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
+    if o_quant_dtype == FP4_DTYPE:
+        o_sf_scale = 500.0
+        output_trtllm = flashinfer.utils.FP4Tensor(
+            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
+            torch.empty(
+                (
+                    round_up(query.shape[0], 128),
+                    round_up(query.shape[1] * query.shape[2] // 16, 4),
+                ),
+                dtype=torch.float8_e4m3fn,
+            ),
+        )
+    else:
+        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
 
     def baseline_decode():
-        return wrapper.run(ref_query, ref_kv_cache, out=output_baseline)
+        return wrapper.run(
+            ref_query,
+            ref_kv_cache,
+            k_scale=k_scale,
+            v_scale=v_scale,
+            out=output_baseline,
+        )
 
     def trtllm_decode():
         return flashinfer.decode.trtllm_batch_decode_with_kv_cache(
@@ -158,6 +180,7 @@ def trtllm_decode():
             max_seq_len=max_seq_len,
             bmm1_scale=q_scale * k_scale * sm_scale,
             bmm2_scale=v_scale / o_scale,
+            o_sf_scale=o_sf_scale,
             out=output_trtllm,
         )
 
@@ -237,6 +260,7 @@ def write_results_to_csv(results, filename=None):
         (None, None, None),
         (None, FP8_DTYPE, None),
         (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
+        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
     ]
 
     for quant_dtype in quant_dtypes:
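
For the new NVFP4 output path, the benchmark preallocates a flashinfer.utils.FP4Tensor instead of a plain tensor: the data buffer packs two 4-bit values per uint8 byte (hence head_size // 2 on the last dimension), and the block scale factors are one FP8 value per group of 16 output elements, padded to multiples of 128 tokens and 4 scale columns. A minimal, self-contained sketch of that shape arithmetic (the example sizes are made up; only the packing and padding rules come from the diff above):

```python
# Shape arithmetic behind the FP4Tensor buffers above.
# Local stand-in for vllm.utils.round_up so the sketch runs standalone.
def round_up(x: int, multiple: int) -> int:
    return (x + multiple - 1) // multiple * multiple

# Hypothetical benchmark sizes (not from the diff).
num_tokens, num_qo_heads, head_size = 256, 64, 128

# Packed NVFP4 data: two 4-bit values share one uint8 byte along the last dim.
data_shape = (num_tokens, num_qo_heads, head_size // 2)

# FP8 block scale factors: one per 16 output elements, padded for the
# kernel's expected (128, 4)-aligned layout.
scale_shape = (
    round_up(num_tokens, 128),
    round_up(num_qo_heads * head_size // 16, 4),
)

print(data_shape)   # (256, 64, 64)
print(scale_shape)  # (256, 512)
```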

benchmarks/kernels/benchmark_trtllm_prefill_attention.py
Lines changed: 39 additions & 13 deletions

@@ -9,8 +9,11 @@
 import flashinfer
 import torch
 
+from vllm.utils import round_up
+
 FLOAT32_BYTES = torch.finfo(torch.float).bits // 8
 FP8_DTYPE = torch.float8_e4m3fn
+FP4_DTYPE = torch.uint8
 
 
 def to_float8(x, dtype=torch.float8_e4m3fn):
@@ -72,28 +75,29 @@ def benchmark_prefill(
         ]
     )
 
-    query = torch.randn(torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype)
+    # Always using 1.0 scale to reflect the real perf in benchmarking
+    q_scale = 1.0
+    ref_query = torch.randn(
+        torch.sum(q_lens).item(), num_qo_heads, head_size, dtype=dtype
+    )
     if q_quant_dtype == FP8_DTYPE:
-        query, q_scale = to_float8(query)
-        ref_query = query.to(dtype) * q_scale
+        query, _ = to_float8(ref_query)
     else:
-        q_scale = 1.0
-        ref_query = query
+        query = ref_query
 
     kv_lens = torch.randint(0, max_kv_len, (batch_size,), dtype=torch.int32)
     kv_lens[-1] = max_kv_len
 
     seq_lens = kv_lens + q_lens
     max_seq_len = torch.max(seq_lens).item()
 
-    kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
+    # Always using 1.0 scale to reflect the real perf in benchmarking
+    k_scale = v_scale = 1.0
+    ref_kv_cache = torch.randn(kv_cache_shape, dtype=dtype)
     if kv_quant_dtype == FP8_DTYPE:
-        kv_cache, kv_scale = to_float8(kv_cache)
-        ref_kv_cache = kv_cache.to(dtype) * kv_scale
+        kv_cache, _ = to_float8(ref_kv_cache)
     else:
-        kv_scale = 1.0
-        ref_kv_cache = kv_cache
-    k_scale = v_scale = kv_scale
+        kv_cache = ref_kv_cache
 
     max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
     block_tables = torch.randint(
@@ -152,11 +156,31 @@ def time_fn(fn, warmup=10, trials=20):
         return sum(times) / len(times), torch.std(torch.tensor(times))
 
     o_scale = 1.0
+    o_sf_scale = None
     output_baseline = torch.empty(ref_query.shape, dtype=dtype)
-    output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
+    if o_quant_dtype == FP4_DTYPE:
+        o_sf_scale = 500.0
+        output_trtllm = flashinfer.utils.FP4Tensor(
+            torch.empty(query.shape[:-1] + (query.shape[-1] // 2,), dtype=torch.uint8),
+            torch.empty(
+                (
+                    round_up(query.shape[0], 128),
+                    round_up(query.shape[1] * query.shape[2] // 16, 4),
+                ),
+                dtype=torch.float8_e4m3fn,
+            ),
+        )
+    else:
+        output_trtllm = torch.empty(query.shape, dtype=o_quant_dtype)
 
     def baseline_prefill():
-        return wrapper.run(ref_query, ref_kv_cache, out=output_baseline)
+        return wrapper.run(
+            ref_query,
+            ref_kv_cache,
+            k_scale=k_scale,
+            v_scale=v_scale,
+            out=output_baseline,
+        )
 
     def trtllm_prefill():
         return flashinfer.prefill.trtllm_batch_context_with_kv_cache(
@@ -172,6 +196,7 @@ def trtllm_prefill():
             batch_size=batch_size,
             cum_seq_lens_q=q_indptr,
             cum_seq_lens_kv=kv_indptr,
+            o_sf_scale=o_sf_scale,
            out=output_trtllm,
         )
 
@@ -250,6 +275,7 @@ def write_results_to_csv(results, filename=None):
         # (q_quant_dtype, kv_quant_dtype, o_quant_dtype)
         (None, None, None),
         (FP8_DTYPE, FP8_DTYPE, FP8_DTYPE),
+        (FP8_DTYPE, FP8_DTYPE, FP4_DTYPE),
     ]
 
     for quant_dtype in quant_dtypes:
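
Both benchmarks still quantize the reference query/KV tensors with the file's to_float8 helper, but they now discard the returned scale and pin q_scale and k_scale/v_scale to 1.0, so the timed kernels exercise realistic scale handling without skewing the comparison. Only the helper's signature appears in these hunks; a typical per-tensor FP8 quantization helper consistent with its usage (tensor in, (quantized tensor, scale) out) looks roughly like the sketch below, though the file's exact implementation may differ:

```python
import torch


def to_float8(x: torch.Tensor, dtype=torch.float8_e4m3fn):
    """Per-tensor symmetric FP8 quantization (sketch, not the file's exact code).

    Returns the quantized tensor and a dequantization scale such that
    x is approximately x_fp8.to(x.dtype) * scale.
    """
    finfo = torch.finfo(dtype)
    amax = x.abs().max().clamp(min=1e-12)
    scale = amax / finfo.max                          # dequant scale
    x_fp8 = (x / scale).clamp(finfo.min, finfo.max).to(dtype)
    return x_fp8, scale.float()


# Usage mirrors the benchmark: quantize, then benchmark with the scale forced to 1.0.
ref_query = torch.randn(4, 8, 128, dtype=torch.bfloat16)
query, _ = to_float8(ref_query)
```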

tests/compile/test_functionalization.py
Lines changed: 3 additions & 2 deletions

@@ -8,11 +8,12 @@
 from vllm import LLM, SamplingParams
 from vllm.compilation.activation_quant_fusion import ActivationQuantFusionPass
 from vllm.compilation.fix_functionalization import FixFunctionalizationPass
-from vllm.compilation.fusion import (FUSED_OPS, FusionPass, QuantKey,
-                                     kFp8DynamicTokenSym, kFp8StaticTensorSym)
+from vllm.compilation.fusion import FUSED_OPS, FusionPass
 from vllm.compilation.fx_utils import find_auto_fn, find_auto_fn_maybe, is_func
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.config import CompilationConfig, PassConfig, VllmConfig
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym)
 
 from .backend import TestBackend
 
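The only change to this test is where the quantization keys come from: QuantKey and the predefined kFp8StaticTensorSym / kFp8DynamicTokenSym keys are now imported from quant_utils instead of vllm.compilation.fusion. A minimal sketch of a test using the relocated symbols (the parametrization is illustrative, not taken from this file):

```python
import pytest

# New home of the quantization keys after this commit.
from vllm.model_executor.layers.quantization.utils.quant_utils import (
    QuantKey, kFp8DynamicTokenSym, kFp8StaticTensorSym)


@pytest.mark.parametrize("quant_key", [kFp8StaticTensorSym, kFp8DynamicTokenSym])
def test_relocated_quant_keys(quant_key):
    # Assumption: both predefined keys are QuantKey instances, as their names suggest.
    assert isinstance(quant_key, QuantKey)
```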

tests/compile/test_fusion.py
Lines changed: 5 additions & 5 deletions

@@ -7,11 +7,13 @@
 import vllm.envs as envs
 import vllm.plugins
 from vllm.compilation.fusion import (FUSED_OPS, QUANT_OPS, FusedRMSQuantKey,
-                                     FusionPass, GroupShape, QuantKey)
+                                     FusionPass)
 from vllm.compilation.noop_elimination import NoOpEliminationPass
 from vllm.config import (CompilationConfig, CompilationLevel, PassConfig,
                          VllmConfig)
 from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+    GroupShape, QuantKey, ScaleDesc)
 from vllm.model_executor.layers.quantization.utils.w8a8_utils import (
     CUTLASS_FP8_SUPPORTED, Fp8LinearOp, maybe_create_device_identity)
 from vllm.platforms import current_platform
@@ -30,10 +32,8 @@ def __init__(self, hidden_size: int, eps: float, static: bool,
         self.norm = [RMSNorm(hidden_size, eps) for _ in range(3)]
         self.wscale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
         group_shape = GroupShape.PER_TENSOR if static else GroupShape.PER_TOKEN
-        self.key = QuantKey(dtype=FP8_DTYPE,
-                            static=static,
-                            group_shape=group_shape,
-                            symmetric=True)
+        quant_scale = ScaleDesc(torch.float32, static, group_shape)
+        self.key = QuantKey(dtype=FP8_DTYPE, scale=quant_scale, symmetric=True)
         if static:
             self.scale = [torch.rand(1, dtype=torch.float32) for _ in range(2)]
         else: