
Commit 209b7b1

bugfix: Fix missing v_scale for prefill wrapper. (#1416)
## 📌 Description

We need k/v scales != 1 for the Llama 3 FP4 model, so the prefill wrapper must apply `v_scale` to its output the same way the decode wrapper does.

## 🚀 Pull Request Checklist

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).
1 parent 04534c7 commit 209b7b1

File tree: 4 files changed (+19 −5 lines)

β€Žflashinfer/decode.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
canonicalize_torch_dtype,
5858
device_support_pdl,
5959
get_device_sm_count,
60+
is_float8,
6061
register_custom_op,
6162
register_fake_op,
6263
)
@@ -1318,8 +1319,8 @@ def run(
13181319
self._cached_module.run(*run_args)
13191320
if v_scale is not None:
13201321
# TODO(Zihao): fused into kernel
1321-
if out.itemsize == 1:
1322-
out = (out.to(float) * v_scale).to(out.dtype)
1322+
if is_float8(out):
1323+
out = (out.to(torch.float32) * v_scale).to(out.dtype)
13231324
else:
13241325
out *= v_scale
13251326
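The change matters for FP8 outputs because the old `out.itemsize == 1` test also matches non-FP8 one-byte dtypes, and `out.to(float)` upcasts to float64 rather than float32. A minimal sketch of the scaling pattern, assuming a PyTorch build with FP8 dtypes; the `is_float8` stand-in below is an assumption about what FlashInfer's imported helper checks:

```python
import torch


def is_float8(x: torch.Tensor) -> bool:
    # Stand-in for FlashInfer's helper: true for either FP8 dtype.
    return x.dtype in (torch.float8_e4m3fn, torch.float8_e5m2)


def apply_v_scale(out: torch.Tensor, v_scale: float) -> torch.Tensor:
    """Apply v_scale to attention output after the kernel has run."""
    if is_float8(out):
        # FP8 tensors do not support direct arithmetic, so upcast to
        # float32, scale, and cast back to the original dtype.
        return (out.to(torch.float32) * v_scale).to(out.dtype)
    out *= v_scale  # wider dtypes can be scaled in place
    return out
```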

β€Žflashinfer/prefill.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2092,7 +2092,12 @@ def run(
20922092
]
20932093

20942094
self._cached_module.paged_run(*run_args)
2095-
2095+
if v_scale is not None:
2096+
# TODO(Zihao): fused into kernel
2097+
if is_float8(out):
2098+
out = (out.to(torch.float32) * v_scale).to(out.dtype)
2099+
else:
2100+
out *= v_scale
20962101
return (out, lse) if return_lse else out
20972102

20982103
run_return_lse = functools.partialmethod(run, return_lse=True)
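Applying `v_scale` after `paged_run` is equivalent to scaling V inside the kernel because attention output is linear in V: the softmax weights depend only on Q and K. A quick sanity check of that identity, with made-up shapes and an arbitrary scale:

```python
import torch

q, k, v = torch.randn(4, 8), torch.randn(6, 8), torch.randn(6, 8)
v_scale = 0.5  # arbitrary scale for the check

weights = torch.softmax(q @ k.T / 8**0.5, dim=-1)  # depends only on Q and K
fused = weights @ (v_scale * v)     # scale folded into V, as a kernel would do
emulated = v_scale * (weights @ v)  # scale applied to the output afterwards
assert torch.allclose(fused, emulated)
```

No such shortcut exists for `k_scale`, which enters through the softmax, so it still has to be handled before or inside the kernel.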

β€Žtests/test_trtllm_gen_context.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -489,6 +489,10 @@ def test_trtllm_batch_prefill(
489489
k_scale=k_scale,
490490
v_scale=v_scale / o_scale,
491491
)
492-
# v_scale, o_scale is not supported in wrapper api yet.
492+
# v_scale, o_scale in wrapper is emulated by multiplying output by v_scale instead of fused into kernel.
493493
if v_scale == o_scale == 1.0:
494494
assert (output2 == output).all()
495+
else:
496+
torch.testing.assert_close(
497+
output.float(), output2.float(), rtol=1e-1, atol=1e-1
498+
)
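Bitwise equality only holds when no post-kernel scaling happens; otherwise the wrapper's float32 multiply rounds differently from the fused reference path, so the test switches to `torch.testing.assert_close`, which accepts elementwise differences up to `atol + rtol * |expected|`. A tiny illustration of those semantics, with invented values:

```python
import torch

expected = torch.tensor([1.0, 10.0])
actual = torch.tensor([1.05, 10.5])

# Passes: |actual - expected| <= 1e-1 + 1e-1 * |expected| elementwise
# (0.05 <= 0.2 and 0.5 <= 1.1).
torch.testing.assert_close(actual, expected, rtol=1e-1, atol=1e-1)
```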

β€Žtests/test_trtllm_gen_decode.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -377,9 +377,13 @@ def test_trtllm_batch_decode_fmha(
377377
k_scale=k_scale,
378378
v_scale=v_scale / o_scale,
379379
)
380-
# v_scale, o_scale is not supported in wrapper api yet.
380+
# v_scale, o_scale in wrapper is emulated by multiplying output by v_scale instead of fused into kernel.
381381
if v_scale == o_scale == 1.0:
382382
assert (output2 == output).all()
383+
else:
384+
torch.testing.assert_close(
385+
output.float(), output2.float(), rtol=1e-1, atol=1e-1
386+
)
383387

384388

385389
@pytest.mark.parametrize(

0 commit comments
