Merge remote-tracking branch 'origin/main' into upstream_merge_2025_06_04

gshtras · gshtras · commit ccfa3b866607 · 2025-06-05T15:21:32.000Z
diff --git a/vllm/attention/ops/prefix_prefill.py b/vllm/attention/ops/prefix_prefill.py
@@ -15,6 +15,7 @@
 
 # To check compatibility
 IS_TURING = current_platform.get_device_capability() == (7, 5)
+float8_info = torch.finfo(current_platform.fp8_dtype())
 
 
 # Here's an example autotuner config for this kernel. This config does provide
@@ -83,7 +84,9 @@ def _fwd_kernel(Q,
                 SKIP_DECODE: tl.constexpr,
                 USE_FP8: tl.constexpr,
                 MAX_Q_LEN: tl.constexpr = 0,
-                MAX_CTX_LEN: tl.constexpr = 0):
+                MAX_CTX_LEN: tl.constexpr = 0,
+                FP8_MIN: tl.constexpr = float8_info.min,
+                FP8_MAX: tl.constexpr = float8_info.max):
 
     cur_batch = tl.program_id(0)
     cur_head = tl.program_id(1)
@@ -278,6 +281,7 @@ def _fwd_kernel(Q,
     out_ptrs = Out + off_o
     if USE_FP8:
         acc = acc / tl.load(out_scale)
+        acc = tl.clamp(acc, FP8_MIN, FP8_MAX)
     tl.store(out_ptrs,
              acc,
              mask=dim_mask[None, :] & (offs_m[:, None] < cur_batch_query_len))