
Commit d457cf2

rasmith authored and micah-wil committed
add additional casts into _fwd_kernel_flash_attn_v2 and _fwd_kernel_alibi
Signed-off-by: Randall Smith <[email protected]>
1 parent 5929779 · commit d457cf2

File tree

1 file changed: +2 −2 lines


vllm/attention/ops/prefix_prefill.py

Lines changed: 2 additions & 2 deletions
@@ -375,7 +375,7 @@ def _fwd_kernel_flash_attn_v2(
         bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
                      ((start_n + offs_n) // block_size) * stride_b_loc_s,
                      mask=(start_n + offs_n) < cur_batch_ctx_len,
-                     other=0)
+                     other=0).to(tl.int64)
         off_k = (
             bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
             (offs_d[:, None] // x) * stride_k_cache_d +
@@ -583,7 +583,7 @@ def _fwd_kernel_alibi(
         bn = tl.load(B_Loc + cur_batch * stride_b_loc_b +
                      ((start_n + offs_n) // block_size) * stride_b_loc_s,
                      mask=(start_n + offs_n) < cur_batch_ctx_len,
-                     other=0)
+                     other=0).to(tl.int64)
         off_k = (
             bn[None, :] * stride_k_cache_bs + cur_kv_head * stride_k_cache_h +
             (offs_d[:, None] // x) * stride_k_cache_d +
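
For context (an inference from the diff, not stated in the commit message): bn holds block-table entries loaded from B_Loc, and the off_k computation that follows multiplies them by cache strides to form pointer offsets. If the loaded values stay 32-bit, products such as bn * stride_k_cache_bs can overflow int32 once the KV cache is large enough; casting to tl.int64 before the multiply keeps the address arithmetic in 64 bits. Below is a minimal Triton sketch of the same pattern; the kernel name _gather_rows and its arguments are illustrative, not part of vllm:

import torch
import triton
import triton.language as tl


@triton.jit
def _gather_rows(table_ptr, src_ptr, out_ptr, stride_src,
                 BLOCK: tl.constexpr):
    offs = tl.arange(0, BLOCK)
    # Entries load as int32 when the table tensor is int32; cast to
    # int64 *before* multiplying by a potentially large stride so the
    # computed offset cannot wrap around at 2**31 - 1.
    idx = tl.load(table_ptr + offs).to(tl.int64)
    vals = tl.load(src_ptr + idx * stride_src)
    tl.store(out_ptr + offs, vals)


# Usage: gather values from a flat buffer via an int32 index table.
table = torch.arange(8, dtype=torch.int32, device="cuda")
src = torch.randn(8, device="cuda")
out = torch.empty_like(src)
_gather_rows[(1,)](table, src, out, 1, BLOCK=8)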
