
Commit 7bf92f9

Merge remote-tracking branch 'upstream/main' into upstream_merge_2025_05_29
2 parents: bee14ca + c290340

15 files changed: +70 additions, −179 deletions


csrc/attention/paged_attention_v1.cu

Lines changed: 9 additions & 28 deletions

@@ -48,7 +48,7 @@
 // TODO(woosuk): Tune NUM_THREADS.
 template <typename T, typename CACHE_T, int BLOCK_SIZE,
           vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
-          int NUM_THREADS>
+          int NUM_THREADS = 128>
 void paged_attention_v1_launcher(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
     torch::Tensor& value_cache, int num_kv_heads, float scale,
@@ -133,38 +133,19 @@ void paged_attention_v1_launcher(
   }
 }
 
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE, \
-                         NUM_THREADS) \
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
   paged_attention_v1_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, \
-                              IS_BLOCK_SPARSE, NUM_THREADS>( \
+                              IS_BLOCK_SPARSE>( \
       out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
       seq_lens, max_seq_len, alibi_slopes, k_scale, v_scale, tp_rank, \
       blocksparse_local_blocks, blocksparse_vert_stride, \
       blocksparse_block_size, blocksparse_head_sliding_step);
 
-#define CALL_V1_LAUNCHER_W_NUM_THREADS(T, CACHE_T, BLOCK_SIZE, \
-                                       IS_FP8_KV_CACHE, IS_BLOCK_SPARSE) \
-  switch (num_threads) { \
-    case 128: \
-      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, \
-                       IS_BLOCK_SPARSE, 128); \
-      break; \
-    case 1024: \
-      CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, \
-                       IS_BLOCK_SPARSE, 1024); \
-      break; \
-    default: \
-      TORCH_CHECK(false, "Unsupported num threads: ", num_threads); \
-      break; \
-  }
-
-#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
-  if (is_block_sparse) { \
-    CALL_V1_LAUNCHER_W_NUM_THREADS(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, \
-                                   true); \
-  } else { \
-    CALL_V1_LAUNCHER_W_NUM_THREADS(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, \
-                                   false); \
+#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+  if (is_block_sparse) { \
+    CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+  } else { \
+    CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
   }
 
 // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
@@ -202,7 +183,7 @@ void paged_attention_v1(
     torch::Tensor& v_scale, const int64_t tp_rank,
     const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step, const int64_t num_threads) {
+    const int64_t blocksparse_head_sliding_step) {
   const bool is_block_sparse = (blocksparse_vert_stride > 1);
 
   DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,

csrc/attention/paged_attention_v2.cu

Lines changed: 9 additions & 28 deletions

@@ -48,7 +48,7 @@
 
 template <typename T, typename CACHE_T, int BLOCK_SIZE,
           vllm::Fp8KVCacheDataType KV_DTYPE, bool IS_BLOCK_SPARSE,
-          int NUM_THREADS, int PARTITION_SIZE = 512>
+          int NUM_THREADS = 128, int PARTITION_SIZE = 512>
 void paged_attention_v2_launcher(
     torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
@@ -139,39 +139,20 @@ void paged_attention_v2_launcher(
   }
 }
 
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE, \
-                         NUM_THREADS, PARTITION_SIZE) \
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
   paged_attention_v2_launcher<T, CACHE_T, BLOCK_SIZE, KV_DTYPE, \
-                              IS_BLOCK_SPARSE, NUM_THREADS, PARTITION_SIZE>( \
+                              IS_BLOCK_SPARSE>( \
       out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
       num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
      k_scale, v_scale, tp_rank, blocksparse_local_blocks, \
      blocksparse_vert_stride, blocksparse_block_size, \
      blocksparse_head_sliding_step);
 
-#define CALL_V2_LAUNCHER_W_NUM_THREADS(T, CACHE_T, BLOCK_SIZE, \
-                                       IS_FP8_KV_CACHE, IS_BLOCK_SPARSE) \
-  switch (num_threads) { \
-    case 128: \
-      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, \
-                       IS_BLOCK_SPARSE, 128, 512); \
-      break; \
-    case 1024: \
-      CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, \
-                       IS_BLOCK_SPARSE, 1024, 1024); \
-      break; \
-    default: \
-      TORCH_CHECK(false, "Unsupported num threads: ", num_threads); \
-      break; \
-  }
-
-#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
-  if (is_block_sparse) { \
-    CALL_V2_LAUNCHER_W_NUM_THREADS(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, \
-                                   true); \
-  } else { \
-    CALL_V2_LAUNCHER_W_NUM_THREADS(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, \
-                                   false); \
+#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+  if (is_block_sparse) { \
+    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+  } else { \
+    CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
   }
 
 // NOTE(woosuk): To reduce the compilation time, we omitted block sizes
@@ -213,7 +194,7 @@ void paged_attention_v2(
     torch::Tensor& v_scale, const int64_t tp_rank,
     const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step, const int64_t num_threads) {
+    const int64_t blocksparse_head_sliding_step) {
   const bool is_block_sparse = (blocksparse_vert_stride > 1);
   DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
                              CALL_V2_LAUNCHER_BLOCK_SIZE)

csrc/ops.h

Lines changed: 2 additions & 2 deletions

@@ -38,7 +38,7 @@ void paged_attention_v1(
     torch::Tensor& v_scale, const int64_t tp_rank,
     const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step, const int64_t num_threads);
+    const int64_t blocksparse_head_sliding_step);
 
 void paged_attention_v2(
     torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
@@ -50,7 +50,7 @@ void paged_attention_v2(
     torch::Tensor& v_scale, const int64_t tp_rank,
     const int64_t blocksparse_local_blocks,
     const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
-    const int64_t blocksparse_head_sliding_step, const int64_t num_threads);
+    const int64_t blocksparse_head_sliding_step);
 
 #ifndef USE_ROCM
 void merge_attn_states(torch::Tensor& output,

csrc/torch_bindings.cpp

Lines changed: 2 additions & 4 deletions

@@ -47,8 +47,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
       " int tp_rank, int blocksparse_local_blocks,"
       " int blocksparse_vert_stride, int blocksparse_block_size,"
-      " int blocksparse_head_sliding_step,"
-      " int num_threads) -> ()");
+      " int blocksparse_head_sliding_step) -> ()");
   ops.impl("paged_attention_v1", torch::kCUDA, &paged_attention_v1);
 
   // PagedAttention V2.
@@ -62,8 +61,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
       " str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
      " int tp_rank, int blocksparse_local_blocks,"
      " int blocksparse_vert_stride, int blocksparse_block_size,"
-      " int blocksparse_head_sliding_step,"
-      " int num_threads) -> ()");
+      " int blocksparse_head_sliding_step) -> ()");
   ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
 
 #ifndef USE_ROCM
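
With num_threads dropped from the registered schema, Python callers now pass only the arguments shown above. Below is a minimal sketch of the updated calling convention, following the argument order used by the opcheck tuples in tests/kernels/attention/test_attention.py further down; the wrapper name and the placeholder block-sparse values are illustrative, not part of this commit.

import torch

def call_paged_attention_v1(output, query, key_cache, value_cache,
                            num_kv_heads, scale, block_tables, seq_lens,
                            block_size, max_seq_len, alibi_slopes,
                            kv_cache_dtype, k_scale, v_scale):
    # Sketch only: assumes vLLM's _C extension is already loaded so that
    # torch.ops._C.paged_attention_v1 is registered. The trailing
    # num_threads int is gone; NUM_THREADS is now a compile-time template
    # default (128) in the CUDA launcher.
    torch.ops._C.paged_attention_v1(
        output, query, key_cache, value_cache, num_kv_heads, scale,
        block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
        kv_cache_dtype, k_scale, v_scale,
        0,    # tp_rank
        0,    # blocksparse_local_blocks
        0,    # blocksparse_vert_stride
        64,   # blocksparse_block_size
        0)    # blocksparse_head_sliding_step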

tests/entrypoints/llm/test_init.py

Lines changed: 0 additions & 24 deletions
This file was deleted.

tests/kernels/attention/test_attention.py

Lines changed: 15 additions & 20 deletions

@@ -10,7 +10,7 @@
 from tests.kernels.utils import opcheck
 from vllm import _custom_ops as ops
 from vllm.platforms import current_platform
-from vllm.utils import get_max_shared_memory_bytes, is_navi
+from vllm.utils import get_max_shared_memory_bytes
 
 if not current_platform.is_rocm():
     from xformers import ops as xops
@@ -37,7 +37,7 @@
 
 # This should be sync with get_supported_head_sizes() in
 # vllm.attention.ops.paged_attn.PagedAttention
-HEAD_SIZES = [64, 80, 96, 112, 120, 128, 192, 256]
+HEAD_SIZES = [32, 64, 80, 96, 112, 120, 128, 192, 256]
 
 BLOCK_SIZES = [16, 32]
 USE_ALIBI = [False, True]
@@ -195,10 +195,6 @@ def test_paged_attention(
     # Using default kv_scale
     k_scale = v_scale = torch.tensor(1.0, dtype=torch.float32, device=device)
 
-    # additional argument for v1/v2 pa kernel
-    num_threads = 1024 if current_platform.is_rocm() \
-        and not is_navi() else 128
-
     # Call the paged attention kernel.
     output = torch.empty_like(query)
     if version == "v1":
@@ -219,12 +215,12 @@
             v_scale,
         )
 
-        opcheck(
-            torch.ops._C.paged_attention_v1,
-            (output, query, key_cache, value_cache, num_kv_heads, scale,
-             block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
-             kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0, num_threads),
-            cond=(head_size == HEAD_SIZES[0] and block_size == BLOCK_SIZES[0]))
+        opcheck(torch.ops._C.paged_attention_v1,
+                (output, query, key_cache, value_cache, num_kv_heads, scale,
+                 block_tables, seq_lens, block_size, max_seq_len, alibi_slopes,
+                 kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
+                cond=(head_size == HEAD_SIZES[0]
+                      and block_size == BLOCK_SIZES[0]))
 
     elif version in ("v2", "rocm"):
         if current_platform.is_rocm() and version == "rocm":
@@ -263,14 +259,13 @@ def test_paged_attention(
             v_scale,
         )
 
-        opcheck(
-            torch.ops._C.paged_attention_v2,
-            (output, exp_sums, max_logits, tmp_output, query, key_cache,
-             value_cache, num_kv_heads, scale, block_tables, seq_lens,
-             block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
-             k_scale, v_scale, 0, 0, 0, 64, 0, num_threads),
-            cond=(head_size == HEAD_SIZES[0]
-                  and block_size == BLOCK_SIZES[0]))
+        opcheck(torch.ops._C.paged_attention_v2,
+                (output, exp_sums, max_logits, tmp_output, query,
+                 key_cache, value_cache, num_kv_heads, scale, block_tables,
+                 seq_lens, block_size, max_seq_len, alibi_slopes,
+                 kv_cache_dtype, k_scale, v_scale, 0, 0, 0, 64, 0),
+                cond=(head_size == HEAD_SIZES[0]
+                      and block_size == BLOCK_SIZES[0]))
 
     else:
         ops.paged_attention_rocm(

tests/kernels/attention/test_blocksparse_attention.py

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@
 # There may not be enough gpu memory due to large NUM_BLOCKS.
 # Reduce NUM_BLOCKS when it happens.
 NUM_BLOCKS = 4321  # Arbitrary values for testing
-PARTITION_SIZE = 512 if not current_platform.is_rocm() else 1024
+PARTITION_SIZE = 512
 DTYPES = [torch.half, torch.bfloat16]
 NUM_GEN_SEQS = [3]  # Arbitrary values for testing
 NUM_PREFILL_SEQS = [3]  # Arbitrary values for testing

vllm/_custom_ops.py

Lines changed: 2 additions & 6 deletions

@@ -64,9 +64,7 @@ def paged_attention_v1(
        seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
        k_scale, v_scale, tp_rank, blocksparse_local_blocks,
        blocksparse_vert_stride, blocksparse_block_size,
-        blocksparse_head_sliding_step,
-        num_threads = 1024 if current_platform.is_rocm() \
-            and not is_navi() else 128)
+        blocksparse_head_sliding_step)
 
 
 def paged_attention_v2(
@@ -98,9 +96,7 @@ def paged_attention_v2(
        num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
        alibi_slopes, kv_cache_dtype, k_scale, v_scale, tp_rank,
        blocksparse_local_blocks, blocksparse_vert_stride,
-        blocksparse_block_size, blocksparse_head_sliding_step,
-        num_threads = 1024 if current_platform.is_rocm() \
-            and not is_navi() else 128)
+        blocksparse_block_size, blocksparse_head_sliding_step)
 
 
 def paged_attention_rocm(

vllm/attention/backends/rocm_flash_attn.py

Lines changed: 5 additions & 4 deletions

@@ -766,12 +766,13 @@ def forward(
                     query.dtype,
                     seq_lens,
                     make_attn_mask=causal_mask)  # type: ignore
+                use_fp8_scales = (layer._q_scale and layer._k_scale
+                                  and layer._v_scale and layer._prob_scale
+                                  and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN)
                 full_scales = (
                     layer._q_scale.item(), layer._k_scale.item(),
-                    layer._v_scale.item(), layer._prob_scale.item()) if (
-                        layer._out_scale and layer._q_scale
-                        and layer._prob_scale
-                        and envs.VLLM_USE_ROCM_FP8_FLASH_ATTN) else None
+                    layer._v_scale.item(),
+                    layer._prob_scale.item()) if use_fp8_scales else None
                 self.triton_attn_func(
                     query,
                     key,

vllm/attention/ops/paged_attn.py

Lines changed: 1 addition & 3 deletions

@@ -6,15 +6,13 @@
 import torch
 
 from vllm import _custom_ops as ops
-from vllm.platforms import current_platform
 from vllm.triton_utils import HAS_TRITON
-from vllm.utils import is_navi
 
 if HAS_TRITON:
     from vllm.attention.ops.prefix_prefill import context_attention_fwd
 
 # Should be the same as PARTITION_SIZE in `paged_attention_v2_launcher`.
-_PARTITION_SIZE = 512 if not current_platform.is_rocm() or is_navi() else 1024
+_PARTITION_SIZE = 512
 
 
 @dataclass
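
With the ROCm/Navi special case removed, _PARTITION_SIZE is a single constant again. As a rough illustration of what it controls (the helper below is hypothetical and not part of this diff), the v2 kernel's intermediate buffers are typically sized by the ceiling-division partition count the constant implies:

_PARTITION_SIZE = 512  # mirrors PARTITION_SIZE in paged_attention_v2_launcher

def num_v2_partitions(max_seq_len: int) -> int:
    # Hypothetical helper: split the longest sequence into fixed-size
    # partitions, rounding up, e.g. 4096 tokens -> 8, 4097 tokens -> 9.
    return (max_seq_len + _PARTITION_SIZE - 1) // _PARTITION_SIZE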
