Commit 5800181

Rebase the ck_tile_gemm branch to rocm/355_wip
1 parent f4a4bdb commit 5800181

14 files changed: +146 -105 lines changed

csrc/layernorm_kernels.cu
Lines changed: 8 additions & 7 deletions

@@ -51,8 +51,8 @@ __global__ void rms_norm_kernel(
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
 fused_add_rms_norm_kernel(
-    scalar_t* __restrict__ output,       // [..., hidden_size]
-    const scalar_t* __restrict__ input,  // [..., hidden_size]
+    scalar_t* __restrict__ output,       // [..., hidden_size]
+    const scalar_t* __restrict__ input,  // [..., hidden_size]
     const int64_t input_stride,
     scalar_t* __restrict__ residual_out,    // [..., hidden_size]
     const scalar_t* __restrict__ residual,  // [..., hidden_size]
@@ -114,8 +114,8 @@ fused_add_rms_norm_kernel(
 template <typename scalar_t, int width>
 __global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
 fused_add_rms_norm_kernel(
-    scalar_t* __restrict__ output,       // [..., hidden_size]
-    const scalar_t* __restrict__ input,  // [..., hidden_size]
+    scalar_t* __restrict__ output,       // [..., hidden_size]
+    const scalar_t* __restrict__ input,  // [..., hidden_size]
     const int64_t input_stride,
     scalar_t* __restrict__ residual_out,    // [..., hidden_size]
     const scalar_t* __restrict__ residual,  // [..., hidden_size]
@@ -221,9 +221,10 @@ void fused_add_rms_norm(torch::Tensor& out,  // [..., hidden_size]
   constexpr int req_alignment_bytes =
       vector_width * 2;  // vector_width * sizeof(bfloat16 or float16) (float32
                          // falls back to non-vectorized version anyway)
-  bool ptrs_are_aligned = out_ptr % 16 == 0 && inp_ptr % req_alignment_bytes == 0 &&
-                          res_out_ptr % 16 == 0 && res_ptr % req_alignment_bytes == 0 &&
-                          wt_ptr % req_alignment_bytes == 0;
+  bool ptrs_are_aligned =
+      out_ptr % 16 == 0 && inp_ptr % req_alignment_bytes == 0 &&
+      res_out_ptr % 16 == 0 && res_ptr % req_alignment_bytes == 0 &&
+      wt_ptr % req_alignment_bytes == 0;
   bool offsets_are_multiple_of_vector_width =
       hidden_size % vector_width == 0 && input_stride % vector_width == 0;
   if (ptrs_are_aligned && offsets_are_multiple_of_vector_width) {
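
The host-side check above gates the vectorized fused_add_rms_norm path: every pointer must satisfy the 16-byte or vector-width alignment, and both hidden_size and input_stride must be divisible by the vector width, otherwise the launcher falls back to the scalar kernel (as the float32 comment notes). A rough Python mirror of that gating, purely illustrative and not part of this commit (the helper name and the default vector_width are made up; the arguments are torch.Tensors):

    # Illustrative mirror of the C++ gating above; not part of the commit.
    def can_use_vectorized_kernel(out, inp, residual_out, residual, weight,
                                  vector_width: int = 8) -> bool:
        req_alignment_bytes = vector_width * 2  # 2 bytes per fp16/bf16 element
        ptrs_are_aligned = (out.data_ptr() % 16 == 0
                            and inp.data_ptr() % req_alignment_bytes == 0
                            and residual_out.data_ptr() % 16 == 0
                            and residual.data_ptr() % req_alignment_bytes == 0
                            and weight.data_ptr() % req_alignment_bytes == 0)
        hidden_size = inp.shape[-1]
        input_stride = inp.stride(0) if inp.dim() > 1 else hidden_size
        offsets_ok = (hidden_size % vector_width == 0
                      and input_stride % vector_width == 0)
        return ptrs_are_aligned and offsets_ok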

csrc/layernorm_quant_kernels.cu
Lines changed: 6 additions & 6 deletions

@@ -64,8 +64,8 @@ __global__ void rms_norm_static_fp8_quant_kernel(
 template <typename scalar_t, int width, typename fp8_type>
 __global__ std::enable_if_t<(width > 0) && _typeConvert<scalar_t>::exists>
 fused_add_rms_norm_static_fp8_quant_kernel(
-    fp8_type* __restrict__ out,    // [..., hidden_size]
-    scalar_t* __restrict__ input,  // [..., hidden_size]
+    fp8_type* __restrict__ out,    // [..., hidden_size]
+    scalar_t* __restrict__ input,  // [..., hidden_size]
     const int input_stride,
     scalar_t* __restrict__ residual_out,  // [..., hidden_size]
     scalar_t* __restrict__ residual,      // [..., hidden_size]
@@ -132,8 +132,8 @@ fused_add_rms_norm_static_fp8_quant_kernel(
 template <typename scalar_t, int width, typename fp8_type>
 __global__ std::enable_if_t<(width == 0) || !_typeConvert<scalar_t>::exists>
 fused_add_rms_norm_static_fp8_quant_kernel(
-    fp8_type* __restrict__ out,    // [..., hidden_size]
-    scalar_t* __restrict__ input,  // [..., hidden_size]
+    fp8_type* __restrict__ out,    // [..., hidden_size]
+    scalar_t* __restrict__ input,  // [..., hidden_size]
     const int input_stride,
     scalar_t* __restrict__ residual_out,  // [..., hidden_size]
     scalar_t* __restrict__ residual,      // [..., hidden_size]
@@ -210,8 +210,8 @@ void rms_norm_static_fp8_quant(torch::Tensor& out,  // [..., hidden_size]
               width, fp8_t>                                           \
           <<<grid, block, 0, stream>>>(                               \
               out.data_ptr<fp8_t>(), input.data_ptr<scalar_t>(),      \
-              input_stride, residual_out.data_ptr<scalar_t>(),        \
-              residual.data_ptr<scalar_t>(),                          \
+              input_stride, residual_out.data_ptr<scalar_t>(),        \
+              residual.data_ptr<scalar_t>(),                          \
               weight.data_ptr<scalar_t>(), scale.data_ptr<float>(),   \
               epsilon, num_tokens, hidden_size);                      \
         });                                                           \

tests/kernels/core/test_layernorm.py
Lines changed: 2 additions & 1 deletion

@@ -130,7 +130,8 @@ def test_fused_rms_norm_quant(
     out_unfused = torch.empty_like(x_unfused)
     torch.ops._C.fused_add_rms_norm(out_unfused, x_unfused, residual_out,
                                     residual, weight, 1e-6)
-    torch.ops._C.static_scaled_fp8_quant(out_quant, out_unfused.contiguous(),
+    torch.ops._C.static_scaled_fp8_quant(out_quant,
+                                         out_unfused.contiguous(),
                                          quant_scale_t)

    torch.cuda.synchronize()
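
For reference, the unfused path the test compares against performs a residual add plus RMSNorm, then a static FP8 quantization of the normalized output. A hedged pure-PyTorch sketch of that computation (the eps value, fp8 dtype, and clamping behaviour are assumptions, not read from the kernels):

    # Rough reference for fused_add_rms_norm followed by static_scaled_fp8_quant.
    import torch

    def unfused_reference(x, residual, weight, scale, eps=1e-6,
                          fp8_dtype=torch.float8_e4m3fn):
        x = x + residual                 # residual add; also the new residual
        residual_out = x
        var = x.float().pow(2).mean(dim=-1, keepdim=True)
        normed = (x.float() * torch.rsqrt(var + eps)).to(x.dtype) * weight
        info = torch.finfo(fp8_dtype)
        quant = (normed.float() / scale).clamp(info.min, info.max).to(fp8_dtype)
        return quant, residual_out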

vllm/attention/ops/chunked_prefill_paged_decode.py
Lines changed: 41 additions & 41 deletions

@@ -25,47 +25,47 @@ def cdiv_fn(x, y):


 @triton.jit
 def kernel_paged_attention_2d(
-        output_ptr,  # [num_tokens, num_query_heads, head_size]
-        query_ptr,  # [num_tokens, num_query_heads, head_size]
-        key_cache_ptr,  # [num_blks, num_kv_heads, head_size // x, blk_size, x]
-        value_cache_ptr,  # [num_blks, num_kv_heads, head_size, blk_size]
-        sink_ptr,  # [num_query_heads]
-        block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
-        seq_lens_ptr,  # [num_seqs]
-        alibi_slopes_ptr,  # [num_query_heads]
-        scale,  # float32
-        k_scale,  # float32
-        v_scale,  # float32
-        out_scale,
-        num_query_heads: tl.constexpr,  # int
-        num_queries_per_kv: tl.constexpr,  # int
-        num_queries_per_kv_padded: tl.constexpr,  # int
-        block_table_stride: tl.int64,  # int
-        query_stride_0: tl.int64,  # int
-        query_stride_1: tl.int64,  # int, should be equal to head_size
-        output_stride_0: tl.int64,  # int
-        output_stride_1: tl.int64,  # int, should be equal to head_size
-        BLOCK_SIZE: tl.constexpr,  # int
-        HEAD_SIZE: tl.constexpr,  # int
-        HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
-        USE_ALIBI_SLOPES: tl.constexpr,  # bool
-        SLIDING_WINDOW: tl.constexpr,  # int
-        x: tl.constexpr,  # int
-        stride_k_cache_0: tl.int64,  # int
-        stride_k_cache_1: tl.int64,  # int
-        stride_k_cache_2: tl.int64,  # int
-        stride_k_cache_3: tl.int64,  # int
-        stride_k_cache_4: tl.int64,  # int
-        stride_v_cache_0: tl.int64,  # int
-        stride_v_cache_1: tl.int64,  # int
-        stride_v_cache_2: tl.int64,  # int
-        stride_v_cache_3: tl.int64,  # int
-        filter_by_query_len: tl.constexpr,  # bool
-        query_start_len_ptr,  # [num_seqs+1]
-        USE_FP8: tl.constexpr,
-        USE_SINKS: tl.constexpr,  # bool
-        FP8_MIN: tl.constexpr = float8_info.min,
-        FP8_MAX: tl.constexpr = float8_info.max,
+        output_ptr,  # [num_tokens, num_query_heads, head_size]
+        query_ptr,  # [num_tokens, num_query_heads, head_size]
+        key_cache_ptr,  # [num_blks, num_kv_heads, head_size // x, blk_size, x]
+        value_cache_ptr,  # [num_blks, num_kv_heads, head_size, blk_size]
+        sink_ptr,  # [num_query_heads]
+        block_tables_ptr,  # [num_seqs, max_num_blocks_per_seq]
+        seq_lens_ptr,  # [num_seqs]
+        alibi_slopes_ptr,  # [num_query_heads]
+        scale,  # float32
+        k_scale,  # float32
+        v_scale,  # float32
+        out_scale,
+        num_query_heads: tl.constexpr,  # int
+        num_queries_per_kv: tl.constexpr,  # int
+        num_queries_per_kv_padded: tl.constexpr,  # int
+        block_table_stride: tl.int64,  # int
+        query_stride_0: tl.int64,  # int
+        query_stride_1: tl.int64,  # int, should be equal to head_size
+        output_stride_0: tl.int64,  # int
+        output_stride_1: tl.int64,  # int, should be equal to head_size
+        BLOCK_SIZE: tl.constexpr,  # int
+        HEAD_SIZE: tl.constexpr,  # int
+        HEAD_SIZE_PADDED: tl.constexpr,  # int, must be power of 2
+        USE_ALIBI_SLOPES: tl.constexpr,  # bool
+        SLIDING_WINDOW: tl.constexpr,  # int
+        x: tl.constexpr,  # int
+        stride_k_cache_0: tl.int64,  # int
+        stride_k_cache_1: tl.int64,  # int
+        stride_k_cache_2: tl.int64,  # int
+        stride_k_cache_3: tl.int64,  # int
+        stride_k_cache_4: tl.int64,  # int
+        stride_v_cache_0: tl.int64,  # int
+        stride_v_cache_1: tl.int64,  # int
+        stride_v_cache_2: tl.int64,  # int
+        stride_v_cache_3: tl.int64,  # int
+        filter_by_query_len: tl.constexpr,  # bool
+        query_start_len_ptr,  # [num_seqs+1]
+        USE_FP8: tl.constexpr,
+        USE_SINKS: tl.constexpr,  # bool
+        FP8_MIN: tl.constexpr = float8_info.min,
+        FP8_MAX: tl.constexpr = float8_info.max,
 ):
     seq_idx = tl.program_id(0)
     kv_head_idx = tl.program_id(1)
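
This hunk only re-indents the kernel_paged_attention_2d parameter list; no parameter is added or removed. The comments in the signature document the paged KV-cache layouts, and the stride_k_cache_* / stride_v_cache_* arguments are simply the per-dimension strides of those cache tensors. An illustrative snippet (the shapes are made-up values that match the commented layouts):

    import torch

    num_blks, num_kv_heads, head_size, blk_size, x = 128, 8, 128, 16, 8
    key_cache = torch.empty(num_blks, num_kv_heads, head_size // x, blk_size, x)
    value_cache = torch.empty(num_blks, num_kv_heads, head_size, blk_size)

    stride_k_cache = [key_cache.stride(i) for i in range(5)]    # stride_k_cache_0..4
    stride_v_cache = [value_cache.stride(i) for i in range(4)]  # stride_v_cache_0..3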

vllm/attention/ops/triton_unified_attention.py
Lines changed: 0 additions & 1 deletion

@@ -12,7 +12,6 @@
 from vllm.logger import init_logger
 from vllm.platforms import current_platform
 from vllm.triton_utils import tl, triton
-from vllm.platforms import current_platform

 logger = init_logger(__name__)
 float8_info = torch.finfo(current_platform.fp8_dtype())

vllm/envs.py
Lines changed: 1 addition & 1 deletion

@@ -754,7 +754,7 @@ def get_vllm_port() -> Optional[int]:
     "VLLM_ROCM_USE_AITER_CK_TILE_LINEAR":
     lambda: (os.getenv("VLLM_ROCM_USE_AITER_CK_TILE_LINEAR", "True").lower() in
              ("true", "1")),
-
+
     # Whether to use aiter moe ops.
     # By default is enabled.
     "VLLM_ROCM_USE_AITER_MOE":

vllm/model_executor/layers/quantization/fp8.py
Lines changed: 5 additions & 5 deletions

@@ -205,11 +205,11 @@ def __init__(self, quant_config: Fp8Config):
                                           and envs.VLLM_ROCM_USE_AITER
                                           and envs.VLLM_ROCM_USE_AITER_LINEAR
                                           and current_platform.is_fp8_fnuz())
-        self.use_ck_tile_and_is_supported = (current_platform.is_rocm()
-                                             and envs.VLLM_ROCM_USE_AITER
-                                             and envs.VLLM_ROCM_USE_AITER_CK_TILE_LINEAR
-                                             and current_platform.is_fp8_fnuz())
-
+        self.use_ck_tile_and_is_supported = (
+            current_platform.is_rocm() and envs.VLLM_ROCM_USE_AITER
+            and envs.VLLM_ROCM_USE_AITER_CK_TILE_LINEAR
+            and current_platform.is_fp8_fnuz())
+
         self.block_quant = self.quant_config.weight_block_size is not None
         self.act_q_static = self.quant_config.activation_scheme == "static"
         # Use per-token quantization for better perf if dynamic and cutlass
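
The use_ck_tile_and_is_supported flag mirrors the existing aiter gating: the ck_tile linear path is taken only on ROCm, when both VLLM_ROCM_USE_AITER and the new VLLM_ROCM_USE_AITER_CK_TILE_LINEAR environment flag are enabled, and when the platform uses fp8 fnuz. A minimal sketch of how the flag is parsed, mirroring the lambda added in vllm/envs.py (the standalone variable name below is illustrative):

    import os

    # Defaults to enabled; "0" or "false" (any case) disables the ck_tile path.
    use_aiter_ck_tile_linear = (
        os.getenv("VLLM_ROCM_USE_AITER_CK_TILE_LINEAR", "True").lower()
        in ("true", "1"))

Setting VLLM_ROCM_USE_AITER_CK_TILE_LINEAR=0 therefore disables only the ck_tile linear path; the other aiter paths keep their own flags.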

vllm/model_executor/layers/quantization/utils/fp8_utils.py
Lines changed: 8 additions & 2 deletions

@@ -90,6 +90,7 @@ def rocm_aiter_gemm_w8a8_blockscale_fake(

 aiter_per1x128_quant = get_hip_quant(rocm_aiter.QuantType.per_1x128)

+
 def rocm_aiter_ck_tile_gemm_w8a8_blockscale_impl(
     A: torch.Tensor,
     B: torch.Tensor,
@@ -100,7 +101,11 @@ def rocm_aiter_ck_tile_gemm_w8a8_blockscale_impl(
 ) -> torch.Tensor:
     import aiter as rocm_aiter

-    return rocm_aiter.gemm_a8w8_blockscale_ck_tile(A, B, As, Bs, dtype=output_dtype)
+    return rocm_aiter.gemm_a8w8_blockscale_ck_tile(A,
+                                                   B,
+                                                   As,
+                                                   Bs,
+                                                   dtype=output_dtype)


 def rocm_aiter_ck_tile_gemm_w8a8_blockscale_fake(
@@ -136,7 +141,8 @@ def rocm_aiter_ck_tile_gemm_w8a8_blockscale_fake(


 def dispatch_w8a8_blockscale_func(
-    use_cutlass: bool, use_aiter_and_is_supported: bool, use_ck_tile_and_is_supported: bool
+    use_cutlass: bool, use_aiter_and_is_supported: bool,
+    use_ck_tile_and_is_supported: bool
 ) -> Callable[[
     torch.Tensor,
     torch.Tensor,
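
dispatch_w8a8_blockscale_func returns the blockscale GEMM callable to use for a given configuration; this diff only re-wraps its signature. A hypothetical sketch of the selection the three flags imply, where rocm_aiter_ck_tile_gemm_w8a8_blockscale_impl is the helper defined earlier in this file and the other branch targets plus the priority order are assumptions:

    # Assumed dispatch order; only the ck_tile helper name comes from this diff.
    from typing import Callable

    def dispatch_w8a8_blockscale_func(
            use_cutlass: bool, use_aiter_and_is_supported: bool,
            use_ck_tile_and_is_supported: bool) -> Callable:
        if use_cutlass:
            return cutlass_scaled_mm_blockscale          # assumed helper name
        if use_ck_tile_and_is_supported:
            return rocm_aiter_ck_tile_gemm_w8a8_blockscale_impl
        if use_aiter_and_is_supported:
            return rocm_aiter_gemm_w8a8_blockscale_impl  # impl for the *_fake above
        return w8a8_block_fp8_matmul                     # assumed fallback name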

vllm/model_executor/layers/quantization/utils/mxfp4_utils.py
Lines changed: 4 additions & 3 deletions

@@ -34,8 +34,8 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
     elif current_platform.is_rocm():
         from triton_kernels.target_info import is_hip
         from triton_kernels.tensor_details.layout import (
-            BlackwellMXScaleLayout, HopperMXScaleLayout, HopperMXValueLayout,
-            GFX950MXScaleLayout)
+            BlackwellMXScaleLayout, GFX950MXScaleLayout, HopperMXScaleLayout,
+            HopperMXValueLayout)
         value_layout = StridedLayout
         scale_layout = StridedLayout
         if not is_hip():
@@ -53,7 +53,8 @@ def _swizzle_mxfp4(quant_tensor, scale, num_warps):
     else:
         """ weight swizzle for mxfp4 moe, used for OAI mxfp4 kernel
         """
-        value_layout, value_layout_opts = layout.make_default_matmul_mxfp4_w_layout(
+        value_layout, value_layout_opts = \
+            layout.make_default_matmul_mxfp4_w_layout(
             mx_axis=1)
         scale_layout, scale_layout_opts = (
             layout.make_default_matmul_mxfp4_w_scale_layout(

vllm/model_executor/layers/rotary_embedding/base.py
Lines changed: 8 additions & 3 deletions

@@ -9,7 +9,9 @@
 from vllm.model_executor.custom_op import CustomOp

 from .common import apply_rotary_emb_dispatch, apply_rotary_emb_torch
-from .rocm_aiter_rope_ops import is_rocm_rotary_embedding_enabled, is_rocm_triton_rotary_embedding_enabled
+from .rocm_aiter_rope_ops import (is_rocm_rotary_embedding_enabled,
+                                  is_rocm_triton_rotary_embedding_enabled)
+

 @CustomOp.register("rotary_embedding")
 class RotaryEmbedding(CustomOp):
@@ -36,8 +38,11 @@ def __init__(
         cache = cache.to(dtype)
         self.cos_sin_cache: torch.Tensor
         self.register_buffer("cos_sin_cache", cache, persistent=False)
-        self.is_rocm_aiter_enabled = is_rocm_rotary_embedding_enabled()
-        self.is_rocm_aiter_triton_enabled = is_rocm_triton_rotary_embedding_enabled()
+        self.is_rocm_aiter_enabled = \
+            is_rocm_rotary_embedding_enabled()
+        self.is_rocm_aiter_triton_enabled = \
+            is_rocm_triton_rotary_embedding_enabled(
+            )

     def _compute_inv_freq(self, base: float) -> torch.Tensor:
         """Compute the inverse frequency."""
