
Commit 23ff415

yzhou103 and valarLip authored
Fused qk rope cat and cache mla (ROCm#1380)
* first version
* opt to vec and add perf compared with triton
* opt
* add fused_qk_rope_concat_and_cache opt kernel and opt concat_and_cache
* use buffer_o.template set_raw() with bf16 output, as set() will result in 2 buffer_store_dwordx2
* fix lint error
* refactor interface and fix error when is_neox=false
* fix rot_dim!=64 and is_nope_first=false
* fix error when input is not contiguous

Co-authored-by: Lingpeng Jin <103567126+valarLip@users.noreply.github.com>
1 parent: 8910746; commit: 23ff415

File tree: 6 files changed, +1649 -95 lines


aiter/ops/cache.py

Lines changed: 19 additions & 0 deletions
@@ -115,3 +115,22 @@ def cp_gather_indexer_k_quant_cache(
     block_table: Tensor,
     cu_seq_lens: Tensor,
 ) -> None: ...
+
+
+@compile_ops("module_cache")
+def fused_qk_rope_concat_and_cache_mla(
+    q_nope: Tensor,
+    q_pe: Tensor,
+    kv_c: Tensor,
+    k_pe: Tensor,  # key tensor
+    kv_cache: Tensor,
+    q_out: Tensor,
+    slot_mapping: Tensor,
+    k_scale: Tensor,
+    q_scale: Tensor,
+    positions: Tensor,
+    cos_cache: Tensor,
+    sin_cache: Tensor,
+    is_neox: bool,
+    is_nope_first: bool,
+) -> None: ...
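
For orientation, a minimal call sketch of the new op. The shapes follow the comments in csrc/include/cache.h below; the toy sizes, bf16 dtype, placeholder scales, and the choice qk_lora_rank == kv_lora_rank are illustrative assumptions, not part of this commit.

import torch
from aiter.ops.cache import fused_qk_rope_concat_and_cache_mla

num_tokens, num_heads = 4, 16
kv_lora_rank, pe_dim = 512, 64            # assume qk_lora_rank == kv_lora_rank
num_blocks, block_size, max_pos = 8, 16, 4096
dev, dt = "cuda", torch.bfloat16

q_nope = torch.randn(num_tokens, num_heads, kv_lora_rank, device=dev, dtype=dt)
q_pe   = torch.randn(num_tokens, num_heads, pe_dim, device=dev, dtype=dt)
kv_c   = torch.randn(num_tokens, kv_lora_rank, device=dev, dtype=dt)
k_pe   = torch.randn(num_tokens, pe_dim, device=dev, dtype=dt)

# Outputs: q_out receives the concatenated/rotated query; kv_cache is the paged MLA cache.
kv_cache = torch.zeros(num_blocks, block_size, kv_lora_rank + pe_dim, device=dev, dtype=dt)
q_out    = torch.empty(num_tokens, num_heads, kv_lora_rank + pe_dim, device=dev, dtype=dt)

slot_mapping = torch.arange(num_tokens, device=dev, dtype=torch.int64)  # flat slot per token
positions    = torch.arange(num_tokens, device=dev, dtype=torch.int64)
cos_cache    = torch.randn(max_pos, pe_dim // 2, device=dev, dtype=dt)
sin_cache    = torch.randn(max_pos, pe_dim // 2, device=dev, dtype=dt)
k_scale = torch.ones(1, device=dev, dtype=torch.float32)  # placeholder scales; the fp8
q_scale = torch.ones(1, device=dev, dtype=torch.float32)  # cache paths would use real ones

fused_qk_rope_concat_and_cache_mla(
    q_nope, q_pe, kv_c, k_pe, kv_cache, q_out, slot_mapping,
    k_scale, q_scale, positions, cos_cache, sin_cache,
    is_neox=True, is_nope_first=True,
)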

csrc/include/cache.h

Lines changed: 18 additions & 0 deletions
@@ -85,4 +85,22 @@ void cp_gather_indexer_k_quant_cache(
     torch::Tensor& dst_scale,         // [num_tokens, head_dim / quant_block_size * 4]
     const torch::Tensor& block_table, // [batch_size, num_blocks]
     const torch::Tensor& cu_seq_lens); // [batch_size + 1]
+
+void fused_qk_rope_concat_and_cache_mla(
+    torch::Tensor& q_nope,       // [num_tokens, num_heads, qk_lora_rank]
+    torch::Tensor& q_pe,         // [num_tokens, num_heads, pe_dim]
+    torch::Tensor& kv_c,         // [num_tokens, kv_lora_rank]
+    torch::Tensor& k_pe,         // [num_tokens, pe_dim]
+    torch::Tensor& kv_cache,     // [num_blocks, block_size, (kv_lora_rank +
+                                 // pe_dim)]
+    torch::Tensor& q_out,        // [num_tokens, num_heads, qk_lora_rank+pe_dim]
+    torch::Tensor& slot_mapping, // [num_tokens] or [num_actual_tokens]
+    torch::Tensor& k_scale,
+    torch::Tensor& q_scale,
+    torch::Tensor& positions,    // [num_tokens]
+    torch::Tensor& cos_cache,    // [max_positions, pe_dim//2]
+    torch::Tensor& sin_cache,    // [max_positions, pe_dim//2]
+    bool is_neox,
+    bool is_nope_first);
+
 } // namespace aiter
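
To make the intended semantics concrete, here is a rough, unfused Python reference of what a kernel with this signature is expected to compute. This is my reading of the argument names and shape comments, not code from the commit: the scales are ignored (the "auto" cache-dtype path), and the is_nope_first concatenation order and the two RoPE variants are assumptions.

import torch

def reference_qk_rope_concat_and_cache_mla(
    q_nope, q_pe, kv_c, k_pe, kv_cache, q_out,
    slot_mapping, positions, cos_cache, sin_cache,
    is_neox: bool, is_nope_first: bool,
):
    cos = cos_cache[positions]  # [num_tokens, pe_dim // 2]
    sin = sin_cache[positions]

    def rope(x):  # x: [..., pe_dim]; broadcast cos/sin over the head dim if present
        c = cos if x.dim() == cos.dim() else cos.unsqueeze(1)
        s = sin if x.dim() == sin.dim() else sin.unsqueeze(1)
        if is_neox:  # GPT-NeoX style: rotate the two contiguous halves
            x1, x2 = x.chunk(2, dim=-1)
            return torch.cat((x1 * c - x2 * s, x2 * c + x1 * s), dim=-1)
        out = torch.empty_like(x)  # GPT-J style: rotate interleaved even/odd pairs
        x1, x2 = x[..., 0::2], x[..., 1::2]
        out[..., 0::2] = x1 * c - x2 * s
        out[..., 1::2] = x2 * c + x1 * s
        return out

    q_rot, k_rot = rope(q_pe), rope(k_pe)

    # Concatenate the nope and rope parts of q in the order picked by is_nope_first.
    parts = (q_nope, q_rot) if is_nope_first else (q_rot, q_nope)
    q_out.copy_(torch.cat(parts, dim=-1))

    # Scatter compressed KV plus rotated k_pe into the paged cache by flat slot id.
    flat = kv_cache.view(-1, kv_cache.shape[-1])  # [num_blocks * block_size, ...]
    flat[slot_mapping] = torch.cat((kv_c, k_rot), dim=-1)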

csrc/include/quant_utils.cuh

Lines changed: 64 additions & 1 deletion
@@ -1,6 +1,6 @@
 #pragma once
 /*
- * Copyright © Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (C) Advanced Micro Devices, Inc. All rights reserved.
  * Copyright (C) 2024-2025, The vLLM team.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
@@ -783,6 +783,69 @@ __inline__ __device__ Tout scaled_convert(const Tin& x, const float scale)
     } \
 }

+#define DISPATCH_BY_KV_CACHE_QUERY_DTYPE(SRC_DTYPE, KV_DTYPE, QUERY_DTYPE, FN) \
+    if(KV_DTYPE == "auto" && QUERY_DTYPE == "auto") \
+    { \
+        if(SRC_DTYPE == at::ScalarType::Float) \
+        { \
+            FN(float, float, float, vllm::Fp8KVCacheDataType::kAuto, vllm::Fp8KVCacheDataType::kAuto); \
+        } \
+        else if(SRC_DTYPE == at::ScalarType::Half) \
+        { \
+            FN(ck_tile::fp16_t, ck_tile::fp16_t, ck_tile::fp16_t, vllm::Fp8KVCacheDataType::kAuto, vllm::Fp8KVCacheDataType::kAuto); \
+        } \
+        else if(SRC_DTYPE == at::ScalarType::BFloat16) \
+        { \
+            FN(ck_tile::bf16_t, ck_tile::bf16_t, ck_tile::bf16_t, vllm::Fp8KVCacheDataType::kAuto, vllm::Fp8KVCacheDataType::kAuto); \
+        } \
+        else \
+        { \
+            TORCH_CHECK(false, "Unsupported input type of kv cache: ", SRC_DTYPE); \
+        } \
+    } \
+    else if((KV_DTYPE == "fp8" || KV_DTYPE == "fp8_e4m3") && (QUERY_DTYPE == "auto")) \
+    { \
+        if(SRC_DTYPE == at::ScalarType::Float) \
+        { \
+            FN(float, ck_tile::fp8_t, float, vllm::Fp8KVCacheDataType::kFp8E4M3, vllm::Fp8KVCacheDataType::kAuto); \
+        } \
+        else if(SRC_DTYPE == at::ScalarType::Half) \
+        { \
+            FN(ck_tile::fp16_t, ck_tile::fp8_t, ck_tile::fp16_t, vllm::Fp8KVCacheDataType::kFp8E4M3, vllm::Fp8KVCacheDataType::kAuto); \
+        } \
+        else if(SRC_DTYPE == at::ScalarType::BFloat16) \
+        { \
+            FN(ck_tile::bf16_t, ck_tile::fp8_t, ck_tile::bf16_t, vllm::Fp8KVCacheDataType::kFp8E4M3, vllm::Fp8KVCacheDataType::kAuto); \
+        } \
+        else \
+        { \
+            TORCH_CHECK(false, "Unsupported input type of kv cache: ", SRC_DTYPE); \
+        } \
+    } \
+    else if((KV_DTYPE == "fp8" || KV_DTYPE == "fp8_e4m3") && (QUERY_DTYPE == "fp8" || QUERY_DTYPE == "fp8_e4m3")) \
+    { \
+        if(SRC_DTYPE == at::ScalarType::Float) \
+        { \
+            FN(float, ck_tile::fp8_t, ck_tile::fp8_t, vllm::Fp8KVCacheDataType::kFp8E4M3, vllm::Fp8KVCacheDataType::kFp8E4M3); \
+        } \
+        else if(SRC_DTYPE == at::ScalarType::Half) \
+        { \
+            FN(ck_tile::fp16_t, ck_tile::fp8_t, ck_tile::fp8_t, vllm::Fp8KVCacheDataType::kFp8E4M3, vllm::Fp8KVCacheDataType::kFp8E4M3); \
+        } \
+        else if(SRC_DTYPE == at::ScalarType::BFloat16) \
+        { \
+            FN(ck_tile::bf16_t, ck_tile::fp8_t, ck_tile::fp8_t, vllm::Fp8KVCacheDataType::kFp8E4M3, vllm::Fp8KVCacheDataType::kFp8E4M3); \
+        } \
+        else \
+        { \
+            TORCH_CHECK(false, "Unsupported input type of kv cache: ", SRC_DTYPE); \
+        } \
+    } \
+    else \
+    { \
+        TORCH_CHECK(false, "Unsupported data type of kv cache: ", KV_DTYPE, " Query type: ", QUERY_DTYPE); \
+    }
 } // namespace fp8
 #endif // USE_ROCM
 } // namespace vllm
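
Read as a truth table, the new macro picks the (source, cache, query) element types plus two Fp8KVCacheDataType enums and hands them to FN. A purely illustrative Python rendering of the same mapping, with ck_tile/vllm names abbreviated (not code from the commit):

# (kv_cache_dtype, query_dtype, src_torch_dtype) -> (SRC_T, CACHE_T, QUERY_T, KV_ENUM, Q_ENUM)
DISPATCH = {
    ("auto", "auto", "float32"):  ("float",  "float",  "float",  "kAuto",    "kAuto"),
    ("auto", "auto", "float16"):  ("fp16_t", "fp16_t", "fp16_t", "kAuto",    "kAuto"),
    ("auto", "auto", "bfloat16"): ("bf16_t", "bf16_t", "bf16_t", "kAuto",    "kAuto"),
    ("fp8",  "auto", "float32"):  ("float",  "fp8_t",  "float",  "kFp8E4M3", "kAuto"),
    ("fp8",  "auto", "float16"):  ("fp16_t", "fp8_t",  "fp16_t", "kFp8E4M3", "kAuto"),
    ("fp8",  "auto", "bfloat16"): ("bf16_t", "fp8_t",  "bf16_t", "kFp8E4M3", "kAuto"),
    ("fp8",  "fp8",  "float32"):  ("float",  "fp8_t",  "fp8_t",  "kFp8E4M3", "kFp8E4M3"),
    ("fp8",  "fp8",  "float16"):  ("fp16_t", "fp8_t",  "fp8_t",  "kFp8E4M3", "kFp8E4M3"),
    ("fp8",  "fp8",  "bfloat16"): ("bf16_t", "fp8_t",  "fp8_t",  "kFp8E4M3", "kFp8E4M3"),
}
# "fp8_e4m3" is accepted as an alias for "fp8"; any other combination fails via TORCH_CHECK.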

csrc/include/rocm_ops.hpp

Lines changed: 30 additions & 1 deletion
@@ -311,7 +311,36 @@ namespace py = pybind11;
           py::arg("dst_k"), \
           py::arg("dst_scale"), \
           py::arg("block_table"), \
-          py::arg("cu_seq_lens"));
+          py::arg("cu_seq_lens")); \
+    m.def("fused_qk_rope_concat_and_cache_mla", \
+          &aiter::fused_qk_rope_concat_and_cache_mla, \
+          "fused_qk_rope_concat_and_cache_mla(" \
+          "    Tensor q_nope, Tensor q_pe," \
+          "    Tensor kv_c, Tensor k_pe," \
+          "    Tensor! kv_cache," \
+          "    Tensor! q_out," \
+          "    Tensor slot_mapping," \
+          "    Tensor k_scale," \
+          "    Tensor q_scale," \
+          "    Tensor positions," \
+          "    Tensor cos_cache," \
+          "    Tensor sin_cache," \
+          "    bool is_neox," \
+          "    bool is_nope_first) -> ()", \
+          py::arg("q_nope"), \
+          py::arg("q_pe"), \
+          py::arg("kv_c"), \
+          py::arg("k_pe"), \
+          py::arg("kv_cache"), \
+          py::arg("q_out"), \
+          py::arg("slot_mapping"), \
+          py::arg("k_scale"), \
+          py::arg("q_scale"), \
+          py::arg("positions"), \
+          py::arg("cos_cache"), \
+          py::arg("sin_cache"), \
+          py::arg("is_neox"), \
+          py::arg("is_nope_first"));

 #define CUSTOM_ALL_REDUCE_PYBIND \
     m.def("init_custom_ar", \
