
Commit fc88829

yzh119 and cyx-6 authored
feat: Fused rope fp8 quantize kernel for MLA (#1339)
## 📌 Description

Adds a fused RoPE + FP8 quantization kernel that prepares inputs for the FP8 MLA kernel.

Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/0df758ec9f8409410bac8b60d117374054391c2d/cpp/tensorrt_llm/kernels/mlaKernels.cu#L358

Co-authored-by: Yaxing Cai <[email protected]>
1 parent b7bfd00 commit fc88829

File tree

6 files changed: +505 -5 lines changed


csrc/flashinfer_rope_ops.cu

Lines changed: 8 additions & 0 deletions
```diff
@@ -39,6 +39,12 @@ void apply_rope_pos_ids_cos_sin_cache(at::Tensor q, at::Tensor k, at::Tensor q_r
                                       at::Tensor k_rope, at::Tensor cos_sin_cache,
                                       at::Tensor pos_ids, bool interleave);
 
+void mla_rope_quantize(at::Tensor q_rope_in, at::Tensor k_rope_in, at::Tensor q_nope_in,
+                       at::Tensor k_nope_in, at::Tensor q_rope_out, at::Tensor k_rope_out,
+                       at::Tensor q_nope_out, at::Tensor k_nope_out, at::Tensor cos_sin_cache,
+                       at::Tensor pos_ids, double quant_scale_q, double quant_scale_kv,
+                       bool interleave);
+
 TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   // "Apply RoPE"
   m.def("apply_rope", apply_rope);
@@ -50,4 +56,6 @@ TORCH_LIBRARY_FRAGMENT(TORCH_EXTENSION_NAME, m) {
   m.def("apply_llama31_rope_pos_ids", apply_llama31_rope_pos_ids);
   // "Apply RoPE with positional ids and cosine/sine cache"
   m.def("apply_rope_pos_ids_cos_sin_cache", apply_rope_pos_ids_cos_sin_cache);
+  // "MLA RoPE Quantize"
+  m.def("mla_rope_quantize", mla_rope_quantize);
 }
```

csrc/rope.cu

Lines changed: 95 additions & 0 deletions
```diff
@@ -259,3 +259,98 @@ void apply_llama31_rope_pos_ids(at::Tensor q, at::Tensor k, at::Tensor q_rope, a
     });
   });
 }
+
+void mla_rope_quantize(at::Tensor q_rope_in, at::Tensor k_rope_in, at::Tensor q_nope_in,
+                       at::Tensor k_nope_in, at::Tensor q_rope_out, at::Tensor k_rope_out,
+                       at::Tensor q_nope_out, at::Tensor k_nope_out, at::Tensor cos_sin_cache,
+                       at::Tensor pos_ids, double quant_scale_q, double quant_scale_kv,
+                       bool interleave) {
+  CHECK_LAST_DIM_CONTIGUOUS(q_rope_in);
+  CHECK_LAST_DIM_CONTIGUOUS(k_rope_in);
+  CHECK_LAST_DIM_CONTIGUOUS(q_nope_in);
+  CHECK_LAST_DIM_CONTIGUOUS(k_nope_in);
+  CHECK_LAST_DIM_CONTIGUOUS(q_rope_out);
+  CHECK_LAST_DIM_CONTIGUOUS(k_rope_out);
+  CHECK_LAST_DIM_CONTIGUOUS(q_nope_out);
+  CHECK_LAST_DIM_CONTIGUOUS(k_nope_out);
+  CHECK_INPUT(cos_sin_cache);
+  CHECK_INPUT(pos_ids);
+
+  CHECK_EQ(q_rope_in.size(-1), 64);
+  CHECK_EQ(k_rope_in.size(-1), 64);
+  CHECK_EQ(q_nope_in.size(-1), 512);
+  CHECK_EQ(k_nope_in.size(-1), 512);
+  CHECK_EQ(q_rope_out.size(-1), 64);
+  CHECK_EQ(k_rope_out.size(-1), 64);
+  CHECK_EQ(q_nope_out.size(-1), 512);
+  CHECK_EQ(k_nope_out.size(-1), 512);
+  auto scalar_type_in = q_rope_in.scalar_type();
+  TORCH_CHECK(scalar_type_in == k_rope_in.scalar_type());
+  TORCH_CHECK(scalar_type_in == q_nope_in.scalar_type());
+  TORCH_CHECK(scalar_type_in == k_nope_in.scalar_type());
+  auto quant_type_out = q_rope_out.scalar_type();
+  TORCH_CHECK(quant_type_out == k_rope_out.scalar_type());
+  TORCH_CHECK(quant_type_out == q_nope_out.scalar_type());
+  TORCH_CHECK(quant_type_out == k_nope_out.scalar_type());
+
+  CHECK_DIM(3, q_rope_in);   // q_rope_in: (nnz, H_Q, 64)
+  CHECK_DIM(3, q_nope_in);   // q_nope_in: (nnz, H_Q, 512)
+  CHECK_DIM(2, k_rope_in);   // k_rope_in: (nnz, 64)
+  CHECK_DIM(2, k_nope_in);   // k_nope_in: (nnz, 512)
+  CHECK_DIM(3, q_rope_out);  // q_rope_out: (nnz, H_Q, 64)
+  CHECK_DIM(3, q_nope_out);  // q_nope_out: (nnz, H_Q, 512)
+  CHECK_DIM(2, k_rope_out);  // k_rope_out: (nnz, 64)
+  CHECK_DIM(2, k_nope_out);  // k_nope_out: (nnz, 512)
+  uint32_t nnz = q_rope_in.size(0);
+  CHECK_EQ(q_nope_in.size(0), nnz);
+  CHECK_EQ(k_nope_in.size(0), nnz);
+  CHECK_EQ(q_rope_out.size(0), nnz);
+  CHECK_EQ(k_rope_out.size(0), nnz);
+  CHECK_EQ(q_nope_out.size(0), nnz);
+  CHECK_EQ(k_nope_out.size(0), nnz);
+  uint32_t num_heads = q_rope_in.size(1);
+  CHECK_EQ(q_rope_in.size(1), num_heads);
+  CHECK_EQ(q_nope_in.size(1), num_heads);
+  CHECK_EQ(q_rope_out.size(1), num_heads);
+  CHECK_EQ(q_nope_out.size(1), num_heads);
+
+  const uint32_t q_rope_in_stride_n = q_rope_in.stride(0);
+  const uint32_t q_rope_in_stride_h = q_rope_in.stride(1);
+  const uint32_t q_nope_in_stride_n = q_nope_in.stride(0);
+  const uint32_t q_nope_in_stride_h = q_nope_in.stride(1);
+  const uint32_t q_rope_out_stride_n = q_rope_out.stride(0);
+  const uint32_t q_rope_out_stride_h = q_rope_out.stride(1);
+  const uint32_t q_nope_out_stride_n = q_nope_out.stride(0);
+  const uint32_t q_nope_out_stride_h = q_nope_out.stride(1);
+  const uint32_t k_rope_in_stride = k_rope_in.stride(0);
+  const uint32_t k_nope_in_stride = k_nope_in.stride(0);
+  const uint32_t k_rope_out_stride = k_rope_out.stride(0);
+  const uint32_t k_nope_out_stride = k_nope_out.stride(0);
+
+  const c10::cuda::OptionalCUDAGuard device_guard(q_rope_in.device());
+  auto stream = at::cuda::getCurrentCUDAStream();
+
+  DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP16(scalar_type_in, c_type, [&] {
+    return DISPATCH_PYTORCH_DTYPE_TO_CTYPE_FP8(quant_type_out, c_quant_type, [&] {
+      return DISPATCH_PYTORCH_IDTYPE_TO_CTYPE(pos_ids.scalar_type(), c_idtype, [&] {
+        cudaError_t status = MLARopeQuantize(
+            static_cast<c_type*>(q_rope_in.data_ptr()), static_cast<c_type*>(k_rope_in.data_ptr()),
+            static_cast<c_type*>(q_nope_in.data_ptr()), static_cast<c_type*>(k_nope_in.data_ptr()),
+            static_cast<c_quant_type*>(q_rope_out.data_ptr()),
+            static_cast<c_quant_type*>(k_rope_out.data_ptr()),
+            static_cast<c_quant_type*>(q_nope_out.data_ptr()),
+            static_cast<c_quant_type*>(k_nope_out.data_ptr()),
+            static_cast<float*>(cos_sin_cache.data_ptr()),
+            static_cast<c_idtype*>(pos_ids.data_ptr()), nnz, num_heads, q_rope_in_stride_n,
+            q_rope_in_stride_h, q_nope_in_stride_n, q_nope_in_stride_h, q_rope_out_stride_n,
+            q_rope_out_stride_h, q_nope_out_stride_n, q_nope_out_stride_h, k_rope_in_stride,
+            k_nope_in_stride, k_rope_out_stride, k_nope_out_stride, quant_scale_q, quant_scale_kv,
+            interleave, stream);
+        TORCH_CHECK(status == cudaSuccess,
+                    "MLARopeQuantize failed with error code " +
+                        std::string(cudaGetErrorString(status)));
+        return true;
+      });
+    });
+  });
+}
```
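
Functionally, the host wrapper above only validates shapes and dtypes and then dispatches to the `MLARopeQuantize` CUDA kernel over the fp16/bf16 input dtype, the fp8 output dtype, and the index dtype. For orientation, the fused op should be roughly equivalent to the unfused reference below, which applies non-interleaved (NeoX-style) RoPE to the 64-wide rope parts and then scales and casts all four tensors to fp8. This is a hedged sketch, not the kernel's implementation: the cos/sin cache layout (first half cos, second half sin) and the multiply-by-scale-then-cast quantization convention are assumptions.

```python
import torch


def mla_rope_quantize_reference(
    q_rope, k_rope, q_nope, k_nope, cos_sin_cache, pos_ids,
    quant_scale_q=1.0, quant_scale_kv=1.0, out_dtype=torch.float8_e4m3fn,
):
    # Assumed cache layout: (max_pos, 64) fp32, first 32 columns cos, last 32 columns sin.
    cos, sin = cos_sin_cache[pos_ids].float().chunk(2, dim=-1)  # each (nnz, 32)
    cos = torch.cat([cos, cos], dim=-1)  # (nnz, 64)
    sin = torch.cat([sin, sin], dim=-1)

    def rotate_half(x):
        x1, x2 = x.chunk(2, dim=-1)
        return torch.cat([-x2, x1], dim=-1)

    def rope(x):
        # Broadcast over the head dim for q (nnz, H_Q, 64); k is (nnz, 64).
        c = cos if x.dim() == 2 else cos[:, None, :]
        s = sin if x.dim() == 2 else sin[:, None, :]
        return x.float() * c + rotate_half(x.float()) * s

    def quantize(x, scale):
        # Assumed convention: multiply by the quant scale, then cast to fp8.
        return (x.float() * scale).to(out_dtype)

    return (
        quantize(rope(q_rope), quant_scale_q),
        quantize(rope(k_rope), quant_scale_kv),
        quantize(q_nope, quant_scale_q),
        quantize(k_nope, quant_scale_kv),
    )
```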

flashinfer/rope.py

Lines changed: 124 additions & 0 deletions
```diff
@@ -175,6 +175,61 @@ def _fake_apply_rope_pos_ids(
     pass
 
 
+@register_custom_op(
+    "flashinfer::mla_rope_quantize",
+    mutates_args=("q_rope_out", "k_rope_out", "q_nope_out", "k_nope_out"),
+)
+def _mla_rope_quantize(
+    q_rope_in: torch.Tensor,
+    k_rope_in: torch.Tensor,
+    q_nope_in: torch.Tensor,
+    k_nope_in: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    pos_ids: torch.Tensor,
+    q_rope_out: torch.Tensor,
+    k_rope_out: torch.Tensor,
+    q_nope_out: torch.Tensor,
+    k_nope_out: torch.Tensor,
+    quant_scale_q: float,
+    quant_scale_kv: float,
+    interleave: bool,
+) -> None:
+    get_rope_module().mla_rope_quantize(
+        q_rope_in,
+        k_rope_in,
+        q_nope_in,
+        k_nope_in,
+        q_rope_out,
+        k_rope_out,
+        q_nope_out,
+        k_nope_out,
+        cos_sin_cache,
+        pos_ids,
+        quant_scale_q,
+        quant_scale_kv,
+        interleave,
+    )
+
+
+@register_fake_op("flashinfer::mla_rope_quantize")
+def _fake_mla_rope_quantize(
+    q_rope_in: torch.Tensor,
+    k_rope_in: torch.Tensor,
+    q_nope_in: torch.Tensor,
+    k_nope_in: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    pos_ids: torch.Tensor,
+    q_rope_out: torch.Tensor,
+    k_rope_out: torch.Tensor,
+    q_nope_out: torch.Tensor,
+    k_nope_out: torch.Tensor,
+    quant_scale_q: float,
+    quant_scale_kv: float,
+    interleave: bool,
+) -> None:
+    pass
+
+
 @register_custom_op(
     "flashinfer::apply_rope_pos_ids_cos_sin_cache", mutates_args=("q_rope", "k_rope")
 )
@@ -1094,3 +1149,72 @@ def apply_rope_with_cos_sin_cache_inplace(
         pos_ids=positions,
         interleave=(not is_neox),
     )
+
+
+def mla_rope_quantize_fp8(
+    q_rope: torch.Tensor,
+    k_rope: torch.Tensor,
+    q_nope: torch.Tensor,
+    k_nope: torch.Tensor,
+    cos_sin_cache: torch.Tensor,
+    pos_ids: torch.Tensor,
+    is_neox: bool = True,
+    quantize_dtype: Optional[torch.dtype] = None,
+    quant_scale_q: float = 1.0,
+    quant_scale_kv: float = 1.0,
+    q_rope_out: Optional[torch.Tensor] = None,
+    k_rope_out: Optional[torch.Tensor] = None,
+    q_nope_out: Optional[torch.Tensor] = None,
+    k_nope_out: Optional[torch.Tensor] = None,
+) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    if cos_sin_cache.dtype != torch.float32:
+        raise ValueError("cos_sin_cache should be float32")
+
+    # Infer quantize_dtype from output tensors or default to float8_e4m3fn
+    if quantize_dtype is None:
+        for out in (q_rope_out, k_rope_out, q_nope_out, k_nope_out):
+            if out is not None:
+                quantize_dtype = out.dtype
+                break
+        else:
+            quantize_dtype = torch.float8_e4m3fn
+
+    # Allocate output tensors if not provided
+    q_rope_out = (
+        q_rope_out
+        if q_rope_out is not None
+        else torch.empty_like(q_rope, dtype=quantize_dtype)
+    )
+    k_rope_out = (
+        k_rope_out
+        if k_rope_out is not None
+        else torch.empty_like(k_rope, dtype=quantize_dtype)
+    )
+    q_nope_out = (
+        q_nope_out
+        if q_nope_out is not None
+        else torch.empty_like(q_nope, dtype=quantize_dtype)
+    )
+    k_nope_out = (
+        k_nope_out
+        if k_nope_out is not None
+        else torch.empty_like(k_nope, dtype=quantize_dtype)
+    )
+
+    _mla_rope_quantize(
+        q_rope,
+        k_rope,
+        q_nope,
+        k_nope,
+        cos_sin_cache,
+        pos_ids,
+        q_rope_out,
+        k_rope_out,
+        q_nope_out,
+        k_nope_out,
+        quant_scale_q,
+        quant_scale_kv,
+        not is_neox,  # interleave
+    )
+
+    return q_rope_out, k_rope_out, q_nope_out, k_nope_out
```
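
Putting it together, the shape checks in `csrc/rope.cu` imply the following call pattern for the new public wrapper: q tensors are 3-D with a head dimension, k tensors are 2-D, rope parts are 64-wide, nope parts are 512-wide, `cos_sin_cache` is fp32, and outputs default to `float8_e4m3fn`. A minimal usage sketch; the concrete sizes, the fp16 input dtype, the int32 `pos_ids` dtype, and the placeholder cache values are illustrative assumptions, not part of this change.

```python
import torch
from flashinfer.rope import mla_rope_quantize_fp8

nnz, num_heads, max_pos = 16, 128, 4096  # illustrative sizes
device = "cuda"

q_rope = torch.randn(nnz, num_heads, 64, dtype=torch.float16, device=device)
q_nope = torch.randn(nnz, num_heads, 512, dtype=torch.float16, device=device)
k_rope = torch.randn(nnz, 64, dtype=torch.float16, device=device)
k_nope = torch.randn(nnz, 512, dtype=torch.float16, device=device)
cos_sin_cache = torch.randn(max_pos, 64, dtype=torch.float32, device=device)  # placeholder values
pos_ids = torch.arange(nnz, dtype=torch.int32, device=device)

# Output tensors are allocated internally (float8_e4m3fn by default) when not passed in.
q_rope_fp8, k_rope_fp8, q_nope_fp8, k_nope_fp8 = mla_rope_quantize_fp8(
    q_rope, k_rope, q_nope, k_nope, cos_sin_cache, pos_ids,
    is_neox=True, quant_scale_q=1.0, quant_scale_kv=1.0,
)
```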
