Skip to content

Commit a4c4319

Browse files
committed
feat - refactor fmha python in cudagraph mode
1 parent 4303504 commit a4c4319

File tree

21 files changed

+254
-48
lines changed

21 files changed

+254
-48
lines changed

rtp_llm/cpp/devices/cuda_impl/CudaGraphDecode.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ void CudaGraphRunner::captureDecode() {
4444
prepareCaptureInputs(inputs, bs, bs * num_tokens_per_bs_);
4545

4646
graph_instances_[bs].mem_hold_ = createCaptureMemoryHold(inputs, bs * num_tokens_per_bs_);
47+
graph_instances_[bs].mem_hold_.attn_pyobj_ =
48+
py_attn_pyobj_method_(graph_instances_[bs].mem_hold_.py_model_inputs_, true);
4749
captureDecodeOneBatchSize(bs);
4850
replayAndSyncCheck(bs, "batch size");
4951
RTP_LLM_LOG_INFO("capture success for batch size: %d", bs);

rtp_llm/cpp/devices/cuda_impl/CudaGraphRunner.cc

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,9 @@ GraphBase* CudaDevice::getDeviceGraphRunner(const DeviceInitParams& params,
3939
}
4040

4141
py::object CudaGraphRunner::normalForward(PyModelInputs& inputs) {
42-
return py_forward_method_(inputs);
42+
auto attn_pyobj = py_attn_pyobj_method_(inputs, false);
43+
attn_pyobj.attr("prepare")(inputs);
44+
return py_forward_method_(inputs, attn_pyobj);
4345
}
4446

4547
// column dimension
@@ -97,12 +99,8 @@ void CudaGraphRunner::prepareInputs(PyModelInputs& inputs) {
9799
optimizedCopy(inputs.attention_inputs.padding_offset,
98100
py_model_inputs_.attention_inputs.padding_offset,
99101
inputs.attention_inputs.padding_offset.size(0) * sizeof(int));
100-
graph_instances_[state_.current_real_graph_bs].mem_hold_.params_ptr->fillParams(
101-
inputs.attention_inputs.sequence_lengths,
102-
inputs.attention_inputs.input_lengths,
103-
inputs.attention_inputs.kv_cache_block_id_host,
104-
state_.current_batch_size,
105-
seq_size_per_block_);
102+
auto attn_pyobj = graph_instances_[state_.current_real_graph_bs].mem_hold_.attn_pyobj_;
103+
attn_pyobj.attr("prepare_replay")(inputs);
106104
} else {
107105
auto& py_model_inputs_ = graph_instances_[state_.current_real_graph_seq_len].mem_hold_.py_model_inputs_;
108106

@@ -343,8 +341,10 @@ void CudaGraphRunner::initCapture() {
343341
capture_mem_hold_ = CaptureMemoryHold(output, inputs, kv_cache_block_offset_, is_prefill_cuda_graph_mode_);
344342
initKernelInternalMemory();
345343
// get real output data type
344+
auto attn_pyobj = py_attn_pyobj_method_(capture_mem_hold_.py_model_inputs_, true);
345+
attn_pyobj.attr("prepare")(capture_mem_hold_.py_model_inputs_);
346346
RTP_LLM_LOG_INFO("initCapture forward for output datatype start");
347-
auto py_outputs_obj = py_forward_method_(capture_mem_hold_.py_model_inputs_);
347+
auto py_outputs_obj = py_forward_method_(capture_mem_hold_.py_model_inputs_, attn_pyobj);
348348
RTP_LLM_LOG_INFO("initCapture forward for output datatype end");
349349
auto outputs = py_outputs_obj.cast<PyModelOutputs>();
350350
options_cuda_float_ = torch::TensorOptions()
@@ -382,8 +382,10 @@ void CudaGraphRunner::captureOneGraphInstance(int key, const char* key_type) {
382382
auto inputs = graph_instances_[key].mem_hold_.py_model_inputs_;
383383
// WarmUp twice
384384
RTP_LLM_LOG_INFO("WarmUp for %s %d start.", key_type, key);
385-
py_forward_method_(inputs);
386-
py_forward_method_(inputs);
385+
auto attn_pyobj = graph_instances_[key].mem_hold_.attn_pyobj_;
386+
attn_pyobj.attr("prepare")(inputs);
387+
py_forward_method_(inputs, attn_pyobj);
388+
py_forward_method_(inputs, attn_pyobj);
387389
RTP_LLM_LOG_INFO("WarmUp for %s %d successfully.", key_type, key);
388390

389391
{
@@ -399,7 +401,7 @@ void CudaGraphRunner::captureOneGraphInstance(int key, const char* key_type) {
399401
{
400402
graph.capture_begin();
401403
CudaGraphCaptureGuard capture_guard;
402-
auto py_outputs_obj = py_forward_method_(inputs);
404+
auto py_outputs_obj = py_forward_method_(inputs, attn_pyobj);
403405
outputs = py_outputs_obj.cast<PyModelOutputs>();
404406
graph_instances_[key].mem_hold_.decoder_layer_hidden_states_.copy_(outputs.hidden_states);
405407
graph.capture_end();

rtp_llm/cpp/devices/cuda_impl/CudaGraphRunner.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ class CudaGraphRunner: public GraphBase {
3838
} else {
3939
max_bs_ = params.concurrency_config.concurrency_limit;
4040
}
41+
py_attn_pyobj_method_ = py_instance_.attr("prepare_fmha_impl");
4142
py_forward_method_ = py_instance_.attr("forward");
4243
py_fill_params_method_ = py_instance_.attr("fill_params");
4344
options_cuda_int32_ = torch::TensorOptions().dtype(torch::kInt32).device(torch::kCUDA).requires_grad(false);
@@ -99,6 +100,7 @@ class CudaGraphRunner: public GraphBase {
99100
void initCaptureBertEmbeddingInputs(PyModelInputs& inputs, int max_bs, int max_num_token);
100101
void initCaptureAttentionInputsPost();
101102
py::object py_forward_method_;
103+
py::object py_attn_pyobj_method_;
102104
py::object py_fill_params_method_;
103105
bool enable_cuda_graph_{false};
104106
bool is_prefill_cuda_graph_mode_{false};

rtp_llm/cpp/devices/cuda_impl/CudaGraphUtils.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ class CaptureMemoryHold {
4343
at::Tensor decoder_layer_hidden_states_;
4444
// for input
4545
PyModelInputs py_model_inputs_;
46+
py::object attn_pyobj_;
4647
};
4748

4849
class GraphInstance {

rtp_llm/cpp/models/PyWrappedModel.cc

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -256,10 +256,12 @@ GptModelOutputs PyWrappedModel::forward(const GptModelInputs& inputs) {
256256
hidden_states = torchTensor2Buffer(py_model_outputs.hidden_states);
257257
} else {
258258
DevicePerfWrapper wrapper(device_, "normal forward");
259-
auto py_model_forward = py_model_.attr("forward");
260-
auto outputs = py_model_forward(py_model_inputs);
261-
py_model_outputs = outputs.cast<PyModelOutputs>();
262-
hidden_states = device_->clone({*torchTensor2Buffer(py_model_outputs.hidden_states)});
259+
auto attn_pyobj = py_model_.attr("prepare_fmha_impl")(py_model_inputs, false);
260+
attn_pyobj.attr("prepare")(py_model_inputs);
261+
auto py_model_forward = py_model_.attr("forward");
262+
auto outputs = py_model_forward(py_model_inputs, attn_pyobj);
263+
py_model_outputs = outputs.cast<PyModelOutputs>();
264+
hidden_states = device_->clone({*torchTensor2Buffer(py_model_outputs.hidden_states)});
263265
}
264266

265267
RTP_LLM_LOG_DEBUG("Python object instance forward method called successfully.");
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
#include "rtp_llm/models_py/bindings/cuda/DebugKernelOp.h"
2+
#include "rtp_llm/cpp/core/Dispatch.h"
3+
#include "rtp_llm/cpp/utils/AssertUtils.h"
4+
#include "rtp_llm/cpp/core/torch_utils/BufferTorchUtils.h"
5+
6+
namespace rtp_llm {
7+
8+
/// @brief Launch the device-side debug kernel that prints an m x n block of a
///        2-D tensor, starting at (start_row, start_col).
/// @param data      CUDA tensor to inspect; must be contiguous.
/// @param start_row first row of the printed block
/// @param start_col first column of the printed block
/// @param m         number of rows to print
/// @param n         number of columns to print
/// @param row_len   row stride of `data`; 0 means "use data.sizes()[1]"
///                  (as documented on the python binding)
/// @param info_id   identifier echoed with the printout to tag it in logs
void DebugKernelOp::forward(const torch::Tensor& data,
                            int64_t start_row,
                            int64_t start_col,
                            int64_t m,
                            int64_t n,
                            int64_t row_len,
                            int64_t info_id) {
    // Validate input tensor: the kernel reads device memory directly, so the
    // tensor must live on the GPU and be densely packed.
    RTP_LLM_CHECK_WITH_INFO(data.is_cuda(), "Input tensor must be on CUDA device");
    RTP_LLM_CHECK_WITH_INFO(data.is_contiguous(), "Input tensor must be contiguous");

    // The binding documents row_len == 0 as "use data.sizes()[1]"; implement
    // that fallback here so the kernel never receives a zero row stride.
    if (row_len == 0 && data.dim() > 1) {
        row_len = data.size(1);
    }

    // Run on the current stream of the tensor's device so the print is
    // ordered with respect to the surrounding kernels.
    auto stream = c10::cuda::getCurrentCUDAStream(data.get_device());

    // Dispatch to the typed kernel instantiation matching the tensor's dtype.
    DISPATCH_CUDA_FUNCTION_DATA_TYPE(torchDTypeToDataType(data.dtype()),
                                     invoke_debug_kernel2,
                                     data.data_ptr(),
                                     static_cast<int>(start_row),
                                     static_cast<int>(start_col),
                                     static_cast<int>(m),
                                     static_cast<int>(n),
                                     static_cast<int>(row_len),
                                     static_cast<int>(info_id),
                                     stream);
}
34+
35+
/// Register the DebugKernelOp python binding on module `m`.
/// Defaults mirror the common usage: print a 30x10 block from the origin.
void registerDebugKernelOp(const py::module& m) {
    auto cls = pybind11::class_<DebugKernelOp>(m, "DebugKernelOp");
    cls.def(pybind11::init<>());
    cls.def("forward",
            &DebugKernelOp::forward,
            py::arg("data"),
            py::arg("start_row") = 0,
            py::arg("start_col") = 0,
            py::arg("m") = 30,
            py::arg("n") = 10,
            py::arg("row_len") = 0,  // Will use data.sizes()[1] if 0
            py::arg("info_id") = 1);
}
48+
49+
} // namespace rtp_llm
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
#pragma once
2+
3+
#include <torch/extension.h>
4+
#include <c10/cuda/CUDAStream.h>
5+
#include "rtp_llm/cpp/kernels/unfused_attention_kernels.h"
6+
7+
namespace rtp_llm {
8+
9+
class DebugKernelOp {
10+
public:
11+
DebugKernelOp() = default;
12+
13+
/// @brief Debug kernel to print 2D data blocks
14+
/// @param data Input tensor to debug
15+
/// @param start_row Starting row index
16+
/// @param start_col Starting column index
17+
/// @param m Number of rows to print
18+
/// @param n Number of columns to print
19+
/// @param row_len Length of each row (stride)
20+
/// @param info_id Debug identifier
21+
void forward(const torch::Tensor& data,
22+
int64_t start_row,
23+
int64_t start_col,
24+
int64_t m,
25+
int64_t n,
26+
int64_t row_len,
27+
int64_t info_id);
28+
};
29+
30+
void registerDebugKernelOp(const py::module& m);
31+
32+
} // namespace rtp_llm

rtp_llm/models_py/bindings/cuda/FusedRopeKVCacheOp.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,8 +225,19 @@ torch::Tensor FusedRopeKVCacheDecodeOp::forward(const torch::Tensor&
225225
}
226226

227227
void registerFusedRopeKVCacheOp(const py::module& m) {
228-
pybind11::class_<KVBlockArray>(m, "KVBlockArray").def(pybind11::init<>());
229-
pybind11::class_<TRTAttn, std::shared_ptr<TRTAttn>, rtp_llm::ParamsBase>(m, "TRTAttn").def(pybind11::init<>());
228+
pybind11::class_<KVBlockArray>(m, "KVBlockArray")
229+
.def(pybind11::init<>())
230+
.def(
231+
"__cpp_ptr__",
232+
[](KVBlockArray& self) { return reinterpret_cast<uintptr_t>(&self); },
233+
"Get C++ object pointer address");
234+
pybind11::class_<TRTAttn, std::shared_ptr<TRTAttn>, rtp_llm::ParamsBase>(m, "TRTAttn")
235+
.def(pybind11::init<>())
236+
.def_readwrite("kv_cache_offset", &TRTAttn::kv_cache_offset)
237+
.def(
238+
"__cpp_ptr__",
239+
[](TRTAttn& self) { return reinterpret_cast<uintptr_t>(&self); },
240+
"Get C++ object pointer address");
230241
pybind11::class_<FusedRopeKVCachePrefillOp>(m, "FusedRopeKVCachePrefillOp")
231242
.def(pybind11::init<GptInitParameter>(), py::arg("gpt_init_parameter"))
232243
.def("prepare", &FusedRopeKVCachePrefillOp::prepare, py::arg("attn_inputs"))

rtp_llm/models_py/bindings/cuda/RegisterCudaOps.cc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include "rtp_llm/cpp/cuda/cutlass/cutlass_kernels/fp8_group_gemm/fp8_group_gemm.h"
55
#include "rtp_llm/cpp/kernels/scaled_fp8_quant.h"
66
#include "rtp_llm/cpp/kernels/moe/ep_utils.h"
7+
#include "rtp_llm/models_py/bindings/cuda/DebugKernelOp.h"
78

89
namespace rtp_llm {
910

@@ -94,6 +95,7 @@ void registerPyModuleOps(py::module& rtp_ops_m) {
9495

9596
registerBaseCudaBindings(rtp_ops_m);
9697
registerAttnOpBindings(rtp_ops_m);
98+
registerDebugKernelOp(rtp_ops_m);
9799
}
98100

99101
} // namespace rtp_llm

rtp_llm/models_py/bindings/cuda/XQAAttnOp.cc

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,7 +90,12 @@ XQAAttnOp::forward(const torch::Tensor& input, std::optional<torch_ext::KVCache>
9090

9191
void registerXQAAttnOp(const py::module& m) {
9292
pybind11::class_<XQAParams, std::shared_ptr<XQAParams>, rtp_llm::ParamsBase>(m, "XQAParams")
93-
.def(pybind11::init<>());
93+
.def(pybind11::init<>())
94+
.def(
95+
"__cpp_ptr__",
96+
[](XQAParams& self) { return reinterpret_cast<uintptr_t>(&self); },
97+
"Get C++ object pointer address")
98+
.def_readwrite("kv_cache_offset", &XQAParams::kv_cache_offset);
9499
pybind11::class_<XQAAttnOp>(m, "XQAAttnOp")
95100
.def(pybind11::init<GptInitParameter>(), py::arg("gpt_init_parameter"))
96101
.def("support", &XQAAttnOp::support, py::arg("attn_inputs").noconvert())

0 commit comments

Comments (0)