diff --git a/include/infinicore/context/context.hpp b/include/infinicore/context/context.hpp index 1612db830..eb46dc622 100644 --- a/include/infinicore/context/context.hpp +++ b/include/infinicore/context/context.hpp @@ -3,6 +3,8 @@ #include "../device.hpp" #include "../memory.hpp" +#include "../graph/graph.hpp" + #include #include @@ -40,6 +42,12 @@ void destroyEvent(infinirtEvent_t event); float elapsedTime(infinirtEvent_t start, infinirtEvent_t end); void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event); +// Graph recording APIs +bool isGraphRecording(); +void startGraphRecording(); +void addGraphOperator(std::shared_ptr op); +std::shared_ptr stopGraphRecording(); + } // namespace context } // namespace infinicore diff --git a/include/infinicore/graph/graph.hpp b/include/infinicore/graph/graph.hpp new file mode 100644 index 000000000..c63b3272d --- /dev/null +++ b/include/infinicore/graph/graph.hpp @@ -0,0 +1,92 @@ +#pragma once + +#include +#include + +#include "../tensor.hpp" + +namespace infinicore::graph { +// Forward declarations +class GraphManager; + +class GraphTensor : public Tensor { +public: + GraphTensor(const Tensor &); +}; + +class GraphOperator { + +public: + void run() const; + ~GraphOperator(); + +protected: + using run_schema = void (*)(void *); + using cleanup_schema = void (*)(void **); + void *planned_meta_; + run_schema runner_; + cleanup_schema deleter_; +}; + +class Graph { +public: + Graph() = default; + ~Graph() = default; + + void run() const; + +protected: + void add_operator(std::shared_ptr op); + + std::vector> op_list_; + + friend class GraphManager; +}; +} // namespace infinicore::graph + +#define INFINICORE_GRAPH_OP_CLASS(__OP_NAME__, ...) \ + class __OP_NAME__ : public graph::GraphOperator { \ + public: \ + using schema = void (*)(__VA_ARGS__); \ + using plan_schema = void *(*)(__VA_ARGS__); \ + static common::OpDispatcher &plan_dispatcher(); \ + static common::OpDispatcher &run_dispatcher(); \ + static common::OpDispatcher &cleanup_dispatcher(); \ + __OP_NAME__(__VA_ARGS__); \ + static void execute(__VA_ARGS__); \ + }; + +#define INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(__OP_NAME__) \ + common::OpDispatcher<__OP_NAME__::plan_schema> &__OP_NAME__::plan_dispatcher() { \ + static common::OpDispatcher<__OP_NAME__::plan_schema> dispatcher_; \ + return dispatcher_; \ + } \ + common::OpDispatcher<__OP_NAME__::run_schema> &__OP_NAME__::run_dispatcher() { \ + static common::OpDispatcher<__OP_NAME__::run_schema> dispatcher_; \ + return dispatcher_; \ + } \ + common::OpDispatcher<__OP_NAME__::cleanup_schema> &__OP_NAME__::cleanup_dispatcher() { \ + static common::OpDispatcher<__OP_NAME__::cleanup_schema> dispatcher_; \ + return dispatcher_; \ + } + +#define INFINICORE_GRAPH_OP_DISPATCH(__DEVICE_TYPE__, ...) \ + planned_meta_ = plan_dispatcher().lookup(__DEVICE_TYPE__)(__VA_ARGS__); \ + runner_ = run_dispatcher().lookup(__DEVICE_TYPE__); \ + deleter_ = cleanup_dispatcher().lookup(__DEVICE_TYPE__); + +#define INFINICORE_GRAPH_OP_RECORD_OR_RUN(__OP_NAME__, ...) 
\ + auto op = std::make_shared<__OP_NAME__>(__VA_ARGS__); \ + if (context::isGraphRecording()) { \ + context::addGraphOperator(op); \ + } else { \ + op->run(); \ + } + +#define INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(__OP_NAME__, __PLAN_F__, __RUN_F__, __CLEANUP_F__) \ + static bool registered = []() { \ + __OP_NAME__::plan_dispatcher().registerAll(__PLAN_F__, false); \ + __OP_NAME__::run_dispatcher().registerAll(__RUN_F__, false); \ + __OP_NAME__::cleanup_dispatcher().registerAll(__CLEANUP_F__, false); \ + return true; \ + }(); diff --git a/include/infinicore/ops/gemm.hpp b/include/infinicore/ops/gemm.hpp index 6562f087d..481d47cf6 100644 --- a/include/infinicore/ops/gemm.hpp +++ b/include/infinicore/ops/gemm.hpp @@ -1,16 +1,12 @@ #pragma once #include "../device.hpp" +#include "../graph/graph.hpp" #include "common/op.hpp" namespace infinicore::op { -class Gemm { -public: - using schema = void (*)(Tensor, Tensor, Tensor, float, float); - static void execute(Tensor c, Tensor a, Tensor b, float alpha, float beta); - static common::OpDispatcher &dispatcher(); -}; +INFINICORE_GRAPH_OP_CLASS(Gemm, Tensor, Tensor, Tensor, float, float); Tensor gemm(Tensor a, Tensor b, float alpha = 1.0f, float beta = 0.0f); void gemm_(Tensor c, Tensor a, Tensor b, float alpha, float beta); diff --git a/include/infinicore/ops/paged_caching.hpp b/include/infinicore/ops/paged_caching.hpp index 4cf8d4f02..e357cda38 100644 --- a/include/infinicore/ops/paged_caching.hpp +++ b/include/infinicore/ops/paged_caching.hpp @@ -8,10 +8,10 @@ namespace infinicore::op { class PagedCaching { public: using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor); - static void execute(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping); + static void execute(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping); static common::OpDispatcher &dispatcher(); }; -void paged_caching_(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping); +void paged_caching_(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping); } // namespace infinicore::op diff --git a/include/infinicore/tensor.hpp b/include/infinicore/tensor.hpp index 58a8f59e7..e9f210186 100644 --- a/include/infinicore/tensor.hpp +++ b/include/infinicore/tensor.hpp @@ -133,6 +133,8 @@ class TensorImpl : public std::enable_shared_from_this { void debug() const; + Tensor to_blob() const; + /// /// Data Transfer APIs /// @@ -294,7 +296,7 @@ class TensorImpl : public std::enable_shared_from_this { friend class Tensor; -private: +protected: TensorMetaData meta_; TensorData data_; }; diff --git a/include/infiniop/ops/paged_caching.h b/include/infiniop/ops/paged_caching.h index cf162ba38..33764138d 100644 --- a/include/infiniop/ops/paged_caching.h +++ b/include/infiniop/ops/paged_caching.h @@ -14,20 +14,20 @@ typedef struct InfiniopDescriptor *infiniopPagedCachingDescriptor_t; * * @param handle The handle to the InfiniOP library context. * @param desc_ptr A pointer to store the created descriptor. - * @param k_desc Descriptor for the source key tensor. - * @param v_desc Descriptor for the source value tensor. * @param k_cache_desc Descriptor for the key cache pool tensor. * @param v_cache_desc Descriptor for the value cache pool tensor. + * @param k_desc Descriptor for the source key tensor. + * @param v_desc Descriptor for the source value tensor. * @param slot_mapping_desc Descriptor for the slot mapping tensor. * @return infiniStatus_t Status code of the operation. 
*/ __C __export infiniStatus_t infiniopCreatePagedCachingDescriptor( infiniopHandle_t handle, infiniopPagedCachingDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t slot_mapping_desc); /** @@ -46,10 +46,10 @@ __C __export infiniStatus_t infiniopGetPagedCachingWorkspaceSize( * @param desc The Paged Caching descriptor. * @param workspace Pointer to the workspace memory. * @param workspace_size The size of the workspace. - * @param k Pointer to the source key tensor data. - * @param v Pointer to the source value tensor data. * @param k_cache Pointer to the key cache pool data. * @param v_cache Pointer to the value cache pool data. + * @param k Pointer to the source key tensor data. + * @param v Pointer to the source value tensor data. * @param slot_mapping Pointer to the slot mapping data. * @param stream The CUDA stream for the operation. Can be NULL. * @return infiniStatus_t Status code of the operation. @@ -58,10 +58,10 @@ __C __export infiniStatus_t infiniopPagedCaching( infiniopPagedCachingDescriptor_t desc, void *workspace, size_t workspace_size, - const void *k, - const void *v, void *k_cache, void *v_cache, + const void *k, + const void *v, const void *slot_mapping, void *stream); diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py index b7288f3ac..c6b01d5aa 100644 --- a/python/infinicore/__init__.py +++ b/python/infinicore/__init__.py @@ -8,7 +8,10 @@ get_device, get_device_count, get_stream, + is_graph_recording, set_device, + start_graph_recording, + stop_graph_recording, sync_device, sync_stream, ) @@ -81,6 +84,9 @@ "set_device", "sync_device", "sync_stream", + "is_graph_recording", + "start_graph_recording", + "stop_graph_recording", # Data Types. "bfloat16", "bool", diff --git a/python/infinicore/context.py b/python/infinicore/context.py index d74a839f2..1181738c5 100644 --- a/python/infinicore/context.py +++ b/python/infinicore/context.py @@ -1,4 +1,5 @@ import infinicore.device +from infinicore.graph import Graph from infinicore.lib import _infinicore @@ -49,3 +50,24 @@ def get_stream(): stream: The current stream object """ return _infinicore.get_stream() + + +def is_graph_recording(): + """Check if the current graph is recording. + + Returns: + bool: True if the current graph is recording, False otherwise + """ + return _infinicore.is_graph_recording() + + +def start_graph_recording(device=None): + """Start recording the current graph.""" + if device is not None: + set_device(device) + _infinicore.start_graph_recording() + + +def stop_graph_recording(): + """Stop recording the current graph.""" + return Graph(_infinicore.stop_graph_recording()) diff --git a/python/infinicore/graph.py b/python/infinicore/graph.py new file mode 100644 index 000000000..7d7feb970 --- /dev/null +++ b/python/infinicore/graph.py @@ -0,0 +1,18 @@ +from infinicore.lib import _infinicore + + +class Graph: + """ + Python wrapper around a InfiniCore Graph instance. 
+ """ + + def __init__(self, graph: _infinicore.Graph): + if not isinstance(graph, _infinicore.Graph): + raise TypeError("Expected _infinicore.Graph") + self._graph = graph + + def run(self): + return self._graph.run() + + def __repr__(self): + return f"" diff --git a/python/infinicore/ops/paged_caching.py b/python/infinicore/ops/paged_caching.py index 59459eba1..e3b8d63fb 100644 --- a/python/infinicore/ops/paged_caching.py +++ b/python/infinicore/ops/paged_caching.py @@ -3,18 +3,18 @@ def paged_caching( - k: Tensor, - v: Tensor, k_cache: Tensor, v_cache: Tensor, + k: Tensor, + v: Tensor, slot_mapping: Tensor, ): Tensor( _infinicore.paged_caching_( - k._underlying, - v._underlying, k_cache._underlying, v_cache._underlying, + k._underlying, + v._underlying, slot_mapping._underlying, ) ) diff --git a/src/infinicore/context/allocators/device_pinned_allocator.cc b/src/infinicore/context/allocators/device_pinned_allocator.cc index b9e8ea217..36ee599ff 100644 --- a/src/infinicore/context/allocators/device_pinned_allocator.cc +++ b/src/infinicore/context/allocators/device_pinned_allocator.cc @@ -12,12 +12,18 @@ DevicePinnedHostAllocator::~DevicePinnedHostAllocator() { } std::byte *DevicePinnedHostAllocator::allocate(size_t size) { + if (size == 0) { + return nullptr; + } void *ptr; INFINICORE_CHECK_ERROR(infinirtMallocHost(&ptr, size)); return (std::byte *)ptr; } void DevicePinnedHostAllocator::deallocate(std::byte *ptr) { + if (ptr == nullptr) { + return; + } if (owner_ == context::getDevice()) { INFINICORE_CHECK_ERROR(infinirtFreeHost(ptr)); gc(); diff --git a/src/infinicore/context/allocators/host_allocator.cc b/src/infinicore/context/allocators/host_allocator.cc index a06ca8284..41a55b453 100644 --- a/src/infinicore/context/allocators/host_allocator.cc +++ b/src/infinicore/context/allocators/host_allocator.cc @@ -4,10 +4,16 @@ namespace infinicore { std::byte *HostAllocator::allocate(size_t size) { + if (size == 0) { + return nullptr; + } return (std::byte *)std::malloc(size); } void HostAllocator::deallocate(std::byte *ptr) { + if (ptr == nullptr) { + return; + } std::free(ptr); } diff --git a/src/infinicore/context/allocators/pinnable_block_allocator.cc b/src/infinicore/context/allocators/pinnable_block_allocator.cc index ad81367a3..f41800d7c 100644 --- a/src/infinicore/context/allocators/pinnable_block_allocator.cc +++ b/src/infinicore/context/allocators/pinnable_block_allocator.cc @@ -1,5 +1,7 @@ #include "pinnable_block_allocator.hpp" +#include "../context_impl.hpp" + #include "../../utils.hpp" #include @@ -35,6 +37,9 @@ PinnableBlockAllocator::PinnableBlockAllocator(Device device) // ------------------- allocate ------------------- std::byte *PinnableBlockAllocator::allocate(size_t size) { + if (size == 0) { + return nullptr; + } std::lock_guard lock(mutex_); // Align size to 256 bytes for GPU @@ -92,7 +97,7 @@ std::byte *PinnableBlockAllocator::allocate(size_t size) { // ------------------- deallocate ------------------- void PinnableBlockAllocator::deallocate(std::byte *ptr) { - if (!ptr) { + if (ptr == nullptr) { return; } diff --git a/src/infinicore/context/allocators/pinnable_block_allocator.hpp b/src/infinicore/context/allocators/pinnable_block_allocator.hpp index 65aac7784..8911d2a6d 100644 --- a/src/infinicore/context/allocators/pinnable_block_allocator.hpp +++ b/src/infinicore/context/allocators/pinnable_block_allocator.hpp @@ -2,8 +2,6 @@ #include "memory_allocator.hpp" -#include "../context_impl.hpp" - #include #include #include @@ -25,7 +23,7 @@ class PinnableBlockAllocator 
: public MemoryAllocator { }; public: - explicit PinnableBlockAllocator(Device device); + PinnableBlockAllocator(Device device); ~PinnableBlockAllocator(); std::byte *allocate(size_t size) override; diff --git a/src/infinicore/context/allocators/stream_ordered_allocator.cc b/src/infinicore/context/allocators/stream_ordered_allocator.cc index 5e5148973..3bd0aaab1 100644 --- a/src/infinicore/context/allocators/stream_ordered_allocator.cc +++ b/src/infinicore/context/allocators/stream_ordered_allocator.cc @@ -8,12 +8,18 @@ namespace infinicore { StreamOrderedAllocator::StreamOrderedAllocator(Device device) : MemoryAllocator(), device_(device) {} std::byte *StreamOrderedAllocator::allocate(size_t size) { + if (size == 0) { + return nullptr; + } void *ptr = nullptr; INFINICORE_CHECK_ERROR(infinirtMallocAsync(&ptr, size, context::getStream())); return (std::byte *)ptr; } void StreamOrderedAllocator::deallocate(std::byte *ptr) { + if (ptr == nullptr) { + return; + } INFINICORE_CHECK_ERROR(infinirtFreeAsync(ptr, context::getStream())); } } // namespace infinicore diff --git a/src/infinicore/context/context_impl.cc b/src/infinicore/context/context_impl.cc index c5ed7acd1..5faf3bfe7 100644 --- a/src/infinicore/context/context_impl.cc +++ b/src/infinicore/context/context_impl.cc @@ -39,6 +39,10 @@ void ContextImpl::setDevice(Device device) { return; } + if (getCurrentRuntime()->isGraphRecording()) { + spdlog::warn("Switching device runtime during graph recording may break the graph!"); + } + if (runtime_table_[int(device.getType())][device.getIndex()] == nullptr) { // Lazy initialization of runtime if never set before. runtime_table_[int(device.getType())][device.getIndex()] = std::unique_ptr(new Runtime(device)); @@ -178,6 +182,21 @@ void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) { ContextImpl::singleton().getCurrentRuntime()->streamWaitEvent(stream, event); } +bool isGraphRecording() { + return ContextImpl::singleton().getCurrentRuntime()->isGraphRecording(); +} + +void startGraphRecording() { + ContextImpl::singleton().getCurrentRuntime()->startGraphRecording(); +} + +void addGraphOperator(std::shared_ptr op) { + ContextImpl::singleton().getCurrentRuntime()->addGraphOperator(op); +} + +std::shared_ptr stopGraphRecording() { + return ContextImpl::singleton().getCurrentRuntime()->stopGraphRecording(); +} } // namespace context } // namespace infinicore diff --git a/src/infinicore/context/runtime/runtime.cc b/src/infinicore/context/runtime/runtime.cc index 85ab094df..a6dd7eb7e 100644 --- a/src/infinicore/context/runtime/runtime.cc +++ b/src/infinicore/context/runtime/runtime.cc @@ -8,12 +8,12 @@ #include "../allocators/stream_ordered_allocator.hpp" namespace infinicore { -Runtime::Runtime(Device device) : device_(device) { +Runtime::Runtime(Device device) : device_(device), graph_manager_(std::make_unique()) { activate(); INFINICORE_CHECK_ERROR(infinirtStreamCreate(&stream_)); INFINICORE_CHECK_ERROR(infiniopCreateHandle(&infiniop_handle_)); if (device_.getType() == Device::Type::CPU) { - device_memory_allocator_ = std::make_unique(); + device_memory_allocator_ = std::make_unique(device); } else { device_memory_allocator_ = std::make_unique(device); pinned_host_memory_allocator_ = std::make_unique(device); @@ -145,6 +145,25 @@ void Runtime::streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event) { INFINICORE_CHECK_ERROR(infinirtStreamWaitEvent(stream, event)); } +bool Runtime::isGraphRecording() const { + return graph_manager_->is_recording(); +} + +void 
Runtime::startGraphRecording() { + device_memory_allocator_->set_pin_mode(true); + return graph_manager_->start_recording(); +} + +void Runtime::addGraphOperator(std::shared_ptr op) { + return graph_manager_->add_operator(op); +} + +std::shared_ptr Runtime::stopGraphRecording() { + auto graph = graph_manager_->stop_recording(); + device_memory_allocator_->set_pin_mode(false); + return graph; +} + std::string Runtime::toString() const { return fmt::format("Runtime({})", device_.toString()); } diff --git a/src/infinicore/context/runtime/runtime.hpp b/src/infinicore/context/runtime/runtime.hpp index c731b7804..58d8bd424 100644 --- a/src/infinicore/context/runtime/runtime.hpp +++ b/src/infinicore/context/runtime/runtime.hpp @@ -1,8 +1,11 @@ #pragma once -#include "../allocators/memory_allocator.hpp" +#include "../allocators/pinnable_block_allocator.hpp" + #include "infinicore/context/context.hpp" +#include "../../graph/graph_manager.hpp" + #include #include @@ -13,8 +16,9 @@ class Runtime { Device device_; infinirtStream_t stream_; infiniopHandle_t infiniop_handle_; - std::unique_ptr device_memory_allocator_; + std::unique_ptr device_memory_allocator_; std::unique_ptr pinned_host_memory_allocator_; + std::unique_ptr graph_manager_; protected: Runtime(Device device); @@ -48,6 +52,12 @@ class Runtime { float elapsedTime(infinirtEvent_t start, infinirtEvent_t end); void streamWaitEvent(infinirtStream_t stream, infinirtEvent_t event); + // Graph + bool isGraphRecording() const; + void startGraphRecording(); + void addGraphOperator(std::shared_ptr op); + std::shared_ptr stopGraphRecording(); + std::string toString() const; friend class ContextImpl; diff --git a/src/infinicore/graph/graph.cc b/src/infinicore/graph/graph.cc new file mode 100644 index 000000000..86944af36 --- /dev/null +++ b/src/infinicore/graph/graph.cc @@ -0,0 +1,67 @@ +#include "graph_manager.hpp" + +#include "../utils.hpp" + +namespace infinicore::graph { + +/* ========================= + * GraphTensor + * ========================= */ + +GraphTensor::GraphTensor(const Tensor &tensor) : Tensor(tensor->to_blob()) { +} + +/* ========================= + * GraphOperator + * ========================= */ + +void GraphOperator::run() const { + runner_(planned_meta_); +} + +GraphOperator::~GraphOperator() { + if (deleter_) { + deleter_(&planned_meta_); + } +} + +/* ========================= + * Graph + * ========================= */ + +void Graph::run() const { + for (auto &op : op_list_) { + op->run(); + } +} + +void Graph::add_operator(std::shared_ptr op) { + op_list_.push_back(op); +} + +/* ========================= + * GraphManager + * ========================= */ + +bool GraphManager::is_recording() const { + return recording_; +} + +void GraphManager::start_recording() { + recording_ = true; + graph_ = std::make_shared(); +} + +void GraphManager::add_operator(std::shared_ptr op) { + INFINICORE_ASSERT(recording_); + + graph_->add_operator(op); +} + +std::shared_ptr GraphManager::stop_recording() { + + recording_ = false; + return std::exchange(graph_, nullptr); +} + +} // namespace infinicore::graph diff --git a/src/infinicore/graph/graph_manager.hpp b/src/infinicore/graph/graph_manager.hpp new file mode 100644 index 000000000..9cc040366 --- /dev/null +++ b/src/infinicore/graph/graph_manager.hpp @@ -0,0 +1,25 @@ +#pragma once + +#include "infinicore/graph/graph.hpp" + +#include +#include + +namespace infinicore::graph { + +class GraphManager { +public: + GraphManager() = default; + ~GraphManager() = default; + + bool is_recording() 
const; + void start_recording(); + void add_operator(std::shared_ptr op); + std::shared_ptr stop_recording(); + +private: + std::shared_ptr graph_; + bool recording_ = false; +}; + +} // namespace infinicore::graph diff --git a/src/infinicore/ops/gemm/gemm.cc b/src/infinicore/ops/gemm/gemm.cc index ecaaa4960..e2b3924f7 100644 --- a/src/infinicore/ops/gemm/gemm.cc +++ b/src/infinicore/ops/gemm/gemm.cc @@ -3,16 +3,15 @@ #include "../../utils.hpp" namespace infinicore::op { +INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Gemm); -common::OpDispatcher &Gemm::dispatcher() { - static common::OpDispatcher dispatcher_; - return dispatcher_; -}; +Gemm::Gemm(Tensor c, Tensor a, Tensor b, float alpha, float beta) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); + INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b, alpha, beta); +} void Gemm::execute(Tensor c, Tensor a, Tensor b, float alpha, float beta) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b); - infinicore::context::setDevice(c->device()); - dispatcher().lookup(c->device().getType())(c, a, b, alpha, beta); + INFINICORE_GRAPH_OP_RECORD_OR_RUN(Gemm, c, a, b, alpha, beta); } Tensor gemm(Tensor a, Tensor b, float alpha, float beta) { diff --git a/src/infinicore/ops/gemm/gemm_infiniop.cc b/src/infinicore/ops/gemm/gemm_infiniop.cc index 3577be190..670fdbc2a 100644 --- a/src/infinicore/ops/gemm/gemm_infiniop.cc +++ b/src/infinicore/ops/gemm/gemm_infiniop.cc @@ -1,50 +1,49 @@ -#include "../../utils.hpp" -#include "infinicore/common/hash.hpp" -#include "infinicore/ops/common/cache.hpp" +#include "../infiniop_impl.hpp" #include "infinicore/ops/gemm.hpp" -#include namespace infinicore::op::gemm_impl::infiniop { -thread_local common::OpCache caches( - 100, // capacity - [](infiniopGemmDescriptor_t &desc) { - if (desc != nullptr) { - INFINICORE_CHECK_ERROR(infiniopDestroyGemmDescriptor(desc)); - desc = nullptr; - } - }); - -void calculate(Tensor c, Tensor a, Tensor b, float alpha, float beta) { - size_t seed = hash_combine(c, b, a, alpha, beta); - - auto device = context::getDevice(); - auto &cache = caches.getCache(device); - - auto desc_opt = cache.get(seed); - infiniopGemmDescriptor_t desc = nullptr; - - if (!desc_opt) { - INFINICORE_CHECK_ERROR(infiniopCreateGemmDescriptor( - context::getInfiniopHandle(device), &desc, - c->desc(), a->desc(), b->desc())); - cache.put(seed, desc); - } else { - desc = *desc_opt; - } - - size_t workspace_size = 0; - INFINICORE_CHECK_ERROR(infiniopGetGemmWorkspaceSize(desc, &workspace_size)); - std::shared_ptr workspace = context::allocateMemory(workspace_size); +INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Gemm, 100); + +struct PlannedMeta { + std::shared_ptr descriptor; + graph::GraphTensor workspace, c, a, b; + float alpha, beta; +}; + +void *plan(Tensor c, Tensor a, Tensor b, float alpha, float beta) { + size_t seed = hash_combine(c, a, b); + + INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE( + Descriptor, descriptor, Gemm, + seed, c->desc(), a->desc(), b->desc()); + + INFINIOP_WORKSPACE_TENSOR(workspace, Gemm, descriptor); + + auto planned = new PlannedMeta{ + descriptor, + graph::GraphTensor(workspace), + graph::GraphTensor(c), + graph::GraphTensor(a), + graph::GraphTensor(b), + alpha, beta}; + + return planned; +} + +void run(void *planned_meta) { + auto planned = reinterpret_cast(planned_meta); INFINICORE_CHECK_ERROR(infiniopGemm( - desc, workspace->data(), workspace_size, - c->data(), a->data(), b->data(), alpha, beta, context::getStream())); + planned->descriptor->desc, planned->workspace->data(), 
planned->workspace->numel(), + planned->c->data(), planned->a->data(), planned->b->data(), planned->alpha, planned->beta, context::getStream())); +} + +void cleanup(void **planned_meta_ptr) { + delete *reinterpret_cast(planned_meta_ptr); + *planned_meta_ptr = nullptr; } -static bool registered = []() { - Gemm::dispatcher().registerAll(&calculate, false); - return true; -}(); +INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Gemm, &plan, &run, &cleanup); } // namespace infinicore::op::gemm_impl::infiniop diff --git a/src/infinicore/ops/infiniop_impl.hpp b/src/infinicore/ops/infiniop_impl.hpp new file mode 100644 index 000000000..2bf38c8c6 --- /dev/null +++ b/src/infinicore/ops/infiniop_impl.hpp @@ -0,0 +1,50 @@ +#pragma once + +#include "../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \ + struct __DESC_TYPE__ { \ + infiniop##__OP_NAME__##Descriptor_t desc; \ + Descriptor(infiniop##__OP_NAME__##Descriptor_t desc) : desc(desc) {} \ + ~Descriptor() { \ + if (desc != nullptr) { \ + infiniopDestroy##__OP_NAME__##Descriptor(desc); \ + desc = nullptr; \ + } \ + } \ + }; \ + \ + thread_local common::OpCache> \ + caches( \ + __SIZE__, \ + [](std::shared_ptr<__DESC_TYPE__> &desc) { \ + desc = nullptr; \ + }); + +#define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \ + std::shared_ptr<__DESC_TYPE__> __DESC_NAME__; \ + { \ + auto device__ = context::getDevice(); \ + auto &cache__ = caches.getCache(device__); \ + __DESC_NAME__ = cache__.get(__HASH_KEY__).value_or(nullptr); \ + if (!__DESC_NAME__) { \ + __DESC_NAME__ = std::make_shared<__DESC_TYPE__>(nullptr); \ + INFINICORE_CHECK_ERROR(infiniopCreate##__INFINIOP_NAME__##Descriptor( \ + context::getInfiniopHandle(device__), \ + &__DESC_NAME__->desc, \ + __VA_ARGS__)); \ + cache__.put(__HASH_KEY__, __DESC_NAME__); \ + } \ + } + +#define INFINIOP_WORKSPACE_TENSOR(__TENSOR_NAME__, __INFINIOP_NAME__, __DESC_NAME__) \ + Tensor __TENSOR_NAME__; \ + { \ + auto device__ = context::getDevice(); \ + size_t workspace_size = 0; \ + INFINICORE_CHECK_ERROR(infiniopGet##__INFINIOP_NAME__##WorkspaceSize(__DESC_NAME__->desc, &workspace_size)); \ + __TENSOR_NAME__ = Tensor::empty({workspace_size}, DataType::U8, device__); \ + } diff --git a/src/infinicore/ops/linear/linear.cc b/src/infinicore/ops/linear/linear.cc index cd766195e..097e4b48b 100644 --- a/src/infinicore/ops/linear/linear.cc +++ b/src/infinicore/ops/linear/linear.cc @@ -1,6 +1,6 @@ #include "infinicore/ops/linear.hpp" -#include "infinicore/ops/add.hpp" -#include "infinicore/ops/matmul.hpp" +#include "infinicore/ops/gemm.hpp" +#include "infinicore/ops/rearrange.hpp" namespace infinicore::op { @@ -42,16 +42,18 @@ void linear_(Tensor out, // linear transformation Tensor out_view = out->view({N, out_features}); - matmul_(out_view, - input->view({N, in_features}), - weight->permute({1, 0})); - // Add bias + float alpha = 1.0f; + float beta = 0.0f; if (bias.has_value()) { - add_(out_view, - out_view, - bias.value()->as_strided({N, out_features}, {0, 1})); + rearrange_(out_view, + bias.value()->as_strided({N, out_features}, {0, 1})); + beta = 1.0f; } + + gemm_(out_view, + input->view({N, in_features}), + weight->permute({1, 0}), alpha, beta); } } // namespace infinicore::op diff --git a/src/infinicore/ops/paged_caching/paged_caching.cc b/src/infinicore/ops/paged_caching/paged_caching.cc index b1fe104a9..cc14bf236 100644 --- 
a/src/infinicore/ops/paged_caching/paged_caching.cc +++ b/src/infinicore/ops/paged_caching/paged_caching.cc @@ -9,14 +9,14 @@ common::OpDispatcher &PagedCaching::dispatcher() { return dispatcher_; }; -void PagedCaching::execute(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping) { - INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k, v, k_cache, v_cache, slot_mapping); - infinicore::context::setDevice(k->device()); - dispatcher().lookup(k->device().getType())(k, v, k_cache, v_cache, slot_mapping); +void PagedCaching::execute(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) { + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, slot_mapping); + infinicore::context::setDevice(k_cache->device()); + dispatcher().lookup(k_cache->device().getType())(k_cache, v_cache, k, v, slot_mapping); } -void paged_caching_(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping) { - PagedCaching::execute(k, v, k_cache, v_cache, slot_mapping); +void paged_caching_(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) { + PagedCaching::execute(k_cache, v_cache, k, v, slot_mapping); } } // namespace infinicore::op diff --git a/src/infinicore/ops/paged_caching/paged_caching_infiniop.cc b/src/infinicore/ops/paged_caching/paged_caching_infiniop.cc index f312ef817..7dcaf47a0 100644 --- a/src/infinicore/ops/paged_caching/paged_caching_infiniop.cc +++ b/src/infinicore/ops/paged_caching/paged_caching_infiniop.cc @@ -15,8 +15,8 @@ thread_local common::OpCache caches( } }); -void calculate(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_mapping) { - size_t seed = hash_combine(k, v, k_cache, v_cache, slot_mapping); +void calculate(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) { + size_t seed = hash_combine(k_cache, v_cache, k, v, slot_mapping); auto device = context::getDevice(); auto &cache = caches.getCache(device); @@ -27,7 +27,7 @@ void calculate(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_m if (!desc_opt) { INFINICORE_CHECK_ERROR(infiniopCreatePagedCachingDescriptor( context::getInfiniopHandle(device), &desc, - k->desc(), v->desc(), k_cache->desc(), v_cache->desc(), slot_mapping->desc())); + k_cache->desc(), v_cache->desc(), k->desc(), v->desc(), slot_mapping->desc())); cache.put(seed, desc); } else { desc = *desc_opt; @@ -39,7 +39,7 @@ void calculate(Tensor k, Tensor v, Tensor k_cache, Tensor v_cache, Tensor slot_m INFINICORE_CHECK_ERROR(infiniopPagedCaching( desc, workspace->data(), workspace_size, - k->data(), v->data(), k_cache->data(), v_cache->data(), slot_mapping->data(), context::getStream())); + k_cache->data(), v_cache->data(), k->data(), v->data(), slot_mapping->data(), context::getStream())); } static bool registered = []() { diff --git a/src/infinicore/pybind11/context.hpp b/src/infinicore/pybind11/context.hpp index 657e30877..9c24a322e 100644 --- a/src/infinicore/pybind11/context.hpp +++ b/src/infinicore/pybind11/context.hpp @@ -24,6 +24,11 @@ inline void bind(py::module &m) { // Synchronization m.def("sync_stream", &syncStream, "Synchronize the current stream"); m.def("sync_device", &syncDevice, "Synchronize the current device"); + + // Graph + m.def("is_graph_recording", &isGraphRecording, "Check if graph recording is turned on"); + m.def("start_graph_recording", &startGraphRecording, "Start graph recording"); + m.def("stop_graph_recording", &stopGraphRecording, "Stop graph recording and return the graph"); } } // namespace infinicore::context diff --git 
a/src/infinicore/pybind11/graph.hpp b/src/infinicore/pybind11/graph.hpp new file mode 100644 index 000000000..d45c9b32c --- /dev/null +++ b/src/infinicore/pybind11/graph.hpp @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +#include "infinicore.hpp" + +namespace py = pybind11; + +namespace infinicore::graph { +inline void bind(py::module_ &m) { + py::class_>(m, "Graph") + .def(py::init<>()) // allow construction + .def("run", &infinicore::graph::Graph::run); +} +} // namespace infinicore::graph diff --git a/src/infinicore/pybind11/infinicore.cc b/src/infinicore/pybind11/infinicore.cc index 152adefd9..937a281aa 100644 --- a/src/infinicore/pybind11/infinicore.cc +++ b/src/infinicore/pybind11/infinicore.cc @@ -6,6 +6,7 @@ #include "device.hpp" #include "device_event.hpp" #include "dtype.hpp" +#include "graph.hpp" #include "ops.hpp" #include "tensor.hpp" @@ -18,6 +19,7 @@ PYBIND11_MODULE(_infinicore, m) { dtype::bind(m); ops::bind(m); tensor::bind(m); + graph::bind(m); } } // namespace infinicore diff --git a/src/infinicore/pybind11/ops/paged_caching.hpp b/src/infinicore/pybind11/ops/paged_caching.hpp index 87c130f5d..4320b4eef 100644 --- a/src/infinicore/pybind11/ops/paged_caching.hpp +++ b/src/infinicore/pybind11/ops/paged_caching.hpp @@ -11,10 +11,10 @@ namespace infinicore::ops { inline void bind_paged_caching(py::module &m) { m.def("paged_caching_", &op::paged_caching_, - py::arg("k"), - py::arg("v"), py::arg("k_cache"), py::arg("v_cache"), + py::arg("k"), + py::arg("v"), py::arg("slot_mapping"), R"doc(Paged caching of key and value tensors.)doc"); } diff --git a/src/infinicore/tensor/tensor.cc b/src/infinicore/tensor/tensor.cc index 7c2ab5428..2acc6dec8 100644 --- a/src/infinicore/tensor/tensor.cc +++ b/src/infinicore/tensor/tensor.cc @@ -275,4 +275,12 @@ std::shared_ptr TensorImpl::strided_from_blob( return t; } +Tensor TensorImpl::to_blob() const { + auto t = std::shared_ptr(new TensorImpl(shape(), strides(), dtype())); + t->data_.offset = this->data_.offset; + t->data_.memory = std::make_shared(this->data_.memory->data(), this->data_.memory->size(), this->data_.memory->device(), nullptr); + + return Tensor{t}; +} + } // namespace infinicore diff --git a/src/infinicore/utils.hpp b/src/infinicore/utils.hpp index 11b70455f..2a5d3e20d 100644 --- a/src/infinicore/utils.hpp +++ b/src/infinicore/utils.hpp @@ -47,3 +47,14 @@ inline struct SpdlogInitializer { } \ } \ } while (0) + +#define INFINICORE_ASSERT(CONDITION__) \ + do { \ + if (!(CONDITION__)) { \ + SPDLOG_ERROR( \ + "Assertion `{}` failed from {} at {}:{}", \ + #CONDITION__, __func__, __FILE__, __LINE__); \ + throw std::runtime_error( \ + std::string("Assertion `") + #CONDITION__ + "` failed from " + __func__ + " at " + __FILE__ + ":" + std::to_string(__LINE__)); \ + } \ + } while (0) diff --git a/src/infiniop/ops/paged_attention/info.h b/src/infiniop/ops/paged_attention/info.h index 17b8bf65b..216bb2360 100644 --- a/src/infiniop/ops/paged_attention/info.h +++ b/src/infiniop/ops/paged_attention/info.h @@ -67,11 +67,9 @@ class PagedAttentionInfo { size_t num_heads = q_shape[1]; size_t head_size = q_shape[2]; - if (head_size != 128) { - // 输出具体的错误原因和当前的参数值 - std::cerr << "[Error] Now only supports head_size = 128, but got " + if (head_size != 16 && head_size != 32 && head_size != 64 && head_size != 128 && head_size != 256) { + std::cerr << "[Error] Now only supports head_size = 16/32/64/128/256, but got " << head_size << "." 
<< std::endl; - // 建议返回 SHAPE 相关的错误码 return INFINI_STATUS_BAD_TENSOR_SHAPE; } diff --git a/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu b/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu index 9f86133e2..d544fd34a 100644 --- a/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu +++ b/src/infiniop/ops/paged_attention/nvidia/paged_attention_nvidia.cu @@ -98,37 +98,49 @@ infiniStatus_t Descriptor::calculate( const void *block_tables, const void *seq_lens, const void *alibi_slopes, void *stream_) const { cudaStream_t stream = (cudaStream_t)stream_; + +#define LAUNCH_HEADSIZE_BLOCKSIZE(__H_SIZE, __B_SIZE) \ + launchKernel<__H_SIZE, __B_SIZE>( \ + out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, \ + _info.num_heads, _info.num_seqs, \ + _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, \ + _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, \ + stream); + +#define SWITCH_HEAD_SIZE(__B_SIZE) \ + switch (_info.head_size) { \ + case 16: \ + LAUNCH_HEADSIZE_BLOCKSIZE(16, __B_SIZE) \ + break; \ + case 32: \ + LAUNCH_HEADSIZE_BLOCKSIZE(32, __B_SIZE) \ + break; \ + case 64: \ + LAUNCH_HEADSIZE_BLOCKSIZE(64, __B_SIZE) \ + break; \ + case 128: \ + LAUNCH_HEADSIZE_BLOCKSIZE(128, __B_SIZE) \ + break; \ + case 256: \ + LAUNCH_HEADSIZE_BLOCKSIZE(256, __B_SIZE) \ + break; \ + default: \ + return INFINI_STATUS_BAD_TENSOR_SHAPE; \ + } + if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) { - if (_info.head_size == 128) { - launchKernel<128, CUDA_BLOCK_SIZE_1024>( - out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, - _info.num_heads, _info.num_seqs, - _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, - _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, - stream); - } + SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_1024) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) { - if (_info.head_size == 128) { - launchKernel<128, CUDA_BLOCK_SIZE_512>( - out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, - _info.num_heads, _info.num_seqs, - _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, - _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, - stream); - } + SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_512) } else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) { - if (_info.head_size == 128) { - launchKernel<128, CUDA_BLOCK_SIZE_4096>( - out, q, k_cache, v_cache, _info.dtype, block_tables, seq_lens, alibi_slopes, - _info.num_heads, _info.num_seqs, - _info.num_kv_heads, _info.scale, _info.max_num_blocks_per_seq, _info.block_size, - _info.q_stride, _info.kv_block_stride, _info.kv_head_stride, _info.o_stride, - stream); - } + SWITCH_HEAD_SIZE(CUDA_BLOCK_SIZE_4096) } else { return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED; } +#undef LAUNCH_HEADSIZE_BLOCKSIZE +#undef SWITCH_HEAD_SIZE + return INFINI_STATUS_SUCCESS; } diff --git a/src/infiniop/ops/paged_attention/operator.cc b/src/infiniop/ops/paged_attention/operator.cc index f41adb2cb..1d7d4fee3 100644 --- a/src/infiniop/ops/paged_attention/operator.cc +++ b/src/infiniop/ops/paged_attention/operator.cc @@ -5,9 +5,9 @@ #ifdef ENABLE_NVIDIA_API #include "nvidia/paged_attention_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/paged_attention_metax.h" -#endif +// #ifdef ENABLE_METAX_API +// #include 
"metax/paged_attention_metax.h" +// #endif __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( infiniopHandle_t handle, @@ -34,11 +34,12 @@ __C infiniStatus_t infiniopCreatePagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // CREATE(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopGetPagedAttentionWorkspaceSize( @@ -54,11 +55,12 @@ __C infiniStatus_t infiniopGetPagedAttentionWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // GET(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopPagedAttention( @@ -78,11 +80,12 @@ __C infiniStatus_t infiniopPagedAttention( #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // CALCULATE(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopDestroyPagedAttentionDescriptor( @@ -97,9 +100,10 @@ __C infiniStatus_t infiniopDestroyPagedAttentionDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // DESTROY(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/paged_caching/info.h b/src/infiniop/ops/paged_caching/info.h index 5b8a9234d..87f6b5181 100644 --- a/src/infiniop/ops/paged_caching/info.h +++ b/src/infiniop/ops/paged_caching/info.h @@ -28,10 +28,10 @@ class PagedCachingInfo { ptrdiff_t v_cache_block_stride; static utils::Result create( - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t slot_mapping_desc) { auto dtype = k_desc->dtype(); diff --git a/src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu b/src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu index b0344999e..9b87b3f7f 100644 --- a/src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu +++ b/src/infiniop/ops/paged_caching/nvidia/paged_caching_nvidia.cu @@ -31,13 +31,13 @@ Descriptor::~Descriptor() { infiniStatus_t Descriptor::create( infiniopHandle_t handle, Descriptor **desc_ptr, - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t slot_mapping_desc) { - auto info = PagedCachingInfo::create(k_desc, v_desc, k_cache_desc, v_cache_desc, slot_mapping_desc); + auto info = PagedCachingInfo::create(k_cache_desc, v_cache_desc, k_desc, v_desc, slot_mapping_desc); CHECK_RESULT(info); // Create and return the Descriptor instance. 
@@ -121,8 +121,8 @@ infiniStatus_t launchKernel(const PagedCachingInfo &info, // Execution method implementation infiniStatus_t Descriptor::calculate( void *workspace, size_t workspace_size, - const void *k, const void *v, void *k_cache, void *v_cache, + const void *k, const void *v, const void *slot_mapping, void *stream_) const { diff --git a/src/infiniop/ops/paged_caching/operator.cc b/src/infiniop/ops/paged_caching/operator.cc index a69b0e07e..3bfd92280 100644 --- a/src/infiniop/ops/paged_caching/operator.cc +++ b/src/infiniop/ops/paged_caching/operator.cc @@ -5,17 +5,17 @@ #ifdef ENABLE_NVIDIA_API #include "nvidia/paged_caching_nvidia.cuh" #endif -#ifdef ENABLE_METAX_API -#include "metax/paged_caching_metax.h" -#endif +// #ifdef ENABLE_METAX_API +// #include "metax/paged_caching_metax.h" +// #endif __C infiniStatus_t infiniopCreatePagedCachingDescriptor( infiniopHandle_t handle, infiniopPagedCachingDescriptor_t *desc_ptr, - infiniopTensorDescriptor_t k_desc, - infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t k_cache_desc, infiniopTensorDescriptor_t v_cache_desc, + infiniopTensorDescriptor_t k_desc, + infiniopTensorDescriptor_t v_desc, infiniopTensorDescriptor_t slot_mapping_desc) { #define CREATE(CASE, NAMESPACE) \ @@ -23,17 +23,18 @@ __C infiniStatus_t infiniopCreatePagedCachingDescriptor( return op::paged_caching::NAMESPACE::Descriptor::create( \ handle, \ reinterpret_cast(desc_ptr), \ - k_desc, v_desc, k_cache_desc, v_cache_desc, slot_mapping_desc); + k_cache_desc, v_cache_desc, k_desc, v_desc, slot_mapping_desc); switch (handle->device) { #ifdef ENABLE_NVIDIA_API CREATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CREATE(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // CREATE(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopGetPagedCachingWorkspaceSize( @@ -49,35 +50,37 @@ __C infiniStatus_t infiniopGetPagedCachingWorkspaceSize( #ifdef ENABLE_NVIDIA_API GET(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - GET(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // GET(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopPagedCaching( infiniopPagedCachingDescriptor_t desc, void *workspace, size_t workspace_size, - const void *k, const void *v, void *k_cache, void *v_cache, + const void *k, const void *v, const void *slot_mapping, void *stream) { #define CALCULATE(CASE, NAMESPACE) \ case CASE: \ return reinterpret_cast(desc)->calculate( \ - workspace, workspace_size, k, v, k_cache, v_cache, slot_mapping, stream); + workspace, workspace_size, k_cache, v_cache, k, v, slot_mapping, stream); switch (desc->device_type) { #ifdef ENABLE_NVIDIA_API CALCULATE(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - CALCULATE(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef ENABLE_METAX_API + // CALCULATE(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } __C infiniStatus_t infiniopDestroyPagedCachingDescriptor( @@ -92,9 +95,10 @@ __C infiniStatus_t infiniopDestroyPagedCachingDescriptor( #ifdef ENABLE_NVIDIA_API DESTROY(INFINI_DEVICE_NVIDIA, nvidia) #endif -#ifdef ENABLE_METAX_API - DESTROY(INFINI_DEVICE_METAX, metax) -#endif + // #ifdef 
ENABLE_METAX_API + // DESTROY(INFINI_DEVICE_METAX, metax) + // #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } - return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; } diff --git a/src/infiniop/ops/paged_caching/paged_caching.h b/src/infiniop/ops/paged_caching/paged_caching.h index 473ddb5fd..cbd7a2701 100644 --- a/src/infiniop/ops/paged_caching/paged_caching.h +++ b/src/infiniop/ops/paged_caching/paged_caching.h @@ -32,16 +32,16 @@ static infiniStatus_t create( \ infiniopHandle_t handle, \ Descriptor **desc_ptr, \ - infiniopTensorDescriptor_t k_desc, \ - infiniopTensorDescriptor_t v_desc, \ infiniopTensorDescriptor_t k_cache_desc, \ infiniopTensorDescriptor_t v_cache_desc, \ + infiniopTensorDescriptor_t k_desc, \ + infiniopTensorDescriptor_t v_desc, \ infiniopTensorDescriptor_t slot_mapping_desc); \ \ infiniStatus_t calculate( \ void *workspace, size_t workspace_size, \ - const void *k, const void *v, \ void *k_cache, void *v_cache, \ + const void *k, const void *v, \ const void *slot_mapping, \ void *stream) const; \ }; \ diff --git a/test/infinicore/framework/tensor.py b/test/infinicore/framework/tensor.py index 10a4655ab..317135d65 100644 --- a/test/infinicore/framework/tensor.py +++ b/test/infinicore/framework/tensor.py @@ -60,7 +60,12 @@ def _create_contiguous_tensor(shape, torch_dtype, torch_device_str, mode, **kwar # Handle real floating-point types if mode == TensorInitializer.RANDOM: - return torch.rand(shape, dtype=torch_dtype, device=torch_device_str) + scale = kwargs.get("scale", 1.0) + bias = kwargs.get("bias", 0.0) + return ( + torch.rand(shape, dtype=torch_dtype, device=torch_device_str) * scale + + bias + ) elif mode == TensorInitializer.ZEROS: return torch.zeros(shape, dtype=torch_dtype, device=torch_device_str) elif mode == TensorInitializer.ONES: diff --git a/test/infinicore/graph/graph.py b/test/infinicore/graph/graph.py new file mode 100644 index 000000000..2f8927110 --- /dev/null +++ b/test/infinicore/graph/graph.py @@ -0,0 +1,85 @@ +import sys +import os + +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +import torch +import infinicore +from framework import BaseOperatorTest, TensorSpec, TestCase, GenericTestRunner + +# Test cases format: (in_shape, proj_w_shape) +_TEST_CASES_DATA = [ + ((32, 4096), (4096, 4096)), +] + +_TOLERANCE_MAP = { + infinicore.float16: {"atol": 0, "rtol": 1e-2}, + infinicore.float32: {"atol": 1e-4, "rtol": 1e-3}, + infinicore.bfloat16: {"atol": 0, "rtol": 5e-2}, +} +_TENSOR_DTYPES = [infinicore.float16, infinicore.float32, infinicore.bfloat16] + + +def parse_test_cases(): + cases = [] + for in_shape, proj_w_shape in _TEST_CASES_DATA: + for dtype in _TENSOR_DTYPES: + tol = _TOLERANCE_MAP[dtype] + in_spec = TensorSpec.from_tensor(in_shape, dtype=dtype) + proj_w_spec = TensorSpec.from_tensor(proj_w_shape, dtype=dtype) + temp_spec = TensorSpec.from_tensor(in_shape, dtype=dtype) + + # Out-of-place + cases.append( + TestCase( + inputs=[in_spec, proj_w_spec, temp_spec], + kwargs={}, + output_spec=None, + comparison_target=None, + tolerance=tol, + description="Graph", + ) + ) + + return cases + + +class OpTest(BaseOperatorTest): + """Test Operator Graph""" + + def __init__(self): + super().__init__("Graph") + + def get_test_cases(self): + return parse_test_cases() + + def torch_operator(self, *args, **kwargs): + a = args[0] + b = args[1] + + return torch.matmul(a, b) + + def infinicore_operator(self, *args, **kwargs): + """Record graph and run""" + a = args[0] + b = args[1] + temp_a = args[2] + + 
infinicore.start_graph_recording() + c = infinicore.matmul(temp_a, b) + op_graph = infinicore.stop_graph_recording() + + temp_a.copy_(a) + op_graph.run() + + return c + + +def main(): + """Main entry point""" + runner = GenericTestRunner(OpTest) + runner.run_and_exit() + + +if __name__ == "__main__": + main() diff --git a/test/infinicore/ops/paged_attention.py b/test/infinicore/ops/paged_attention.py index cc9f2977f..06515633d 100644 --- a/test/infinicore/ops/paged_attention.py +++ b/test/infinicore/ops/paged_attention.py @@ -25,6 +25,7 @@ (4, 40, 40, 128, 16, 1024, False), (6, 40, 40, 128, 16, 1024, False), (3, 8, 8, 128, 16, 1024, False), + (3, 8, 8, 64, 16, 1024, False), (8, 64, 8, 128, 16, 2048, False), ] @@ -68,8 +69,6 @@ def parse_test_cases(): 0, num_seqs * max_blocks_per_seq, dtype=torch.int64 ).view(num_seqs, max_blocks_per_seq) - print("block_tables.shape", block_tables.shape, block_tables) - q_shape = (num_seqs, num_heads, head_size) k_cache_shape = (num_blocks, num_kv_heads, block_size, head_size) v_cache_shape = (num_blocks, num_kv_heads, block_size, head_size) diff --git a/test/infinicore/ops/paged_caching.py b/test/infinicore/ops/paged_caching.py index 4098aa06c..65e59eaae 100644 --- a/test/infinicore/ops/paged_caching.py +++ b/test/infinicore/ops/paged_caching.py @@ -28,9 +28,9 @@ # Tolerance configuration _TOLERANCE_MAP = { - infinicore.float16: {"atol": 0, "rtol": 1e-2}, - infinicore.float32: {"atol": 1e-4, "rtol": 1e-3}, - infinicore.bfloat16: {"atol": 0, "rtol": 5e-2}, + infinicore.float16: {"atol": 0, "rtol": 1e-5}, + infinicore.float32: {"atol": 0, "rtol": 1e-5}, + infinicore.bfloat16: {"atol": 0, "rtol": 1e-5}, } # Data types to test @@ -40,15 +40,15 @@ # ============================================================================== # Reference Implementation # ============================================================================== -def ref_paged_caching(key, value, key_cache_pool, value_cache_pool, slot_mapping): +def ref_paged_caching(key_cache_pool, value_cache_pool, key, value, slot_mapping): """ Reference implementation for paged_caching operator. Args: - key (torch.Tensor): Keys, shape [ntok, nkvh, dh] - value (torch.Tensor): Values, shape [ntok, nkvh, dh] key_cache_pool (torch.Tensor): K cache pool, shape [num_blocks, nkvh, block_size, dh] value_cache_pool (torch.Tensor): V cache pool, shape [num_blocks, nkvh, block_size, dh] + key (torch.Tensor): Keys, shape [ntok, nkvh, dh] + value (torch.Tensor): Values, shape [ntok, nkvh, dh] slot_mapping (torch.Tensor): Slot mapping, shape [ntok] """ ntok = key.shape[0] @@ -56,8 +56,8 @@ def ref_paged_caching(key, value, key_cache_pool, value_cache_pool, slot_mapping # This reference implementation operates on a cloned cache to avoid modifying the original input tensor, # mimicking the behavior where the custom operator writes to its output tensor. 
- k_cache_ref = key_cache_pool.clone() - v_cache_ref = value_cache_pool.clone() + k_cache_ref = key_cache_pool + v_cache_ref = value_cache_pool for i in range(ntok): slot = slot_mapping[i].item() @@ -98,9 +98,9 @@ def parse_test_cases(): current_slot += length.item() # Ensure we don't exceed the total number of slots in the cache - assert ( - current_slot <= num_blocks * block_size - ), "Not enough blocks in the cache pool for this test case" + assert current_slot <= num_blocks * block_size, ( + "Not enough blocks in the cache pool for this test case" + ) slot_mapping = torch.tensor(slot_mapping_list, dtype=torch.int64) @@ -119,8 +119,12 @@ def parse_test_cases(): # Create typed tensor specs k_spec = TensorSpec.from_tensor(k_shape, None, dtype) v_spec = TensorSpec.from_tensor(v_shape, None, dtype) - k_cache_spec = TensorSpec.from_tensor(k_cache_shape, None, dtype) - v_cache_spec = TensorSpec.from_tensor(v_cache_shape, None, dtype) + k_cache_spec = TensorSpec.from_tensor( + k_cache_shape, None, dtype, init_mode=TensorInitializer.ZEROS + ) + v_cache_spec = TensorSpec.from_tensor( + v_cache_shape, None, dtype, init_mode=TensorInitializer.ZEROS + ) slot_mapping_spec = TensorSpec.from_tensor( slot_mapping_shape, init_mode=TensorInitializer.MANUAL, @@ -132,10 +136,10 @@ def parse_test_cases(): test_cases.append( TestCase( inputs=[ - k_spec, - v_spec, k_cache_spec, v_cache_spec, + k_spec, + v_spec, slot_mapping_spec, ], kwargs=None, diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index 283bdb1cd..618be2b05 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -1066,10 +1066,10 @@ def paged_caching_(lib): lib.infiniopCreatePagedCachingDescriptor.argtypes = [ infiniopHandle_t, POINTER(infiniopOperatorDescriptor_t), - infiniopTensorDescriptor_t, # k_desc - infiniopTensorDescriptor_t, # v_desc infiniopTensorDescriptor_t, # k_cache_desc infiniopTensorDescriptor_t, # v_cache_desc + infiniopTensorDescriptor_t, # k_desc + infiniopTensorDescriptor_t, # v_desc infiniopTensorDescriptor_t, # slot_mapping_desc ] @@ -1086,10 +1086,10 @@ def paged_caching_(lib): infiniopOperatorDescriptor_t, c_void_p, # workspace c_size_t, # workspace_size - c_void_p, # k - c_void_p, # v c_void_p, # k_cache c_void_p, # v_cache + c_void_p, # k + c_void_p, # v c_void_p, # slot_mapping c_void_p, # stream ] diff --git a/test/infiniop/paged_attention.py b/test/infiniop/paged_attention.py index da44c8e62..882e9cfee 100644 --- a/test/infiniop/paged_attention.py +++ b/test/infiniop/paged_attention.py @@ -95,6 +95,7 @@ def ref_single_query_cached_kv_attention( (4, 40, 40, 128, 16, 1024, False), (6, 40, 40, 128, 16, 1024, False), (3, 8, 8, 128, 16, 1024, False), + (3, 8, 8, 64, 16, 1024, False), (8, 64, 8, 128, 16, 2048, False), ] diff --git a/test/infiniop/paged_caching.py b/test/infiniop/paged_caching.py index 947859ec9..012067465 100644 --- a/test/infiniop/paged_caching.py +++ b/test/infiniop/paged_caching.py @@ -22,15 +22,15 @@ # ============================================================================== # Reference Implementation # ============================================================================== -def ref_paged_caching(key, value, key_cache_pool, value_cache_pool, slot_mapping): +def ref_paged_caching(key_cache_pool, value_cache_pool, key, value, slot_mapping): """ Reference implementation for paged_caching operator. 
Args: - key (torch.Tensor): Keys, shape [ntok, nkvh, dh] - value (torch.Tensor): Values, shape [ntok, nkvh, dh] key_cache_pool (torch.Tensor): K cache pool, shape [num_blocks, nkvh, block_size, dh] value_cache_pool (torch.Tensor): V cache pool, shape [num_blocks, nkvh, block_size, dh] + key (torch.Tensor): Keys, shape [ntok, nkvh, dh] + value (torch.Tensor): Values, shape [ntok, nkvh, dh] slot_mapping (torch.Tensor): Slot mapping, shape [ntok] """ ntok = key.shape[0] @@ -71,9 +71,9 @@ def ref_paged_caching(key, value, key_cache_pool, value_cache_pool, slot_mapping # Tolerance map for different data types _TOLERANCE_MAP = { - InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2}, - InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2}, - InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.F16: {"atol": 0, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 0, "rtol": 1e-5}, + InfiniDtype.F32: {"atol": 0, "rtol": 1e-5}, } # Global flags for controlling test behavior @@ -123,9 +123,9 @@ def test( current_slot += length.item() # Ensure we don't exceed the total number of slots in the cache - assert ( - current_slot <= num_blocks * block_size - ), "Not enough blocks in the cache pool for this test case" + assert current_slot <= num_blocks * block_size, ( + "Not enough blocks in the cache pool for this test case" + ) slot_mapping_torch = torch.tensor(slot_mapping_list, dtype=torch.int64) @@ -144,10 +144,10 @@ def test( # Run reference implementation k_cache_ref, v_cache_ref = ref_paged_caching( - k.torch_tensor(), - v.torch_tensor(), k_cache_pool.torch_tensor(), v_cache_pool.torch_tensor(), + k.torch_tensor(), + v.torch_tensor(), slot_mapping.torch_tensor(), ) @@ -160,10 +160,10 @@ def test( LIBINFINIOP.infiniopCreatePagedCachingDescriptor( handle, ctypes.byref(descriptor), - k.descriptor, - v.descriptor, k_cache_pool.descriptor, v_cache_pool.descriptor, + k.descriptor, + v.descriptor, slot_mapping.descriptor, ) ) @@ -191,10 +191,10 @@ def lib_paged_caching(): descriptor, workspace.data(), workspace_size.value, - k.data(), - v.data(), k_cache_pool.data(), v_cache_pool.data(), + k.data(), + v.data(), slot_mapping.data(), None, ) diff --git a/test/infiniop/paged_caching_prefill.py b/test/infiniop/paged_caching_prefill.py index 1fa9957fc..f39cd2afc 100644 --- a/test/infiniop/paged_caching_prefill.py +++ b/test/infiniop/paged_caching_prefill.py @@ -80,7 +80,7 @@ def allocate_slots(self, request_id, num_new_tokens): return torch.tensor(slots, dtype=torch.int32) -def ref_paged_caching(k_new, v_new, k_pool, v_pool, slots, block_size): +def ref_paged_caching(k_pool, v_pool, k_new, v_new, slots, block_size): """Reference implementation for incremental caching.""" for i in range(k_new.shape[0]): slot = slots[i].item() @@ -152,10 +152,10 @@ def test( def torch_caching(): nonlocal k_pool_ref, v_pool_ref return ref_paged_caching( - k_in.torch_tensor(), - v_in.torch_tensor(), k_pool_ref, v_pool_ref, + k_in.torch_tensor(), + v_in.torch_tensor(), slots_torch, block_size, ) @@ -168,10 +168,10 @@ def torch_caching(): LIBINFINIOP.infiniopCreatePagedCachingDescriptor( handle, ctypes.byref(descriptor), - k_in.descriptor, - v_in.descriptor, k_cache_pool.descriptor, v_cache_pool.descriptor, + k_in.descriptor, + v_in.descriptor, slot_mapping.descriptor, ) ) @@ -190,10 +190,10 @@ def lib_caching(): descriptor, workspace.data(), workspace_size.value, - k_in.data(), - v_in.data(), k_cache_pool.data(), v_cache_pool.data(), + k_in.data(), + v_in.data(), slot_mapping.data(), None, ) diff --git a/xmake.lua b/xmake.lua index 
0a5b2d473..d5a4ba7f7 100644 --- a/xmake.lua +++ b/xmake.lua @@ -268,6 +268,9 @@ target("infinirt") add_deps("infinirt-hygon") end set_languages("cxx17") + if not is_plat("windows") then + add_cxflags("-fPIC") + end set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini")) add_files("src/infinirt/*.cc") add_installfiles("include/infinirt.h", {prefixdir = "include"}) @@ -390,6 +393,7 @@ target("infinicore_cpp_api") add_files("src/infinicore/context/*.cc") add_files("src/infinicore/context/*/*.cc") add_files("src/infinicore/tensor/*.cc") + add_files("src/infinicore/graph/*.cc") add_files("src/infinicore/nn/*.cc") add_files("src/infinicore/ops/*/*.cc") add_files("src/utils/*.cc") @@ -418,6 +422,8 @@ target("_infinicore") add_packages("pybind11") set_languages("cxx17") + add_deps("infinicore_cpp_api") + set_kind("shared") local INFINI_ROOT = os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini") add_includedirs(INFINI_ROOT.."/include", { public = true }) @@ -425,14 +431,7 @@ target("_infinicore") add_linkdirs(INFINI_ROOT.."/lib") add_links("infiniop", "infinirt", "infiniccl") - add_files("src/infinicore/*.cc") - add_files("src/infinicore/context/*.cc") - add_files("src/infinicore/context/*/*.cc") - add_files("src/infinicore/tensor/*.cc") - add_files("src/infinicore/nn/*.cc") - add_files("src/infinicore/ops/*/*.cc") add_files("src/infinicore/pybind11/**.cc") - add_files("src/utils/*.cc") set_installdir("python/infinicore") target_end()
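
---

Usage note (not part of the diff): a minimal sketch of the graph recording API this change adds, following the pattern in `test/infinicore/graph/graph.py`. The tensors `a`, `b`, and `new_input` are placeholders — how they are constructed is not shown here and is outside the scope of this diff; `infinicore.matmul`, `Tensor.copy_`, `start_graph_recording`, `stop_graph_recording`, `is_graph_recording`, and `Graph.run` are the calls exercised or exported by the diff itself.

```python
import infinicore

# `a` and `b` are assumed to be existing infinicore tensors on the current
# device (creation helpers omitted; see the test framework for one way to
# build them from torch tensors).

infinicore.start_graph_recording()          # subsequent graph-enabled ops are recorded
c = infinicore.matmul(a, b)                 # planned and recorded, not executed immediately
graph = infinicore.stop_graph_recording()   # returns an infinicore.Graph wrapper

assert not infinicore.is_graph_recording()

# Replay: the recorded operators hold references to the same (pinned) tensor
# storage, so refresh the inputs in place and then run the captured graph.
a.copy_(new_input)
graph.run()                                 # result lands in `c`
```

The replay relies on the pin-mode behavior added in `Runtime::startGraphRecording` / `stopGraphRecording`: tensors allocated while recording stay at stable addresses, so rewriting their contents with `copy_` and calling `graph.run()` re-executes the planned operators against the new data.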