diff --git a/include/infiniop.h b/include/infiniop.h
index 58833f5c7..c3120f62a 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -4,11 +4,18 @@
 #include "infiniop/handle.h"
 #include "infiniop/ops/add.h"
 #include "infiniop/ops/attention.h"
+#include "infiniop/ops/averagepool.h"
+#include "infiniop/ops/averagepool_backward.h"
 #include "infiniop/ops/causal_softmax.h"
 #include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
+#include "infiniop/ops/conv_backward.h"
+#include "infiniop/ops/cross_entropy_loss.h"
 #include "infiniop/ops/dequantize.h"
 #include "infiniop/ops/gemm.h"
+#include "infiniop/ops/interpolate_nearest.h"
+#include "infiniop/ops/maxpool.h"
+#include "infiniop/ops/maxpool_backward.h"
 #include "infiniop/ops/mul.h"
 #include "infiniop/ops/random_sample.h"
 #include "infiniop/ops/rearrange.h"
diff --git a/include/infiniop/ops/averagepool.h b/include/infiniop/ops/averagepool.h
new file mode 100644
index 000000000..87e857175
--- /dev/null
+++ b/include/infiniop/ops/averagepool.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_AVERAGEPOOL_H__
+#define __INFINIOP_AVERAGEPOOL_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopAvgPoolDescriptor_t;
+
+__C infiniStatus_t infiniopCreateAvgPoolDescriptor(infiniopHandle_t handle,
+                                                   infiniopAvgPoolDescriptor_t *desc_ptr,
+                                                   infiniopTensorDescriptor_t output_desc,
+                                                   infiniopTensorDescriptor_t input_desc,
+                                                   void *kernel_size,
+                                                   void *strides,
+                                                   void *pads,
+                                                   bool ceil_mode);
+
+__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize(infiniopAvgPoolDescriptor_t desc,
+                                                   size_t *size);
+
+__C infiniStatus_t infiniopAvgPool(infiniopAvgPoolDescriptor_t desc,
+                                   void *workspace,
+                                   size_t workspace_size,
+                                   void *output,
+                                   const void *input,
+                                   void *stream);
+
+__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc);
+
+#endif // __INFINIOP_AVERAGEPOOL_H__
diff --git a/include/infiniop/ops/averagepool_backward.h b/include/infiniop/ops/averagepool_backward.h
new file mode 100644
index 000000000..9229e9e1b
--- /dev/null
+++ b/include/infiniop/ops/averagepool_backward.h
@@ -0,0 +1,31 @@
+#ifndef __INFINIOP_AVERAGEPOOL_BACKWARD_H__
+#define __INFINIOP_AVERAGEPOOL_BACKWARD_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopAvgPoolBackwardDescriptor_t;
+
+__C infiniStatus_t infiniopCreateAvgPoolBackwardDescriptor(infiniopHandle_t handle,
+                                                           infiniopAvgPoolBackwardDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t grad_input_desc,
+                                                           infiniopTensorDescriptor_t grad_output_desc,
+                                                           infiniopTensorDescriptor_t input_desc,
+                                                           void *kernel_size,
+                                                           void *strides,
+                                                           void *pads,
+                                                           bool ceil_mode);
+
+__C infiniStatus_t infiniopGetAvgPoolBackwardWorkspaceSize(infiniopAvgPoolBackwardDescriptor_t desc,
+                                                           size_t *size);
+
+__C infiniStatus_t infiniopAvgPoolBackward(infiniopAvgPoolBackwardDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *grad_input,
+                                           const void *grad_output,
+                                           const void *input,
+                                           void *stream);
+
+__C infiniStatus_t infiniopDestroyAvgPoolBackwardDescriptor(infiniopAvgPoolBackwardDescriptor_t desc);
+
+#endif // __INFINIOP_AVERAGEPOOL_BACKWARD_H__
diff --git a/include/infiniop/ops/conv_backward.h b/include/infiniop/ops/conv_backward.h
new file mode 100644
index 000000000..a692ed0eb
--- /dev/null
+++ b/include/infiniop/ops/conv_backward.h
@@ -0,0 +1,34 @@
+#ifndef __INFINIOP_CONV_BACKWARD_API_H__
+#define __INFINIOP_CONV_BACKWARD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopConvBackwardDescriptor_t;
+
+__C infiniStatus_t infiniopCreateConvBackwardDescriptor(infiniopHandle_t handle,
+                                                        infiniopConvBackwardDescriptor_t *desc_ptr,
+                                                        infiniopTensorDescriptor_t grad_output_desc,
+                                                        infiniopTensorDescriptor_t input_desc,
+                                                        infiniopTensorDescriptor_t weight_desc,
+                                                        infiniopTensorDescriptor_t bias_desc,
+                                                        void *pads,
+                                                        void *strides,
+                                                        void *dilations,
+                                                        size_t n);
+
+__C infiniStatus_t infiniopGetConvBackwardWorkspaceSize(infiniopConvBackwardDescriptor_t desc, size_t *size);
+
+__C infiniStatus_t infiniopConvBackward(infiniopConvBackwardDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *grad_input,
+                                        void *grad_weight,
+                                        void *grad_bias,
+                                        const void *grad_output,
+                                        const void *input,
+                                        const void *weight,
+                                        void *stream);
+
+__C infiniStatus_t infiniopDestroyConvBackwardDescriptor(infiniopConvBackwardDescriptor_t desc);
+
+#endif
diff --git a/include/infiniop/ops/cross_entropy_loss.h b/include/infiniop/ops/cross_entropy_loss.h
new file mode 100644
index 000000000..8b59843c9
--- /dev/null
+++ b/include/infiniop/ops/cross_entropy_loss.h
@@ -0,0 +1,27 @@
+#ifndef __INFINIOP_CROSS_ENTROPY_LOSS_API_H__
+#define __INFINIOP_CROSS_ENTROPY_LOSS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopCrossEntropyLossDescriptor_t;
+
+__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor(infiniopHandle_t handle,
+                                                            infiniopCrossEntropyLossDescriptor_t *desc_ptr,
+                                                            infiniopTensorDescriptor_t loss_desc,
+                                                            infiniopTensorDescriptor_t logits_desc,
+                                                            infiniopTensorDescriptor_t target_desc);
+
+__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize(infiniopCrossEntropyLossDescriptor_t desc,
+                                                            size_t *size);
+
+__C infiniStatus_t infiniopCrossEntropyLoss(infiniopCrossEntropyLossDescriptor_t desc,
+                                            void *workspace,
+                                            size_t workspace_size,
+                                            void *loss,
+                                            const void *logits,
+                                            const void *target,
+                                            void *stream);
+
+__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor(infiniopCrossEntropyLossDescriptor_t desc);
+
+#endif // __INFINIOP_CROSS_ENTROPY_LOSS_API_H__
diff --git a/include/infiniop/ops/interpolate_nearest.h b/include/infiniop/ops/interpolate_nearest.h
new file mode 100644
index 000000000..7f970dc38
--- /dev/null
+++ b/include/infiniop/ops/interpolate_nearest.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_INTERPOLATE_NEAREST_H__
+#define __INFINIOP_INTERPOLATE_NEAREST_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopInterpolateNearestDescriptor_t;
+
+__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor(infiniopHandle_t handle,
+                                                              infiniopInterpolateNearestDescriptor_t *desc_ptr,
+                                                              infiniopTensorDescriptor_t output_desc,
+                                                              infiniopTensorDescriptor_t input_desc);
+
+__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize(infiniopInterpolateNearestDescriptor_t desc,
+                                                              size_t *size);
+
+__C infiniStatus_t infiniopInterpolateNearest(infiniopInterpolateNearestDescriptor_t desc,
+                                              void *workspace,
+                                              size_t workspace_size,
+                                              void *output,
+                                              const void *input,
+                                              void *stream);
+
+__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor(infiniopInterpolateNearestDescriptor_t desc);
+
+#endif // __INFINIOP_INTERPOLATE_NEAREST_H__
diff --git a/include/infiniop/ops/maxpool.h b/include/infiniop/ops/maxpool.h
new file mode 100644
index 000000000..e47a43aed
--- /dev/null
+++ b/include/infiniop/ops/maxpool.h
@@ -0,0 +1,29 @@
+#ifndef __INFINIOP_MAX_POOL_H__
+#define __INFINIOP_MAX_POOL_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopMaxPoolDescriptor_t;
+
+__C infiniStatus_t infiniopCreateMaxPoolDescriptor(infiniopHandle_t handle,
+                                                   infiniopMaxPoolDescriptor_t *desc_ptr,
+                                                   infiniopTensorDescriptor_t output_desc,
+                                                   infiniopTensorDescriptor_t input_desc,
+                                                   void *kernel_size,
+                                                   void *strides,
+                                                   void *pads,
+                                                   bool ceil_mode);
+
+__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize(infiniopMaxPoolDescriptor_t desc,
+                                                   size_t *size);
+
+__C infiniStatus_t infiniopMaxPool(infiniopMaxPoolDescriptor_t desc,
+                                   void *workspace,
+                                   size_t workspace_size,
+                                   void *output,
+                                   const void *input,
+                                   void *stream);
+
+__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc);
+
+#endif // __INFINIOP_MAX_POOL_H__
diff --git a/include/infiniop/ops/maxpool_backward.h b/include/infiniop/ops/maxpool_backward.h
new file mode 100644
index 000000000..361c04895
--- /dev/null
+++ b/include/infiniop/ops/maxpool_backward.h
@@ -0,0 +1,31 @@
+#ifndef __INFINIOP_MAXPOOL_BACKWARD_H__
+#define __INFINIOP_MAXPOOL_BACKWARD_H__
+
+#include "../operator_descriptor.h"
+
+__C typedef struct InfiniopDescriptor *infiniopMaxPoolBackwardDescriptor_t;
+
+__C infiniStatus_t infiniopCreateMaxPoolBackwardDescriptor(infiniopHandle_t handle,
+                                                           infiniopMaxPoolBackwardDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t grad_input_desc,
+                                                           infiniopTensorDescriptor_t grad_output_desc,
+                                                           infiniopTensorDescriptor_t input_desc,
+                                                           void *kernel_size,
+                                                           void *strides,
+                                                           void *pads,
+                                                           bool ceil_mode);
+
+__C infiniStatus_t infiniopGetMaxPoolBackwardWorkspaceSize(infiniopMaxPoolBackwardDescriptor_t desc,
+                                                           size_t *size);
+
+__C infiniStatus_t infiniopMaxPoolBackward(infiniopMaxPoolBackwardDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *grad_input,
+                                           const void *grad_output,
+                                           const void *input,
+                                           void *stream);
+
+__C infiniStatus_t infiniopDestroyMaxPoolBackwardDescriptor(infiniopMaxPoolBackwardDescriptor_t desc);
+
+#endif // __INFINIOP_MAXPOOL_BACKWARD_H__
diff --git a/scripts/python_test.py b/scripts/python_test.py
index 5348c8c69..710156838 100644
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -25,6 +25,13 @@ def run_tests(args):
         "sub.py",
         "swiglu.py",
         "softplus.py",
+        "averagepool_backward.py",
+        "averagepool.py",
+        "maxpool_backward.py",
+        "maxpool.py",
+        "interpolate_nearest.py",
+        "conv_backward.py",
+        "cross_entropy_loss.py",
     ]:
         result = subprocess.run(
             f"python {test} {args} --debug", text=True, encoding="utf-8", shell=True
diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
index 3820f7cfd..cb417761f 100644
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -16,6 +16,13 @@ DECLARE_INFINIOP_TEST(add)
 DECLARE_INFINIOP_TEST(causal_softmax)
 DECLARE_INFINIOP_TEST(rearrange)
 DECLARE_INFINIOP_TEST(sub)
+DECLARE_INFINIOP_TEST(cross_entropy_loss)
+DECLARE_INFINIOP_TEST(averagepool)
+DECLARE_INFINIOP_TEST(averagepool_backward)
+DECLARE_INFINIOP_TEST(interpolate_nearest)
+DECLARE_INFINIOP_TEST(conv_backward)
+DECLARE_INFINIOP_TEST(maxpool)
+DECLARE_INFINIOP_TEST(maxpool_backward)

 #define REGISTER_INFINIOP_TEST(name) \
     {                                \
@@ -30,19 +37,26 @@ DECLARE_INFINIOP_TEST(sub)
 /*
  * Register all the tests here
  */
-#define TEST_BUILDER_MAPPINGS                     \
-    {                                             \
-        REGISTER_INFINIOP_TEST(gemm)              \
-        REGISTER_INFINIOP_TEST(random_sample)     \
-        REGISTER_INFINIOP_TEST(add)               \
-        REGISTER_INFINIOP_TEST(mul)               \
-        REGISTER_INFINIOP_TEST(clip)              \
-        REGISTER_INFINIOP_TEST(swiglu)            \
-        REGISTER_INFINIOP_TEST(rope)              \
-        REGISTER_INFINIOP_TEST(rms_norm)          \
-
REGISTER_INFINIOP_TEST(causal_softmax) \ - REGISTER_INFINIOP_TEST(rearrange) \ - REGISTER_INFINIOP_TEST(sub) \ +#define TEST_BUILDER_MAPPINGS \ + { \ + REGISTER_INFINIOP_TEST(gemm) \ + REGISTER_INFINIOP_TEST(random_sample) \ + REGISTER_INFINIOP_TEST(add) \ + REGISTER_INFINIOP_TEST(mul) \ + REGISTER_INFINIOP_TEST(clip) \ + REGISTER_INFINIOP_TEST(swiglu) \ + REGISTER_INFINIOP_TEST(rope) \ + REGISTER_INFINIOP_TEST(rms_norm) \ + REGISTER_INFINIOP_TEST(causal_softmax) \ + REGISTER_INFINIOP_TEST(rearrange) \ + REGISTER_INFINIOP_TEST(sub) \ + REGISTER_INFINIOP_TEST(cross_entropy_loss) \ + REGISTER_INFINIOP_TEST(averagepool) \ + REGISTER_INFINIOP_TEST(averagepool_backward) \ + REGISTER_INFINIOP_TEST(interpolate_nearest) \ + REGISTER_INFINIOP_TEST(conv_backward) \ + REGISTER_INFINIOP_TEST(maxpool) \ + REGISTER_INFINIOP_TEST(maxpool_backward) \ } namespace infiniop_test { diff --git a/src/infiniop-test/src/ops/averagepool.cpp b/src/infiniop-test/src/ops/averagepool.cpp new file mode 100644 index 000000000..4f6a80201 --- /dev/null +++ b/src/infiniop-test/src/ops/averagepool.cpp @@ -0,0 +1,265 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::averagepool { + +struct Test::Attributes { + // 输入与期望输出 + std::shared_ptr input; + std::shared_ptr expected_output; + + // 平均池化参数 + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->expected_output = tensors["output"]; + + // (N, C, spatial...) 
→ 池化维度数 = rank - 2 + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error( + "Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // ---- 解析并广播 kernel_size ---- + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; ++i) { + test->_attributes->kernel_size.push_back( + static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign( + pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // ---- 解析并广播 stride ---- + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + + if (stride_count == pool_ndim) { + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; ++i) { + test->_attributes->stride.push_back( + static_cast(stride_ptr[i])); + } + } else { + test->_attributes->stride.assign( + pool_ndim, static_cast(stride_ptr[0])); + } + + // ---- 解析并广播 padding ---- + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; ++i) { + test->_attributes->padding.push_back( + static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign( + pool_ndim, static_cast(padding_ptr[0])); + } + + // ---- 解析 ceil_mode ---- + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopAvgPoolDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; + + auto input_dtype = input->ggml_type(); + auto output_shape = expected_output->shape(); + + size_t output_size_bytes = 1; + for (auto d : output_shape) { + output_size_bytes *= d; + } + output_size_bytes *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size_bytes, device, device_id); + + std::vector output_strides(output_shape.size()); + if (!output_shape.empty()) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; --i) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // 参数指针(按底层接口需要传 void*) + void *kernel_size_ptr = 
_attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // ---- 创建算子描述符 ---- + CHECK_OR(infiniopCreateAvgPoolDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc(), + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to create avgpool descriptor.")); + + // ---- 获取工作空间大小 ---- + size_t workspace_size = 0; + CHECK_OR(infiniopGetAvgPoolWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to get workspace size.")); + + // ---- 分配工作空间(如需要)---- + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to allocate workspace.")); + } + + // ---- 执行平均池化 ---- + CHECK_OR(infiniopAvgPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, + "Failed during avgpool execution.")); + + // ---- 精度校验 ---- + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // ---- 性能测试 ---- + double elapsed_time = benchmark( + [=]() { + infiniopAvgPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // ---- 清理资源 ---- + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]\n- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]\n- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]\n- ceil_mode: " + << (_attributes->ceil_mode ? 
"true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::averagepool diff --git a/src/infiniop-test/src/ops/averagepool_backward.cpp b/src/infiniop-test/src/ops/averagepool_backward.cpp new file mode 100644 index 000000000..52949fdc1 --- /dev/null +++ b/src/infiniop-test/src/ops/averagepool_backward.cpp @@ -0,0 +1,254 @@ +// averagepool_backward.cpp +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::averagepool_backward { + +struct Test::Attributes { + // 张量 + std::shared_ptr input; // 前向输入 X + std::shared_ptr grad_output; // 上游梯度 dY + std::shared_ptr expected_grad_input; // 期望梯度 dX + + // 平均池化参数 + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +static void broadcast_or_fill(std::vector &dst, + const int *src, size_t src_cnt, + size_t ndim) { + dst.clear(); + if (src_cnt == ndim) { + for (size_t i = 0; i < ndim; ++i) { + dst.push_back(static_cast(src[i])); + } + } else { + // 将单个值广播到所有池化维度 + dst.assign(ndim, static_cast(src[0])); + } +} + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test: missing attributes or tensors"); + } + + test->_attributes->input = tensors["input"]; + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->expected_grad_input = tensors["grad_input"]; + + // 维度:去掉 N、C 后的空间维度数 + const auto &in_shape = test->_attributes->input->shape(); + if (in_shape.size() < 3) { + throw std::runtime_error("Input tensor rank must be >= 3 (N, C, ...)"); + } + size_t pool_ndim = in_shape.size() - 2; + + // --- kernel_size --- + { + const auto &buf = attributes["kernel_size"]; + if (buf.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t cnt = buf.size() / sizeof(int); + const int *p = reinterpret_cast(buf.data()); + broadcast_or_fill(test->_attributes->kernel_size, p, cnt, pool_ndim); + } + + // --- stride --- + { + const auto &buf = attributes["stride"]; + if (buf.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t cnt = buf.size() / sizeof(int); + const int *p = reinterpret_cast(buf.data()); + broadcast_or_fill(test->_attributes->stride, p, cnt, pool_ndim); + } + + // --- padding --- + { + const auto &buf = attributes["padding"]; + if (buf.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t cnt = buf.size() / sizeof(int); + const int *p = reinterpret_cast(buf.data()); + broadcast_or_fill(test->_attributes->padding, p, cnt, pool_ndim); + } + + // --- ceil_mode --- + { + const auto &buf = attributes["ceil_mode"]; + if (buf.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(buf.data()); + } else if (buf.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = (*reinterpret_cast(buf.data()) != 0); + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t 
warm_ups, size_t iterations) { + + // 把张量放到目标设备 + auto input = _attributes->input->to(device, device_id); // X + auto grad_output = _attributes->grad_output->to(device, device_id); // dY + auto expected_grad_input = _attributes->expected_grad_input; // 参考 dX + + // 构造实际输出 dX 的张量(形状等于 input,dtype 等于 input) + const auto &in_shape = input->shape(); + std::vector in_strides(in_shape.size()); + if (!in_shape.empty()) { + in_strides.back() = 1; + for (int i = static_cast(in_shape.size()) - 2; i >= 0; --i) { + in_strides[i] = in_strides[i + 1] * in_shape[i + 1]; + } + } + size_t dx_bytes = ggmlTypeSize(input->ggml_type()); + for (auto d : in_shape) { + dx_bytes *= d; + } + + auto dx_mem = std::make_shared(dx_bytes, device, device_id); + auto actual_grad_input = std::make_shared( + dx_mem, 0, in_shape, in_strides, input->ggml_type()); + + // 参数指针 + void *kernel_size_ptr = _attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // --- 创建反向算子描述符 --- + infiniopAvgPoolBackwardDescriptor_t bwd_desc; + CHECK_OR(infiniopCreateAvgPoolBackwardDescriptor( + handle, &bwd_desc, + actual_grad_input->desc(), // grad_input_desc (dX) + grad_output->desc(), // grad_output_desc (dY) + input->desc(), // input_desc (X) + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to create averagepool backward descriptor.")); + + // --- 获取工作空间大小 --- + size_t workspace_size = 0; + CHECK_OR(infiniopGetAvgPoolBackwardWorkspaceSize(bwd_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to get backward workspace size.")); + + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, + "Failed to allocate backward workspace.")); + } + + // --- 执行反向:dX = AvgPoolBackward(dY, X, ...) 
--- + CHECK_OR(infiniopAvgPoolBackward( + bwd_desc, workspace, workspace_size, + actual_grad_input->data(), // dX + grad_output->data(), // dY + input->data(), // X + /*stream*/ nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, + "Failed during averagepool backward execution.")); + + // --- 校验数值 --- + try { + allClose(actual_grad_input, expected_grad_input, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolBackwardDescriptor(bwd_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // --- 基准测试 --- + double elapsed_time = benchmark( + [=]() { + infiniopAvgPoolBackward( + bwd_desc, workspace, workspace_size, + actual_grad_input->data(), + grad_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // --- 清理 --- + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyAvgPoolBackwardDescriptor(bwd_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + // 需要的输入张量 + return {"input", "grad_output", "grad_input"}; +} + +std::vector Test::output_names() { + // 无额外导出 + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << "\n"; + oss << "- input: " << _attributes->input->info() << "\n"; + oss << "- grad_output (dY): " << _attributes->grad_output->info() << "\n"; + oss << "- expected_grad_input: " << _attributes->expected_grad_input->info() << "\n"; + + auto dump = [&](const char *name, const std::vector &v) { + oss << "- " << name << ": ["; + for (size_t i = 0; i < v.size(); ++i) { + if (i) { + oss << ", "; + } + oss << v[i]; + } + oss << "]\n"; + }; + dump("kernel_size", _attributes->kernel_size); + dump("stride", _attributes->stride); + dump("padding", _attributes->padding); + + oss << "- ceil_mode: " << (_attributes->ceil_mode ? 
"true" : "false") << "\n"; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << "\n"; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::averagepool_backward diff --git a/src/infiniop-test/src/ops/conv_backward.cpp b/src/infiniop-test/src/ops/conv_backward.cpp new file mode 100644 index 000000000..ed7814bff --- /dev/null +++ b/src/infiniop-test/src/ops/conv_backward.cpp @@ -0,0 +1,335 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include +#include + +namespace infiniop_test::conv_backward { + +struct Test::Attributes { + std::shared_ptr grad_output; + std::shared_ptr input; + std::shared_ptr weight; + std::shared_ptr bias; + + std::shared_ptr expected_grad_input; + std::shared_ptr expected_grad_weight; + std::shared_ptr expected_grad_bias; + + std::vector stride; + std::vector padding; + std::vector dilation; + int groups; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->weight = tensors["weight"]; + + auto bias_it = tensors.find("bias"); + if (bias_it != tensors.end()) { + test->_attributes->bias = bias_it->second; + } + + test->_attributes->expected_grad_input = tensors["grad_input"]; + test->_attributes->expected_grad_weight = tensors["grad_weight"]; + + auto grad_bias_it = tensors.find("grad_bias"); + if (grad_bias_it != tensors.end()) { + test->_attributes->expected_grad_bias = grad_bias_it->second; + } + + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error("Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + auto stride_data = attributes["stride"]; + auto padding_data = attributes["padding"]; + auto dilation_data = attributes["dilation"]; + + size_t stride_count = stride_data.size() / sizeof(int); + size_t padding_count = padding_data.size() / sizeof(int); + size_t dilation_count = dilation_data.size() / sizeof(int); + + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + const int *stride_ptr = reinterpret_cast(stride_data.data()); + if (stride_count == pool_ndim) { + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; i++) { + test->_attributes->stride.push_back(static_cast(stride_ptr[i])); + } + } else { + test->_attributes->stride.assign(pool_ndim, static_cast(stride_ptr[0])); + } + + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + const int *padding_ptr = reinterpret_cast(padding_data.data()); + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; i++) { + test->_attributes->padding.push_back(static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign(pool_ndim, static_cast(padding_ptr[0])); + } + + if (dilation_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid dilation data size"); + } + const int *dilation_ptr = reinterpret_cast(dilation_data.data()); + if (dilation_count == pool_ndim) { + 
test->_attributes->dilation.clear(); + for (size_t i = 0; i < dilation_count; i++) { + test->_attributes->dilation.push_back(static_cast(dilation_ptr[i])); + } + } else { + test->_attributes->dilation.assign(pool_ndim, static_cast(dilation_ptr[0])); + } + + test->_attributes->groups = *reinterpret_cast(attributes["groups"].data()); + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + infiniopConvBackwardDescriptor_t op_desc; + + auto grad_output = _attributes->grad_output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto weight = _attributes->weight->to(device, device_id); + auto bias = _attributes->bias ? _attributes->bias->to(device, device_id) : nullptr; + + auto expected_grad_input = _attributes->expected_grad_input; + auto expected_grad_weight = _attributes->expected_grad_weight; + auto expected_grad_bias = _attributes->expected_grad_bias; + + auto input_dtype = input->ggml_type(); + + auto grad_input_shape = expected_grad_input->shape(); + size_t grad_input_size = 1; + for (auto dim : grad_input_shape) { + grad_input_size *= dim; + } + grad_input_size *= ggmlTypeSize(input_dtype); + + auto grad_input_memory = std::make_shared(grad_input_size, device, device_id); + std::vector grad_input_strides(grad_input_shape.size()); + + if (grad_input_shape.size() > 0) { + grad_input_strides[grad_input_shape.size() - 1] = 1; + for (int i = static_cast(grad_input_shape.size()) - 2; i >= 0; i--) { + grad_input_strides[i] = grad_input_strides[i + 1] * grad_input_shape[i + 1]; + } + } + + auto actual_grad_input = std::make_shared( + grad_input_memory, 0, grad_input_shape, grad_input_strides, input_dtype); + + auto grad_weight_shape = expected_grad_weight->shape(); + size_t grad_weight_size = 1; + for (auto dim : grad_weight_shape) { + grad_weight_size *= dim; + } + grad_weight_size *= ggmlTypeSize(input_dtype); + + auto grad_weight_memory = std::make_shared(grad_weight_size, device, device_id); + std::vector grad_weight_strides(grad_weight_shape.size()); + + if (grad_weight_shape.size() > 0) { + grad_weight_strides[grad_weight_shape.size() - 1] = 1; + for (int i = static_cast(grad_weight_shape.size()) - 2; i >= 0; i--) { + grad_weight_strides[i] = grad_weight_strides[i + 1] * grad_weight_shape[i + 1]; + } + } + + auto actual_grad_weight = std::make_shared( + grad_weight_memory, 0, grad_weight_shape, grad_weight_strides, input_dtype); + + std::shared_ptr actual_grad_bias = nullptr; + if (bias && expected_grad_bias) { + auto grad_bias_shape = expected_grad_bias->shape(); + size_t grad_bias_size = 1; + for (auto dim : grad_bias_shape) { + grad_bias_size *= dim; + } + grad_bias_size *= ggmlTypeSize(input_dtype); + + auto grad_bias_memory = std::make_shared(grad_bias_size, device, device_id); + std::vector grad_bias_strides(grad_bias_shape.size()); + + if (grad_bias_shape.size() > 0) { + grad_bias_strides[grad_bias_shape.size() - 1] = 1; + for (int i = static_cast(grad_bias_shape.size()) - 2; i >= 0; i--) { + grad_bias_strides[i] = grad_bias_strides[i + 1] * grad_bias_shape[i + 1]; + } + } + + actual_grad_bias = std::make_shared( + grad_bias_memory, 0, grad_bias_shape, grad_bias_strides, input_dtype); + } + + void *pads_ptr = _attributes->padding.data(); + void *strides_ptr = _attributes->stride.data(); + void *dilations_ptr = _attributes->dilation.data(); + + CHECK_OR(infiniopCreateConvBackwardDescriptor( + handle, &op_desc, + grad_output->desc(), + input->desc(), 
+ weight->desc(), + bias ? bias->desc() : nullptr, + pads_ptr, + strides_ptr, + dilations_ptr, + _attributes->groups), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create conv_backward descriptor.")); + + // 获取工作空间大小 + size_t workspace_size; + CHECK_OR(infiniopGetConvBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // 分配工作空间 + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + CHECK_OR(infiniopConvBackward( + op_desc, workspace, workspace_size, + actual_grad_input->data(), // void *grad_input + actual_grad_weight->data(), // void *grad_weight + actual_grad_bias ? actual_grad_bias->data() : nullptr, // void *grad_bias + grad_output->data(), // const void *grad_output + input->data(), // const void *input + weight->data(), // const void *weight + nullptr), // void *stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during conv_backward execution.")); + + // 验证结果 + try { + allClose(actual_grad_input, expected_grad_input, _rtol, _atol); + allClose(actual_grad_weight, expected_grad_weight, _rtol, _atol); + + if (actual_grad_bias && expected_grad_bias) { + allClose(actual_grad_bias, expected_grad_bias, _rtol, _atol); + } + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyConvBackwardDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopConvBackward( + op_desc, workspace, workspace_size, + actual_grad_input->data(), + actual_grad_weight->data(), + actual_grad_bias ? actual_grad_bias->data() : nullptr, + grad_output->data(), + input->data(), + weight->data(), + nullptr); + }, + warm_ups, iterations); + + // 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyConvBackwardDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"stride", "padding", "dilation", "groups"}; +} + +std::vector Test::tensor_names() { + return {"grad_output", "input", "weight", "bias", "grad_input", "grad_weight", "grad_bias"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- weight: " << _attributes->weight->info() << std::endl; + if (_attributes->bias) { + oss << "- bias: " << _attributes->bias->info() << std::endl; + } + oss << "- expected_grad_input: " << _attributes->expected_grad_input->info() << std::endl; + oss << "- expected_grad_weight: " << _attributes->expected_grad_weight->info() << std::endl; + if (_attributes->expected_grad_bias) { + oss << "- expected_grad_bias: " << _attributes->expected_grad_bias->info() << std::endl; + } + + oss << "- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]" << std::endl; + + oss << "- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]" << std::endl; + + oss << "- dilation: ["; + for (size_t i = 0; i < _attributes->dilation.size(); ++i) { + if (i > 0) { + oss << ", "; 
+ } + oss << _attributes->dilation[i]; + } + oss << "]" << std::endl; + + oss << "- groups: " << _attributes->groups << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::conv_backward diff --git a/src/infiniop-test/src/ops/cross_entropy_loss.cpp b/src/infiniop-test/src/ops/cross_entropy_loss.cpp new file mode 100644 index 000000000..7fac231e0 --- /dev/null +++ b/src/infiniop-test/src/ops/cross_entropy_loss.cpp @@ -0,0 +1,156 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cross_entropy_loss { + +struct Test::Attributes { + // 输入张量 + std::shared_ptr logits; + std::shared_ptr target; + std::shared_ptr loss; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + // 检查必需的张量是否存在 + if (!check_names(tensors, Test::tensor_names()) || !check_names(attributes, Test::attribute_names())) { + throw std::runtime_error("Invalid Test: Missing required tensors."); + } + + test->_attributes->logits = tensors["logits"]; + test->_attributes->target = tensors["target"]; + test->_attributes->loss = tensors["loss"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopCrossEntropyLossDescriptor_t op_desc; + + // 将输入张量移动到目标设备 + auto logits = _attributes->logits->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto loss = _attributes->loss; + + // 根据期望输出的形状创建实际输出张量 + auto output_shape = loss->shape(); + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(logits->ggml_type()); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(static_cast(output_shape.size())); + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, logits->ggml_type()); + + // 1. 创建算子描述符 + CHECK_OR(infiniopCreateCrossEntropyLossDescriptor( + handle, &op_desc, + actual_output->desc(), + logits->desc(), + target->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cross entropy loss descriptor.")); + + // 2. 获取并分配工作空间 + size_t workspace_size; + CHECK_OR(infiniopGetCrossEntropyLossWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 3. 执行计算 + CHECK_OR(infiniopCrossEntropyLoss( + op_desc, workspace, workspace_size, + actual_output->data(), + logits->data(), + target->data(), + nullptr), // stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during cross entropy loss execution.")); + + // 4. 
验证结果 + try { + allClose(actual_output, loss, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyCrossEntropyLossDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 5. 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopCrossEntropyLoss( + op_desc, workspace, workspace_size, + actual_output->data(), + logits->data(), + target->data(), + nullptr); // stream + }, + warm_ups, iterations); + + // 6. 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyCrossEntropyLossDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +// 定义算子需要的属性名列表 +std::vector Test::attribute_names() { + return {}; // CrossEntropyLoss 没有额外的属性 +} + +// 定义算子需要的张量名列表 +std::vector Test::tensor_names() { + return {"logits", "target", "loss"}; +} + +std::vector Test::output_names() { + return {}; +} + +// 打印测试信息的辅助函数 +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- logits: " << _attributes->logits->info() << std::endl; + oss << "- target: " << _attributes->target->info() << std::endl; + oss << "- loss: " << _attributes->loss->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::cross_entropy_loss diff --git a/src/infiniop-test/src/ops/interpolate_nearest.cpp b/src/infiniop-test/src/ops/interpolate_nearest.cpp new file mode 100644 index 000000000..071527249 --- /dev/null +++ b/src/infiniop-test/src/ops/interpolate_nearest.cpp @@ -0,0 +1,151 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::interpolate_nearest { + +struct Test::Attributes { + std::shared_ptr input; + std::shared_ptr expected_output; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + std::cout << "DEBUG: Name check failed" << std::endl; + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->input = tensors["input"]; // F32 输入数据 + test->_attributes->expected_output = tensors["output"]; // F64 期望结果 + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopInterpolateNearestDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; // F64 期望结果 + + // 动态创建实际的输出张量,使用期望结果的形状,但使用输入的数据类型 + auto output_shape = expected_output->shape(); + auto input_dtype = input->ggml_type(); + + // 创建输出张量的内存 + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(output_shape.size()); + + // 计算连续的步长 + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, 
input_dtype); + + // Create operator descriptor + CHECK_OR(infiniopCreateInterpolateNearestDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + // Get workspace size + size_t workspace_size; + CHECK_OR(infiniopGetInterpolateNearestWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // Allocate workspace if needed + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // Execute interpolate nearest + CHECK_OR(infiniopInterpolateNearest( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + // Verify result - 比较实际输出和期望结果 + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyInterpolateNearestDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopInterpolateNearest( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // Cleanup + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyInterpolateNearestDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::interpolate_nearest diff --git a/src/infiniop-test/src/ops/maxpool.cpp b/src/infiniop-test/src/ops/maxpool.cpp new file mode 100644 index 000000000..698c5ad89 --- /dev/null +++ b/src/infiniop-test/src/ops/maxpool.cpp @@ -0,0 +1,263 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::maxpool { + +struct Test::Attributes { + // 输入张量 + std::shared_ptr input; + std::shared_ptr expected_output; + + // 最大池化参数 + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + auto input_tensor = tensors["input"]; + test->_attributes->input = tensors["input"]; + test->_attributes->expected_output = tensors["output"]; + + // 获取池化维度(输入张量维度 - 2,去掉batch和channel维度) + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error("Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // 解析并广播 
kernel_size - 修复类型转换 + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; i++) { + test->_attributes->kernel_size.push_back(static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign(pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // 解析并广播 stride + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + + if (stride_count == pool_ndim) { + // 直接使用提供的值 + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; i++) { + test->_attributes->stride.push_back(static_cast(stride_ptr[i])); + } + } else { + // 广播单个值到所有维度 + test->_attributes->stride.assign(pool_ndim, static_cast(stride_ptr[0])); + } + + // 解析并广播 padding + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; i++) { + test->_attributes->padding.push_back(static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign(pool_ndim, static_cast(padding_ptr[0])); + } + + // 解析 ceil_mode + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopMaxPoolDescriptor_t op_desc; + + auto input = _attributes->input->to(device, device_id); + auto expected_output = _attributes->expected_output; + + auto input_dtype = input->ggml_type(); + + auto output_shape = expected_output->shape(); + + size_t output_size = 1; + for (auto dim : output_shape) { + output_size *= dim; + } + output_size *= ggmlTypeSize(input_dtype); + + auto output_memory = std::make_shared(output_size, device, device_id); + std::vector output_strides(output_shape.size()); + + if (output_shape.size() > 0) { + output_strides[output_shape.size() - 1] = 1; + for (int i = static_cast(output_shape.size()) - 2; i >= 0; i--) { + output_strides[i] = output_strides[i + 1] * output_shape[i + 1]; + } + } + + auto actual_output = std::make_shared( + output_memory, 0, output_shape, output_strides, input_dtype); + + // 准备参数指针 + void *kernel_size_ptr = _attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + // 创建算子描述符 + CHECK_OR(infiniopCreateMaxPoolDescriptor( + handle, &op_desc, + actual_output->desc(), + input->desc(), + kernel_size_ptr, + 
stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create maxpool descriptor.")); + + // 获取工作空间大小 + size_t workspace_size; + CHECK_OR(infiniopGetMaxPoolWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // 分配工作空间 + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 执行最大池化 + CHECK_OR(infiniopMaxPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during maxpool execution.")); + + // 验证结果 + try { + allClose(actual_output, expected_output, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopMaxPool( + op_desc, workspace, workspace_size, + actual_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + +std::vector Test::tensor_names() { + return {"input", "output"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_output: " << _attributes->expected_output->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]" << std::endl; + + oss << "- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]" << std::endl; + + oss << "- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]" << std::endl; + + oss << "- ceil_mode: " << (_attributes->ceil_mode ? 
"true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::maxpool diff --git a/src/infiniop-test/src/ops/maxpool_backward.cpp b/src/infiniop-test/src/ops/maxpool_backward.cpp new file mode 100644 index 000000000..2687fcf37 --- /dev/null +++ b/src/infiniop-test/src/ops/maxpool_backward.cpp @@ -0,0 +1,266 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::maxpool_backward { + +struct Test::Attributes { + std::shared_ptr grad_output; + std::shared_ptr input; + std::shared_ptr expected_grad_input; + + std::vector kernel_size; + std::vector stride; + std::vector padding; + bool ceil_mode; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + if (!check_names(attributes, Test::attribute_names()) || !check_names(tensors, Test::tensor_names())) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->grad_output = tensors["grad_output"]; + test->_attributes->input = tensors["input"]; + test->_attributes->expected_grad_input = tensors["grad_input"]; + + // 获取池化维度 + size_t pool_ndim = test->_attributes->input->shape().size() - 2; + if (pool_ndim == 0) { + throw std::runtime_error("Input tensor must have at least 3 dimensions (N, C, ...)"); + } + + // 解析并广播 kernel_size + auto kernel_size_data = attributes["kernel_size"]; + if (kernel_size_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid kernel_size data size"); + } + size_t kernel_size_count = kernel_size_data.size() / sizeof(int); + const int *kernel_size_ptr = reinterpret_cast(kernel_size_data.data()); + if (kernel_size_count == pool_ndim) { + test->_attributes->kernel_size.clear(); + for (size_t i = 0; i < kernel_size_count; i++) { + test->_attributes->kernel_size.push_back(static_cast(kernel_size_ptr[i])); + } + } else { + test->_attributes->kernel_size.assign(pool_ndim, static_cast(kernel_size_ptr[0])); + } + + // 解析并广播 stride + auto stride_data = attributes["stride"]; + if (stride_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid stride data size"); + } + size_t stride_count = stride_data.size() / sizeof(int); + const int *stride_ptr = reinterpret_cast(stride_data.data()); + if (stride_count == pool_ndim) { + test->_attributes->stride.clear(); + for (size_t i = 0; i < stride_count; i++) { + test->_attributes->stride.push_back(static_cast(stride_ptr[i])); + } + } else { + test->_attributes->stride.assign(pool_ndim, static_cast(stride_ptr[0])); + } + + // 解析并广播 padding + auto padding_data = attributes["padding"]; + if (padding_data.size() % sizeof(int) != 0) { + throw std::runtime_error("Invalid padding data size"); + } + size_t padding_count = padding_data.size() / sizeof(int); + const int *padding_ptr = reinterpret_cast(padding_data.data()); + if (padding_count == pool_ndim) { + test->_attributes->padding.clear(); + for (size_t i = 0; i < padding_count; i++) { + test->_attributes->padding.push_back(static_cast(padding_ptr[i])); + } + } else { + test->_attributes->padding.assign(pool_ndim, static_cast(padding_ptr[0])); + } + + // 解析 ceil_mode + auto ceil_mode_data = attributes["ceil_mode"]; + if (ceil_mode_data.size() == sizeof(bool)) { + test->_attributes->ceil_mode = 
*reinterpret_cast(ceil_mode_data.data()); + } else if (ceil_mode_data.size() == sizeof(uint8_t)) { + test->_attributes->ceil_mode = *reinterpret_cast(ceil_mode_data.data()) != 0; + } else { + throw std::runtime_error("Invalid ceil_mode data size"); + } + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, + size_t warm_ups, size_t iterations) { + + infiniopMaxPoolBackwardDescriptor_t op_desc; + + // 将输入张量移动到指定设备 + auto grad_output = _attributes->grad_output->to(device, device_id); + auto input = _attributes->input->to(device, device_id); + auto expected_grad_input = _attributes->expected_grad_input; + + // 获取输入数据类型 + auto input_dtype = input->ggml_type(); + + // 手动创建 grad_input 张量(使用期望结果的形状,但使用输入的数据类型) + auto grad_input_shape = expected_grad_input->shape(); + + size_t grad_input_size = 1; + for (auto dim : grad_input_shape) { + grad_input_size *= dim; + } + grad_input_size *= ggmlTypeSize(input_dtype); + + auto grad_input_memory = std::make_shared(grad_input_size, device, device_id); + std::vector grad_input_strides(grad_input_shape.size()); + + if (grad_input_shape.size() > 0) { + grad_input_strides[grad_input_shape.size() - 1] = 1; + for (int i = static_cast(grad_input_shape.size()) - 2; i >= 0; i--) { + grad_input_strides[i] = grad_input_strides[i + 1] * grad_input_shape[i + 1]; + } + } + + auto actual_grad_input = std::make_shared( + grad_input_memory, 0, grad_input_shape, grad_input_strides, input_dtype); + + // 准备参数指针 + void *kernel_size_ptr = _attributes->kernel_size.data(); + void *stride_ptr = _attributes->stride.data(); + void *padding_ptr = _attributes->padding.data(); + + auto grad_output_shape = grad_output->shape(); + CHECK_OR(infiniopCreateMaxPoolBackwardDescriptor( + handle, &op_desc, + actual_grad_input->desc(), + grad_output->desc(), + input->desc(), + kernel_size_ptr, + stride_ptr, + padding_ptr, + _attributes->ceil_mode), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create maxpool_backward descriptor.")); + + // 获取工作空间大小 + size_t workspace_size; + CHECK_OR(infiniopGetMaxPoolBackwardWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + // 分配工作空间 + void *workspace = nullptr; + if (workspace_size > 0) { + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + } + + // 执行最大池化反向传播 + CHECK_OR(infiniopMaxPoolBackward( + op_desc, workspace, workspace_size, + actual_grad_input->data(), // void *grad_input + grad_output->data(), // const void *grad_output + input->data(), // const void *input + nullptr), // void *stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during maxpool_backward execution.")); + + // 验证结果 + try { + allClose(actual_grad_input, expected_grad_input, _rtol, _atol); + } catch (const std::exception &e) { + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolBackwardDescriptor(op_desc); + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 性能测试 + double elapsed_time = benchmark( + [=]() { + infiniopMaxPoolBackward( + op_desc, workspace, workspace_size, + actual_grad_input->data(), + grad_output->data(), + input->data(), + nullptr); + }, + warm_ups, iterations); + + // 清理资源 + if (workspace) { + infinirtFree(workspace); + } + infiniopDestroyMaxPoolBackwardDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"kernel_size", "stride", "padding", "ceil_mode"}; +} + 
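(Reviewer note, not part of the patch: for readers of this test, the descriptor lifecycle exercised by Test::run() above maps onto the public C API roughly as sketched below. The sketch assumes a valid infiniopHandle_t named handle, existing tensor descriptors grad_input_desc/grad_output_desc/input_desc, device buffers grad_input/grad_output/input, and size_t attribute arrays; the integer width of those arrays and the omitted error handling are assumptions, not taken from the patch.)

    infiniopMaxPoolBackwardDescriptor_t desc = nullptr;
    size_t kernel_size[2] = {2, 2}, stride[2] = {2, 2}, padding[2] = {0, 0}; // hypothetical 2D attributes
    infiniopCreateMaxPoolBackwardDescriptor(handle, &desc,
                                            grad_input_desc, grad_output_desc, input_desc,
                                            kernel_size, stride, padding, /*ceil_mode=*/false);
    size_t workspace_size = 0;
    infiniopGetMaxPoolBackwardWorkspaceSize(desc, &workspace_size);
    void *workspace = nullptr;
    if (workspace_size > 0) {
        infinirtMalloc(&workspace, workspace_size); // workspace lives on the target device
    }
    infiniopMaxPoolBackward(desc, workspace, workspace_size,
                            grad_input, grad_output, input, /*stream=*/nullptr);
    if (workspace) {
        infinirtFree(workspace);
    }
    infiniopDestroyMaxPoolBackwardDescriptor(desc);
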
+std::vector Test::tensor_names() { + return {"grad_output", "input", "grad_input"}; +} + +std::vector Test::output_names() { + return {}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- grad_output: " << _attributes->grad_output->info() << std::endl; + oss << "- input: " << _attributes->input->info() << std::endl; + oss << "- expected_grad_input: " << _attributes->expected_grad_input->info() << std::endl; + + oss << "- kernel_size: ["; + for (size_t i = 0; i < _attributes->kernel_size.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->kernel_size[i]; + } + oss << "]" << std::endl; + + oss << "- stride: ["; + for (size_t i = 0; i < _attributes->stride.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->stride[i]; + } + oss << "]" << std::endl; + + oss << "- padding: ["; + for (size_t i = 0; i < _attributes->padding.size(); ++i) { + if (i > 0) { + oss << ", "; + } + oss << _attributes->padding[i]; + } + oss << "]" << std::endl; + + oss << "- ceil_mode: " << (_attributes->ceil_mode ? "true" : "false") << std::endl; + + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::maxpool_backward diff --git a/src/infiniop/ops/averagepool/averagepool.h b/src/infiniop/ops/averagepool/averagepool.h new file mode 100644 index 000000000..7762826ab --- /dev/null +++ b/src/infiniop/ops/averagepool/averagepool.h @@ -0,0 +1,52 @@ +#ifndef __AVERAGEPOOL_H__ +#define __AVERAGEPOOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::averagepool::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + AvgPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + AvgPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __AVERAGEPOOL_H__ diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc new file mode 100644 index 000000000..2e8fa6851 --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.cc @@ -0,0 +1,362 @@ +#include "averagepool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::averagepool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + AvgPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const AvgPoolInfo &avgpool_info) + : handle(handle_ptr), info(avgpool_info) { + 
workspace_size = 0; + } + + template + void _avgpool_1d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + const size_t input_nc_stride = input_width; + const size_t output_nc_stride = output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int window_start = static_cast(ow * stride_width) - static_cast(pad_width); + const int window_end = window_start + static_cast(kernel_width); + + for (int iw = window_start; iw < window_end; ++iw) { + if (iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + iw]); + valid_count++; + } else if (iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + ow] = utils::cast(result); + } + } + } + } + + template + void _avgpool_2d(Ydata *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + const size_t input_nc_stride = input_height * input_width; + const size_t output_nc_stride = output_height * output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + sum += utils::cast(input[input_offset + ih * input_width + iw]); + valid_count++; + } else if (ih >= -static_cast(pad_height) && + ih < static_cast(input_height + pad_height) && + iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + output[output_offset + oh * output_width + ow] = utils::cast(result); + } + } + } + } + } + + template + void _avgpool_3d(Ydata *output, const T 
*input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + const size_t input_nc_stride = input_depth * input_height * input_width; + const size_t output_nc_stride = output_depth * output_height * output_width; + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t input_offset = (b * channels + c) * input_nc_stride; + const size_t output_offset = (b * channels + c) * output_nc_stride; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float sum = 0.0f; + int valid_count = 0; + + const int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + const int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + + for (int kd = 0; kd < static_cast(kernel_depth); ++kd) { + const int id = start_d + kd; + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + const int ih = start_h + kh; + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int iw = start_w + kw; + + if (id >= 0 && id < static_cast(input_depth) && + ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + const size_t idx = id * (input_height * input_width) + + ih * input_width + iw; + sum += utils::cast(input[input_offset + idx]); + valid_count++; + } else if (id >= -static_cast(pad_depth) && + id < static_cast(input_depth + pad_depth) && + ih >= -static_cast(pad_height) && + ih < static_cast(input_height + pad_height) && + iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + } + } + + float result = 0.0f; + if (valid_count > 0) { + result = sum / static_cast(valid_count); + } + + const size_t out_idx = od * (output_height * output_width) + + oh * output_width + ow; + output[output_offset + out_idx] = utils::cast(result); + } + } + } + } + } + } + + template + void _avgpool_cpu(Ydata *output, const T *input) const { + switch (info.ndim) { + case 1: + _avgpool_1d(output, input); + break; + case 2: + _avgpool_2d(output, input); + break; + case 3: + _avgpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + AvgPoolInfo &info) { + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + if (!output || !input) 
{ + return INFINI_STATUS_BAD_PARAM; + } + + size_t output_size = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) { + output_size *= info.output_dims[i]; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + _avgpool_cpu(typed_output, typed_input); + break; + } + case INFINI_DTYPE_F16: { + float *typed_output_f32 = static_cast(workspace); + const fp16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + fp16_t *typed_output = static_cast(output); + #pragma omp parallel for + for(size_t i = 0; i < output_size; ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + case INFINI_DTYPE_BF16: { + float *typed_output_f32 = static_cast(workspace); + const bf16_t *typed_input = static_cast(input); + + _avgpool_cpu(typed_output_f32, typed_input); + + bf16_t *typed_output = static_cast(output); + #pragma omp parallel for + for(size_t i = 0; i < output_size; ++i) { + typed_output[i] = utils::cast(typed_output_f32[i]); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +inline size_t calculateOutputSize(const AvgPoolInfo &info) { + size_t size = info.batch * info.channels; + for(size_t i = 0; i < info.ndim; ++i) { + size *= info.output_dims[i]; + } + return size; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + size_t workspace_size = 0; + if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) { + workspace_size = calculateOutputSize(info) * sizeof(float); + } + + *desc_ptr = new Descriptor(dtype, std::move(info), workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::averagepool::cpu diff --git a/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h new file mode 100644 index 000000000..8388f80ff --- /dev/null +++ b/src/infiniop/ops/averagepool/cpu/averagepool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CPU_H__ +#define __AVERAGEPOOL_CPU_H__ + +#include "../averagepool.h" + +DESCRIPTOR(cpu) + +#endif // __AVERAGEPOOL_CPU_H__ diff --git a/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh new file mode 100644 index 000000000..7c9d0f438 --- /dev/null +++ b/src/infiniop/ops/averagepool/cuda/averagepool_kernel.cuh @@ -0,0 +1,185 @@ +#ifndef 
__AVERAGEPOOL_KERNEL_H__ +#define __AVERAGEPOOL_KERNEL_H__ + +#include + +// 1D average pooling kernel, compatible with PyTorch's implicit-padding semantics +template <typename T> +__global__ void avgpool1d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_length, + int output_length, int kernel_size, int stride, int padding) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= output_length) { + return; + } + + // Compute input and output offsets + const T *input_ptr = input + batch_idx * channels * input_length + channel_idx * input_length; + T *output_ptr = output + batch_idx * channels * output_length + channel_idx * output_length; + + // Start position of the pooling window + int window_start = output_idx * stride - padding; + + // Accumulate in single precision + float sum = 0.0f; + int valid_count = 0; + + // Walk the pooling window + for (int k = 0; k < kernel_size; ++k) { + int input_pos = window_start + k; + + if (input_pos >= 0 && input_pos < input_length) { + // Valid input position: convert to float and accumulate + sum += static_cast<float>(input_ptr[input_pos]); + valid_count++; + } else if (input_pos >= -padding && input_pos < input_length + padding) { + // Explicit padding region: value is 0, only count it toward the divisor + valid_count++; + } + // Positions beyond this are implicit padding and are excluded from the divisor + } + + // Compute the mean and convert back to the original data type + if (valid_count > 0) { + float result = sum / static_cast<float>(valid_count); + output_ptr[output_idx] = static_cast<T>(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 2D average pooling kernel, compatible with PyTorch's implicit-padding semantics +template <typename T> +__global__ void avgpool2d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int channels, int input_height, + int input_width, int output_height, int output_width, int kernel_h, + int kernel_w, int stride_h, int stride_w, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // Convert the linear index to 2D coordinates + int out_h = output_idx / output_width; + int out_w = output_idx % output_width; + + // Compute input and output offsets + const T *input_ptr = input + batch_idx * channels * input_height * input_width + channel_idx * input_height * input_width; + T *output_ptr = output + batch_idx * channels * output_height * output_width + channel_idx * output_height * output_width; + + // Start position of the pooling window + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // Accumulate in single precision + float sum = 0.0f; + int valid_count = 0; + + // Walk the pooling window + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // Valid input position: convert to float and accumulate + int input_idx = input_h * input_width + input_w; + sum += static_cast<float>(input_ptr[input_idx]); + valid_count++; + } else if (input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // Explicit padding region: value is 0, only count it toward the divisor + valid_count++; + } + // Positions beyond this are implicit padding and are excluded from the divisor + } + } + + // Compute the mean and convert back to the original data type + if (valid_count > 0) { + float result = sum / static_cast<float>(valid_count); + output_ptr[output_idx] = static_cast<T>(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +// 3D average pooling kernel, compatible with PyTorch's implicit-padding semantics +template <typename T> +__global__ void avgpool3d_pytorch_compatible_kernel( + const T *input, T *output, int batch_size, int
channels, int input_depth, + int input_height, int input_width, int output_depth, int output_height, + int output_width, int kernel_d, int kernel_h, int kernel_w, int stride_d, + int stride_h, int stride_w, int pad_d, int pad_h, int pad_w) { + + int batch_idx = blockIdx.x; + int channel_idx = blockIdx.y; + int output_idx = blockIdx.z * blockDim.x + threadIdx.x; + + int total_output_elements = output_depth * output_height * output_width; + if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) { + return; + } + + // 将线性索引转换为3D坐标 + int out_d = output_idx / (output_height * output_width); + int remaining = output_idx % (output_height * output_width); + int out_h = remaining / output_width; + int out_w = remaining % output_width; + + // 计算输入和输出的偏移 + int input_spatial_size = input_depth * input_height * input_width; + int output_spatial_size = output_depth * output_height * output_width; + + const T *input_ptr = input + batch_idx * channels * input_spatial_size + channel_idx * input_spatial_size; + T *output_ptr = output + batch_idx * channels * output_spatial_size + channel_idx * output_spatial_size; + + // 计算池化窗口的起始位置 + int window_start_d = out_d * stride_d - pad_d; + int window_start_h = out_h * stride_h - pad_h; + int window_start_w = out_w * stride_w - pad_w; + + // 使用单精度进行中间计算 + float sum = 0.0f; + int valid_count = 0; + + // 遍历池化窗口 + for (int kd = 0; kd < kernel_d; ++kd) { + for (int kh = 0; kh < kernel_h; ++kh) { + for (int kw = 0; kw < kernel_w; ++kw) { + int input_d = window_start_d + kd; + int input_h = window_start_h + kh; + int input_w = window_start_w + kw; + + if (input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) { + // 有效的输入位置,转换为单精度进行累加 + int input_idx = (input_d * input_height + input_h) * input_width + input_w; + sum += static_cast(input_ptr[input_idx]); + valid_count++; + } else if (input_d >= -pad_d && input_d < input_depth + pad_d && input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w) { + // 显式填充区域,值为0,只增加计数 + valid_count++; + } + // 其他位置是隐式填充,不计入分母 + } + } + } + + // 计算平均值并转换回原始数据类型 + if (valid_count > 0) { + float result = sum / static_cast(valid_count); + output_ptr[output_idx] = static_cast(result); + } else { + output_ptr[output_idx] = T(0); + } +} + +#endif // __AVERAGEPOOL_KERNEL_H__ diff --git a/src/infiniop/ops/averagepool/info.h b/src/infiniop/ops/averagepool/info.h new file mode 100644 index 000000000..871e827a7 --- /dev/null +++ b/src/infiniop/ops/averagepool/info.h @@ -0,0 +1,136 @@ +#ifndef __AVERAGEPOOL_INFO_H__ +#define __AVERAGEPOOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include +#include + +namespace op::averagepool { + +inline utils::Result calculatePoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + size_t padded_input_size = input_size + 2 * padding; + + if (padded_input_size < kernel_size) { + return utils::Result(INFINI_STATUS_BAD_TENSOR_SHAPE); + } + + size_t output_size; + if (ceil_mode) { + // 等效于整数的上取整 + output_size = (padded_input_size - kernel_size + stride - 1) / stride + 1; + } else { + // 等效于整数的下取整 + output_size = (padded_input_size - kernel_size) / stride + 1; + } + + return 
utils::Result(output_size); +} + +// 检查是否存在隐式填充 +inline bool hasImplicitPadding( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding, + bool ceil_mode) { + + if (!ceil_mode) { + return false; + } + return ((input_size + 2 * padding) - kernel_size) % stride != 0; +} + +class AvgPoolInfo { + AvgPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + bool has_implicit_padding = false; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + AvgPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // 空间维度 + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // 初始化隐式填充标志 + info.has_implicit_padding = false; + + // 获取并校验空间维度 + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + + auto output_size_result = calculatePoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size_result); + + size_t expected_size = output_size_result.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + + // 检查当前维度是否存在隐式填充 + if (hasImplicitPadding(info.input_dims[i], info.kernel_sizes[i], + info.strides[i], info.pads[i], info.ceil_mode)) { + info.has_implicit_padding = true; + } + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::averagepool + +#endif // __AVERAGEPOOL_INFO_H__ diff --git a/src/infiniop/ops/averagepool/metax/averagepool_metax.h b/src/infiniop/ops/averagepool/metax/averagepool_metax.h new file mode 100644 index 000000000..eef332b5f --- /dev/null +++ b/src/infiniop/ops/averagepool/metax/averagepool_metax.h @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_METAX_H__ +#define __AVERAGEPOOL_METAX_H__ + +#include "../averagepool.h" + +DESCRIPTOR(metax) + +#endif // __AVERAGEPOOL_METAX_CUH__ diff --git a/src/infiniop/ops/averagepool/metax/averagepool_metax.maca b/src/infiniop/ops/averagepool/metax/averagepool_metax.maca new file mode 100644 index 000000000..ee3c4bd9c --- /dev/null +++ b/src/infiniop/ops/averagepool/metax/averagepool_metax.maca @@ -0,0 +1,332 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "averagepool_metax.h" +#include "../cuda/averagepool_kernel.cuh" +#include + +infiniStatus_t launch_avgpool_pytorch_kernel( + const op::averagepool::AvgPoolInfo& info, + const void* input, void* output, + infiniDtype_t data_type, hcStream_t stream) { + + int batch_size = static_cast(info.batch); + int channels = 
static_cast(info.channels); + + if (info.ndim == 1) { + // 1D平均池化 + int input_length = static_cast(info.input_dims[0]); + int output_length = static_cast(info.output_dims[0]); + int kernel_size = static_cast(info.kernel_sizes[0]); + int stride = static_cast(info.strides[0]); + int padding = static_cast(info.pads[0]); + + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (output_length + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool1d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_F16: + avgpool1d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_BF16: + avgpool1d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 2) { + // 2D平均池化 + int input_height = static_cast(info.input_dims[0]); + int input_width = static_cast(info.input_dims[1]); + int output_height = static_cast(info.output_dims[0]); + int output_width = static_cast(info.output_dims[1]); + int kernel_h = static_cast(info.kernel_sizes[0]); + int kernel_w = static_cast(info.kernel_sizes[1]); + int stride_h = static_cast(info.strides[0]); + int stride_w = static_cast(info.strides[1]); + int pad_h = static_cast(info.pads[0]); + int pad_w = static_cast(info.pads[1]); + + int total_output_elements = output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool2d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool2d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool2d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 3) { + // 3D平均池化 + int input_depth = static_cast(info.input_dims[0]); + int input_height = static_cast(info.input_dims[1]); + int input_width = static_cast(info.input_dims[2]); + int output_depth = static_cast(info.output_dims[0]); + int output_height = static_cast(info.output_dims[1]); + int output_width = static_cast(info.output_dims[2]); + int kernel_d = static_cast(info.kernel_sizes[0]); + int kernel_h = static_cast(info.kernel_sizes[1]); + int kernel_w = static_cast(info.kernel_sizes[2]); + int stride_d = static_cast(info.strides[0]); + int stride_h = static_cast(info.strides[1]); + int stride_w = static_cast(info.strides[2]); + int pad_d = static_cast(info.pads[0]); + int pad_h = static_cast(info.pads[1]); + int pad_w = 
static_cast(info.pads[2]); + + int total_output_elements = output_depth * output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool3d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool3d_pytorch_compatible_kernel<<>>( + static_cast(input), static_cast(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool3d_pytorch_compatible_kernel<__hpcc_bfloat16><<>>( + static_cast(input), static_cast<__hpcc_bfloat16*>(output), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else { + return INFINI_STATUS_BAD_PARAM; + } + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_desc, hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + hcdnnDataType_t hcdnn_data_type) { + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims = {static_cast(info.batch), static_cast(info.channels)}; + std::vector output_dims = {static_cast(info.batch), static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + output_dims.push_back(static_cast(info.output_dims[i])); + } + while (input_dims.size() < 5) input_dims.push_back(1); + while (output_dims.size() < 5) output_dims.push_back(1); + std::vector input_strides(input_dims.size(), 1); + std::vector output_strides(output_dims.size(), 1); + for (int i = input_dims.size() - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + output_strides[i] = output_strides[i + 1] * output_dims[i + 1]; + } + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor(input_desc, hcdnn_data_type, + input_dims.size(), input_dims.data(), input_strides.data())); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor(output_desc, hcdnn_data_type, + output_dims.size(), output_dims.data(), output_strides.data())); + + return 
INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_size, strides, pads; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_size.push_back(static_cast(info.kernel_sizes[i])); + strides.push_back(static_cast(info.strides[i])); + pads.push_back(static_cast(info.pads[i])); + } + while (kernel_size.size() < 3) kernel_size.push_back(1); + while (strides.size() < 3) strides.push_back(1); + while (pads.size() < 3) pads.push_back(0); + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor(pooling_desc, HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + HCDNN_NOT_PROPAGATE_NAN, kernel_size.size(), + kernel_size.data(), pads.data(), strides.data())); + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + workspace_size = 0; + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { +#ifdef ENABLE_HCDNN_API + if (_info.has_implicit_padding) { + // 使用自定义kernel实现PyTorch兼容的逻辑 + return launch_avgpool_pytorch_kernel(_info, input, output, _dtype, (hcStream_t)stream); + } else { + const float alpha = 1.0f, beta = 0.0f; + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + CHECK_MCDNN(hcdnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + 
_opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; + } +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::metax diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool.cu b/src/infiniop/ops/averagepool/nvidia/averagepool.cu new file mode 100644 index 000000000..6f276aac8 --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool.cu @@ -0,0 +1,220 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "averagepool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolInfo &info) { + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( 
+ pooling_desc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, + CUDNN_NOT_PROPAGATE_NAN, kernel_vec.size(), kernel_vec.data(), + pad_vec.data(), stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(AvgPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Average pooling typically doesn't need a workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AvgPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool::nvidia diff --git a/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh new file mode 100644 index 000000000..ef19aa1dc --- /dev/null +++ b/src/infiniop/ops/averagepool/nvidia/averagepool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_CUDA_CUH__ 
+#define __AVERAGEPOOL_CUDA_CUH__ + +#include "../averagepool.h" + +DESCRIPTOR(nvidia) + +#endif // __AVERAGEPOOL_CUDA_CUH__ diff --git a/src/infiniop/ops/averagepool/operator.cc b/src/infiniop/ops/averagepool/operator.cc new file mode 100644 index 000000000..5d72af8f8 --- /dev/null +++ b/src/infiniop/ops/averagepool/operator.cc @@ -0,0 +1,155 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/averagepool.h" + +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/averagepool_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/averagepool_metax.h" +#endif +#ifdef ENABLE_CPU_API +#include "cpu/averagepool_cpu.h" +#endif + +__C infiniStatus_t infiniopCreateAvgPoolDescriptor( + infiniopHandle_t handle, + infiniopAvgPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::averagepool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPoolWorkspaceSize( + infiniopAvgPoolDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopAvgPool( + infiniopAvgPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAvgPoolDescriptor(infiniopAvgPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); 
+#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/averagepool_backward/averagepool_backward.h b/src/infiniop/ops/averagepool_backward/averagepool_backward.h new file mode 100644 index 000000000..6322c3d92 --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/averagepool_backward.h @@ -0,0 +1,55 @@ +#ifndef __AVERAGEPOOL_BACKWARD_H__ +#define __AVERAGEPOOL_BACKWARD_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::averagepool_backward::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + AvgPoolBackwardInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + AvgPoolBackwardInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t grad_input_desc, \ + infiniopTensorDescriptor_t grad_output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *grad_input, \ + const void *grad_output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __AVERAGEPOOL_BACKWARD_H__ diff --git a/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.cc b/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.cc new file mode 100644 index 000000000..399d005ee --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.cc @@ -0,0 +1,390 @@ +#include "averagepool_backward_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::averagepool_backward::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + AvgPoolBackwardInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const AvgPoolBackwardInfo &avgpool_info) + : handle(handle_ptr), info(avgpool_info) { + workspace_size = 0; + } + + template + void _avgpool_backward_1d(T_out *grad_input, const T_in *grad_output) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + const size_t input_nc_stride = input_width; + const size_t output_nc_stride = output_width; + + size_t grad_input_nelem = info.batch * info.channels * input_width; + memset(grad_input, 0, grad_input_nelem * sizeof(T_out)); + +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t grad_output_offset = (b * channels + c) * output_nc_stride; + const size_t grad_input_offset = (b * channels + c) * input_nc_stride; + + for (size_t ow = 0; ow < output_width; ++ow) { + float grad_value = 
utils::cast(grad_output[grad_output_offset + ow]); + + int valid_count = 0; + const int window_start = static_cast(ow * stride_width) - static_cast(pad_width); + const int window_end = window_start + static_cast(kernel_width); + + for (int iw = window_start; iw < window_end; ++iw) { + if (iw >= 0 && iw < static_cast(input_width)) { + valid_count++; + } else if (iw >= -static_cast(pad_width) && + iw < static_cast(input_width + pad_width)) { + valid_count++; + } + } + + if (valid_count > 0) { + float grad_distribute = grad_value / static_cast(valid_count); + for (int iw = window_start; iw < window_end; ++iw) { + if (iw >= 0 && iw < static_cast(input_width)) { + grad_input[grad_input_offset + iw] += utils::cast(grad_distribute); + } + } + } + } + } + } + } + + template + void _avgpool_backward_2d(T_out *grad_input, const T_in *grad_output) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_h = info.pads[0]; + size_t pad_w = info.pads[1]; + + const size_t input_nc_stride = input_height * input_width; + const size_t output_nc_stride = output_height * output_width; + + size_t grad_input_nelem = info.batch * info.channels * input_height * input_width; + memset(grad_input, 0, grad_input_nelem * sizeof(T_out)); + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t grad_output_offset = (b * channels + c) * output_nc_stride; + const size_t grad_input_offset = (b * channels + c) * input_nc_stride; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float grad_value = utils::cast(grad_output[grad_output_offset + oh * output_width + ow]); + + int valid_count = 0; + const int start_h = static_cast(oh * stride_height) - static_cast(pad_h); + const int start_w = static_cast(ow * stride_width) - static_cast(pad_w); + + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + valid_count++; + } else if (ih >= -static_cast(pad_h) && + ih < static_cast(input_height + pad_h) && + iw >= -static_cast(pad_w) && + iw < static_cast(input_width + pad_w)) { + valid_count++; + } + } + } + + if (valid_count > 0) { + float grad_distribute = grad_value / static_cast(valid_count); + for (int kh = 0; kh < static_cast(kernel_height); ++kh) { + for (int kw = 0; kw < static_cast(kernel_width); ++kw) { + const int ih = start_h + kh; + const int iw = start_w + kw; + if (ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + grad_input[grad_input_offset + ih * input_width + iw] += utils::cast(grad_distribute); + } + } + } + } + } + } + } + } + } + + template + void _avgpool_backward_3d(T_out *grad_input, const T_in *grad_output) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + 
size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_d = info.kernel_sizes[0]; + size_t kernel_h = info.kernel_sizes[1]; + size_t kernel_w = info.kernel_sizes[2]; + size_t stride_d = info.strides[0]; + size_t stride_h = info.strides[1]; + size_t stride_w = info.strides[2]; + size_t pad_d = info.pads[0]; + size_t pad_h = info.pads[1]; + size_t pad_w = info.pads[2]; + + const size_t input_nc_stride = input_depth * input_height * input_width; + const size_t output_nc_stride = output_depth * output_height * output_width; + + size_t grad_input_nelem = info.batch * info.channels * input_depth * input_height * input_width; + memset(grad_input, 0, grad_input_nelem * sizeof(T_out)); + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + const size_t grad_output_offset = (b * channels + c) * output_nc_stride; + const size_t grad_input_offset = (b * channels + c) * input_nc_stride; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float grad_value = utils::cast(grad_output[grad_output_offset + od * output_height * output_width + oh * output_width + ow]); + + int valid_count = 0; + const int start_d = static_cast(od * stride_d) - static_cast(pad_d); + const int start_h = static_cast(oh * stride_h) - static_cast(pad_h); + const int start_w = static_cast(ow * stride_w) - static_cast(pad_w); + + for (int kd = 0; kd < static_cast(kernel_d); ++kd) { + for (int kh = 0; kh < static_cast(kernel_h); ++kh) { + for (int kw = 0; kw < static_cast(kernel_w); ++kw) { + const int id = start_d + kd; + const int ih = start_h + kh; + const int iw = start_w + kw; + + if (id >= 0 && id < static_cast(input_depth) && + ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + valid_count++; + } else if (id >= -static_cast(pad_d) && + id < static_cast(input_depth + pad_d) && + ih >= -static_cast(pad_h) && + ih < static_cast(input_height + pad_h) && + iw >= -static_cast(pad_w) && + iw < static_cast(input_width + pad_w)) { + valid_count++; + } + } + } + } + + if (valid_count > 0) { + float grad_distribute = grad_value / static_cast(valid_count); + for (int kd = 0; kd < static_cast(kernel_d); ++kd) { + for (int kh = 0; kh < static_cast(kernel_h); ++kh) { + for (int kw = 0; kw < static_cast(kernel_w); ++kw) { + const int id = start_d + kd; + const int ih = start_h + kh; + const int iw = start_w + kw; + if (id >= 0 && id < static_cast(input_depth) && + ih >= 0 && ih < static_cast(input_height) && + iw >= 0 && iw < static_cast(input_width)) { + grad_input[grad_input_offset + id * input_height * input_width + ih * input_width + iw] += utils::cast(grad_distribute); + } + } + } + } + } + } + } + } + } + } + } + + template + void _avgpool_backward_cpu(T_out *grad_input, const T_in *grad_output) const { + switch (info.ndim) { + case 1: + _avgpool_backward_1d(grad_input, grad_output); + break; + case 2: + _avgpool_backward_2d(grad_input, grad_output); + break; + case 3: + _avgpool_backward_3d(grad_input, grad_output); + break; + default: + break; + } + } +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + 
+    create(device::cpu::Handle *handle_ptr,
+           AvgPoolBackwardInfo &info) {
+        Opaque opaque(handle_ptr, info);
+        return utils::Result<Opaque>(std::move(opaque));
+    }
+
+    infiniStatus_t calculate(void *workspace, size_t workspace_size,
+                             void *grad_input, const void *grad_output,
+                             const void *input, infiniDtype_t dtype) const {
+        if (!grad_input || !grad_output) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+
+        size_t grad_input_nelem = info.batch * info.channels * info.input_dims[0];
+        if (info.ndim > 1) {
+            grad_input_nelem *= info.input_dims[1];
+        }
+        if (info.ndim > 2) {
+            grad_input_nelem *= info.input_dims[2];
+        }
+
+        switch (dtype) {
+        case INFINI_DTYPE_F32: {
+            float *typed_grad_input = static_cast<float *>(grad_input);
+            const float *typed_grad_output = static_cast<const float *>(grad_output);
+            _avgpool_backward_cpu(typed_grad_input, typed_grad_output);
+            break;
+        }
+        case INFINI_DTYPE_F16: {
+            float *typed_grad_input_f32 = static_cast<float *>(workspace);
+            const fp16_t *typed_grad_output = static_cast<const fp16_t *>(grad_output);
+
+            _avgpool_backward_cpu(typed_grad_input_f32, typed_grad_output);
+
+            fp16_t *typed_grad_input = static_cast<fp16_t *>(grad_input);
+#pragma omp parallel for
+            for (size_t i = 0; i < grad_input_nelem; ++i) {
+                typed_grad_input[i] = utils::cast<fp16_t>(typed_grad_input_f32[i]);
+            }
+            break;
+        }
+        case INFINI_DTYPE_BF16: {
+            float *typed_grad_input_f32 = static_cast<float *>(workspace);
+            const bf16_t *typed_grad_output = static_cast<const bf16_t *>(grad_output);
+
+            _avgpool_backward_cpu(typed_grad_input_f32, typed_grad_output);
+
+            bf16_t *typed_grad_input = static_cast<bf16_t *>(grad_input);
+#pragma omp parallel for
+            for (size_t i = 0; i < grad_input_nelem; ++i) {
+                typed_grad_input[i] = utils::cast<bf16_t>(typed_grad_input_f32[i]);
+            }
+            break;
+        }
+        default:
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        return INFINI_STATUS_SUCCESS;
+    }
+};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+    }
+}
+
+inline size_t calculateOutputSize(const AvgPoolBackwardInfo &info) {
+    size_t size = info.batch * info.channels;
+    for (size_t i = 0; i < info.ndim; ++i) {
+        size *= info.input_dims[i];
+    }
+    return size;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t grad_input_desc,
+    infiniopTensorDescriptor_t grad_output_desc,
+    infiniopTensorDescriptor_t input_desc,
+    void *kernel_size,
+    void *strides,
+    void *pads,
+    bool ceil_mode) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = grad_input_desc->dtype();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);
+
+    auto result = AvgPoolBackwardInfo::create(
+        grad_input_desc, grad_output_desc, input_desc, kernel_size, strides, pads, ceil_mode);
+    CHECK_RESULT(result);
+    auto info = result.take();
+
+    auto opaque_result = Opaque::create(handle, info);
+    CHECK_RESULT(opaque_result);
+    auto opaque = new Opaque(opaque_result.take());
+
+    size_t workspace_size = 0;
+    if (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16) {
+        workspace_size = calculateOutputSize(info) * sizeof(float);
+    }
+
+    *desc_ptr = new Descriptor(dtype, std::move(info), workspace_size,
+                               opaque, handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *grad_input,
+    const void *grad_output,
+    const void *input,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    return _opaque->calculate(workspace, workspace_size, grad_input, grad_output, input, _dtype);
+}
+
+} // namespace op::averagepool_backward::cpu
diff --git a/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.h b/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.h
new file mode 100644
index 000000000..f83f70cbc
--- /dev/null
+++ b/src/infiniop/ops/averagepool_backward/cpu/averagepool_backward_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __AVERAGEPOOL_BACKWARD_CPU_H__
+#define __AVERAGEPOOL_BACKWARD_CPU_H__
+
+#include "../averagepool_backward.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __AVERAGEPOOL_BACKWARD_CPU_H__
diff --git a/src/infiniop/ops/averagepool_backward/cuda/averagepool_backward_kernel.cuh b/src/infiniop/ops/averagepool_backward/cuda/averagepool_backward_kernel.cuh
new file mode 100644
index 000000000..0394d62dd
--- /dev/null
+++ b/src/infiniop/ops/averagepool_backward/cuda/averagepool_backward_kernel.cuh
@@ -0,0 +1,177 @@
+#ifndef __AVERAGEPOOL_BACKWARD_KERNEL_H__
+#define __AVERAGEPOOL_BACKWARD_KERNEL_H__
+
+#include <cuda_runtime.h>
+
+template <typename T>
+__global__ void
+avgpool1d_pytorch_backward_kernel(const T *grad_output, T *grad_input,
+                                  int batch_size, int channels,
+                                  int input_length, int output_length,
+                                  int kernel_size, int stride, int padding) {
+
+    int batch_idx = blockIdx.x;
+    int channel_idx = blockIdx.y;
+    int output_idx = blockIdx.z * blockDim.x + threadIdx.x;
+
+    if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= output_length) {
+        return;
+    }
+
+    const T *grad_output_ptr = grad_output + batch_idx * channels * output_length + channel_idx * output_length;
+    T *grad_input_ptr = grad_input + batch_idx * channels * input_length + channel_idx * input_length;
+
+    // Read the gradient value for this output element
+    float grad = static_cast<float>(grad_output_ptr[output_idx]);
+    int window_start = output_idx * stride - padding;
+
+    int pool_size = 0;
+    for (int k = 0; k < kernel_size; ++k) {
+        int input_pos = window_start + k;
+        if ((input_pos >= 0 && input_pos < input_length) || (input_pos >= -padding && input_pos < input_length + padding)) {
+            pool_size++;
+        }
+    }
+
+    // Guard against the degenerate case of an empty window (avoid division by zero)
+    if (pool_size == 0) {
+        return;
+    }
+
+    float grad_per_input = grad / static_cast<float>(pool_size);
+    for (int k = 0; k < kernel_size; ++k) {
+        int input_pos = window_start + k;
+        if (input_pos >= 0 && input_pos < input_length) {
+            // Atomically add the distributed gradient to the input gradient tensor
+            atomicAdd(&grad_input_ptr[input_pos], static_cast<T>(grad_per_input));
+        }
+    }
+}
+
+template <typename T>
+__global__ void avgpool2d_pytorch_backward_kernel(
+    const T *grad_output, T *grad_input, int batch_size, int channels,
+    int input_height, int input_width, int output_height, int output_width,
+    int kernel_h, int kernel_w, int stride_h, int stride_w, int pad_h,
+    int pad_w) {
+
+    int batch_idx = blockIdx.x;
+    int channel_idx = blockIdx.y;
+    int output_idx = blockIdx.z * blockDim.x + threadIdx.x;
+
+    int total_output_elements = output_height * output_width;
+    if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) {
+        return;
+    }
+
+    // Convert the linear output index to 2D coordinates
+    int out_h = output_idx / output_width;
+    int out_w = output_idx % output_width;
+
+    const T *grad_output_ptr = grad_output + batch_idx * channels * total_output_elements + channel_idx * total_output_elements;
+    T *grad_input_ptr = grad_input + batch_idx * channels * input_height * input_width + channel_idx * input_height * input_width;
+
+    float grad = static_cast<float>(grad_output_ptr[output_idx]);
+    int window_start_h = out_h * stride_h - pad_h;
+    int window_start_w = out_w * stride_w - pad_w;
+
+    int pool_size = 0;
+    for (int kh = 0; kh < kernel_h; ++kh) {
+        for (int kw = 0; kw < kernel_w; ++kw) {
+            int input_h = window_start_h + kh;
+            int input_w = window_start_w + kw;
+            if ((input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) || (input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w)) {
+                pool_size++;
+            }
+        }
+    }
+
+    if (pool_size == 0) {
+        return;
+    }
+
+    float grad_per_input = grad / static_cast<float>(pool_size);
+
+    for (int kh = 0; kh < kernel_h; ++kh) {
+        for (int kw = 0; kw < kernel_w; ++kw) {
+            int input_h = window_start_h + kh;
+            int input_w = window_start_w + kw;
+
+            if (input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) {
+                int input_idx = input_h * input_width + input_w;
+                atomicAdd(&grad_input_ptr[input_idx], static_cast<T>(grad_per_input));
+            }
+        }
+    }
+}
+
+template <typename T>
+__global__ void avgpool3d_pytorch_backward_kernel(
+    const T *grad_output, T *grad_input, int batch_size, int channels,
+    int input_depth, int input_height, int input_width, int output_depth,
+    int output_height, int output_width, int kernel_d, int kernel_h,
+    int kernel_w, int stride_d, int stride_h, int stride_w, int pad_d,
+    int pad_h, int pad_w) {
+
+    int batch_idx = blockIdx.x;
+    int channel_idx = blockIdx.y;
+    int output_idx = blockIdx.z * blockDim.x + threadIdx.x;
+
+    int total_output_elements = output_depth * output_height * output_width;
+    if (batch_idx >= batch_size || channel_idx >= channels || output_idx >= total_output_elements) {
+        return;
+    }
+
+    // Convert the linear output index to 3D coordinates
+    int out_d = output_idx / (output_height * output_width);
+    int remaining = output_idx % (output_height * output_width);
+    int out_h = remaining / output_width;
+    int out_w = remaining % output_width;
+
+    int input_spatial_size = input_depth * input_height * input_width;
+    const T *grad_output_ptr = grad_output + batch_idx * channels * total_output_elements + channel_idx * total_output_elements;
+    T *grad_input_ptr = grad_input + batch_idx * channels * input_spatial_size + channel_idx * input_spatial_size;
+
+    float grad = static_cast<float>(grad_output_ptr[output_idx]);
+    int window_start_d = out_d * stride_d - pad_d;
+    int window_start_h = out_h * stride_h - pad_h;
+    int window_start_w = out_w * stride_w - pad_w;
+
+    int pool_size = 0;
+    for (int kd = 0; kd < kernel_d; ++kd) {
+        for (int kh = 0; kh < kernel_h; ++kh) {
+            for (int kw = 0; kw < kernel_w; ++kw) {
+                int input_d = window_start_d + kd;
+                int input_h = window_start_h + kh;
+                int input_w = window_start_w + kw;
+
+                if ((input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) || (input_d >= -pad_d && input_d < input_depth + pad_d && input_h >= -pad_h && input_h < input_height + pad_h && input_w >= -pad_w && input_w < input_width + pad_w)) {
+                    pool_size++;
+                }
+            }
+        }
+    }
+
+    if (pool_size == 0) {
+        return;
+    }
+
+    float grad_per_input = grad / static_cast<float>(pool_size);
+
+    for (int kd = 0; kd < kernel_d; ++kd) {
+        for (int kh = 0; kh < kernel_h; ++kh) {
+            for (int kw = 0; kw < kernel_w; ++kw) {
+                int input_d = window_start_d + kd;
+                int input_h = window_start_h + kh;
+                int input_w = window_start_w + kw;
+
+                if (input_d >= 0 && input_d < input_depth && input_h >= 0 && input_h < input_height && input_w >= 0 && input_w < input_width) {
+                    int input_idx = (input_d * input_height + input_h) * input_width + input_w;
+                    atomicAdd(&grad_input_ptr[input_idx], static_cast<T>(grad_per_input));
+                }
+            }
+        }
+    }
+}
+
+#endif // __AVERAGEPOOL_BACKWARD_KERNEL_H__
diff --git a/src/infiniop/ops/averagepool_backward/info.h b/src/infiniop/ops/averagepool_backward/info.h
new file mode 100644
index 000000000..8927864b8
--- /dev/null
+++ b/src/infiniop/ops/averagepool_backward/info.h
@@ -0,0 +1,100 @@
+#ifndef __AVERAGEPOOL_BACKWARD_INFO_H__
+#define __AVERAGEPOOL_BACKWARD_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::averagepool_backward {
+
+// Check whether ceil_mode introduces implicit padding in one spatial dimension
+inline bool hasImplicitPadding(
+    size_t input_size,
+    size_t kernel_size,
+    size_t stride,
+    size_t padding,
+    bool ceil_mode) {
+
+    if (!ceil_mode) {
+        return false;
+    }
+    return ((input_size + 2 * padding) - kernel_size) % stride != 0;
+}
+
+class AvgPoolBackwardInfo {
+    AvgPoolBackwardInfo() = default;
+
+public:
+    std::vector<size_t> input_dims;  // original input dimensions
+    std::vector<size_t> output_dims; // pooled output dimensions
+    std::vector<size_t> kernel_sizes;
+    std::vector<size_t> strides;
+    std::vector<size_t> pads;
+    bool ceil_mode;
+    size_t ndim;
+    size_t batch;
+    size_t channels;
+    bool has_implicit_padding = false;
+
+    static utils::Result<AvgPoolBackwardInfo> create(
+        infiniopTensorDescriptor_t grad_input_desc,  // gradient w.r.t. input
+        infiniopTensorDescriptor_t grad_output_desc, // gradient w.r.t. output
+        infiniopTensorDescriptor_t input_desc,       // original input from forward pass
+        void *kernel_size,
+        void *strides,
+        void *pads,
+        bool ceil_mode) {
+
+        AvgPoolBackwardInfo info;
+
+        if (input_desc->ndim() < 3 || input_desc->ndim() > 5) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        if (input_desc->ndim() != grad_input_desc->ndim() || grad_output_desc->ndim() != grad_input_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        if (input_desc->dim(0) != grad_input_desc->dim(0) || input_desc->dim(1) != grad_input_desc->dim(1) || grad_output_desc->dim(0) != grad_input_desc->dim(0) || grad_output_desc->dim(1) != grad_input_desc->dim(1)) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        for (size_t i = 2; i < input_desc->ndim(); ++i) {
+            if (input_desc->dim(i) != grad_input_desc->dim(i)) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        info.ndim = input_desc->ndim() - 2;
+        info.batch = input_desc->dim(0);
+        info.channels = input_desc->dim(1);
+        info.ceil_mode = ceil_mode;
+
+        auto kernel_ptr = reinterpret_cast<const size_t *>(kernel_size);
+        auto stride_ptr = reinterpret_cast<const size_t *>(strides);
+        auto pad_ptr = reinterpret_cast<const size_t *>(pads);
+
+        // Initialize the implicit-padding flag
+        info.has_implicit_padding = false;
+        for (size_t i = 0; i < info.ndim; ++i) {
+            info.input_dims.push_back(input_desc->dim(i + 2));
+            info.output_dims.push_back(grad_output_desc->dim(i + 2));
+            info.kernel_sizes.push_back(kernel_ptr[i]);
+            info.strides.push_back(stride_ptr[i]);
+            info.pads.push_back(pad_ptr[i]);
+
+            // Check whether the current dimension has implicit padding
+            if (hasImplicitPadding(info.input_dims[i], info.kernel_sizes[i],
+                                   info.strides[i], info.pads[i], info.ceil_mode)) {
+                info.has_implicit_padding = true;
+            }
+        }
+
+        return utils::Result<AvgPoolBackwardInfo>(std::move(info));
+    }
+};
+
+} // namespace op::averagepool_backward
+
+#endif // __AVERAGEPOOL_BACKWARD_INFO_H__
diff --git a/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.h b/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.h
new file mode 100644
index 000000000..65d1f25fc
--- /dev/null
+++ b/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.h
@@ -0,0 +1,8 @@
+#ifndef __AVERAGEPOOL_BACKWARD_METAX_H__
+#define __AVERAGEPOOL_BACKWARD_METAX_H__
+
+#include "../averagepool_backward.h"
+
+DESCRIPTOR(metax)
+
+#endif // __AVERAGEPOOL_BACKWARD_METAX_H__
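Aside on the has_implicit_padding flag defined in info.h above: it fires when ceil_mode makes the pooled output include a final window that is only partially covered by the input plus its explicit padding, and the Metax backend further below only builds its hcDNN descriptors when the flag is false, dispatching to the hand-written kernels otherwise. The standalone sketch below is not part of the patch; the helper name pooled_len, the example numbers, and the exact ceil-mode rule are assumptions based on PyTorch's documented pooling output-size formula, included only to illustrate what the ((input + 2 * pad) - kernel) % stride != 0 test detects.

#include <cstddef>
#include <cstdio>

// Pooled output length of one spatial dimension, PyTorch-style AvgPool
// (assumes kernel <= input + 2 * pad so the subtraction cannot underflow).
static size_t pooled_len(size_t in, size_t k, size_t s, size_t p, bool ceil_mode) {
    size_t num = in + 2 * p - k;
    size_t out = num / s + 1;          // floor mode
    if (ceil_mode && num % s != 0) {
        ++out;                         // ceil mode adds one partial window...
        if ((out - 1) * s >= in + p) { // ...unless it would start past input + pad
            --out;
        }
    }
    return out;
}

int main() {
    // in=6, k=3, s=2, p=1: floor gives 3 windows, ceil gives 4; the extra
    // window is what hasImplicitPadding() detects ((6 + 2 - 3) % 2 != 0).
    std::printf("floor: %zu, ceil: %zu\n",
                pooled_len(6, 3, 2, 1, false), pooled_len(6, 3, 2, 1, true));
    return 0;
}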
diff --git a/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.maca b/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.maca new file mode 100644 index 000000000..b11f24b99 --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/metax/averagepool_backward_metax.maca @@ -0,0 +1,415 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "averagepool_backward_metax.h" +#include "../cuda/averagepool_backward_kernel.cuh" +#include + +// 自定义核函数 +infiniStatus_t launch_avgpool_pytorch_backward_kernel( + const op::averagepool_backward::AvgPoolBackwardInfo& info, + const void* grad_output, void* grad_input, + infiniDtype_t data_type, hcStream_t stream) { + + // 在累加梯度之前,必须将grad_input张量清零 + size_t grad_input_nelem = info.batch * info.channels; + for (size_t i = 0; i < info.ndim; ++i) grad_input_nelem *= info.input_dims[i]; + + size_t dtype_size = 0; + switch (data_type) { + case INFINI_DTYPE_F32: + dtype_size = sizeof(float); + break; + case INFINI_DTYPE_F16: + dtype_size = sizeof(half); + break; + case INFINI_DTYPE_BF16: + dtype_size = sizeof(__hpcc_bfloat16); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; // Or handle error + } + + size_t grad_input_bytes = grad_input_nelem * dtype_size; + hcMemsetAsync(grad_input, 0, grad_input_bytes, stream); + + int batch_size = static_cast(info.batch); + int channels = static_cast(info.channels); + + if (info.ndim == 1) { + int input_length = static_cast(info.input_dims[0]); + int output_length = static_cast(info.output_dims[0]); + int kernel_size = static_cast(info.kernel_sizes[0]); + int stride = static_cast(info.strides[0]); + int padding = static_cast(info.pads[0]); + + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (output_length + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool1d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_F16: + avgpool1d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + case INFINI_DTYPE_BF16: + avgpool1d_pytorch_backward_kernel<__hpcc_bfloat16><<>>( + static_cast(grad_output), static_cast<__hpcc_bfloat16*>(grad_input), + batch_size, channels, input_length, output_length, + kernel_size, stride, padding); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 2) { + // 2D平均池化 - 后向 + int input_height = static_cast(info.input_dims[0]); + int input_width = static_cast(info.input_dims[1]); + int output_height = static_cast(info.output_dims[0]); + int output_width = static_cast(info.output_dims[1]); + int kernel_h = static_cast(info.kernel_sizes[0]); + int kernel_w = static_cast(info.kernel_sizes[1]); + int stride_h = static_cast(info.strides[0]); + int stride_w = static_cast(info.strides[1]); + int pad_h = static_cast(info.pads[0]); + int pad_w = static_cast(info.pads[1]); + + int total_output_elements = output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool2d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_height, input_width, + output_height, 
output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool2d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool2d_pytorch_backward_kernel<__hpcc_bfloat16><<>>( + static_cast(grad_output), static_cast<__hpcc_bfloat16*>(grad_input), + batch_size, channels, input_height, input_width, + output_height, output_width, kernel_h, kernel_w, + stride_h, stride_w, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else if (info.ndim == 3) { + // 3D平均池化 - 后向 + int input_depth = static_cast(info.input_dims[0]); + int input_height = static_cast(info.input_dims[1]); + int input_width = static_cast(info.input_dims[2]); + int output_depth = static_cast(info.output_dims[0]); + int output_height = static_cast(info.output_dims[1]); + int output_width = static_cast(info.output_dims[2]); + int kernel_d = static_cast(info.kernel_sizes[0]); + int kernel_h = static_cast(info.kernel_sizes[1]); + int kernel_w = static_cast(info.kernel_sizes[2]); + int stride_d = static_cast(info.strides[0]); + int stride_h = static_cast(info.strides[1]); + int stride_w = static_cast(info.strides[2]); + int pad_d = static_cast(info.pads[0]); + int pad_h = static_cast(info.pads[1]); + int pad_w = static_cast(info.pads[2]); + + int total_output_elements = output_depth * output_height * output_width; + dim3 blockSize(256); + dim3 gridSize(batch_size, channels, (total_output_elements + blockSize.x - 1) / blockSize.x); + + switch (data_type) { + case INFINI_DTYPE_F32: + avgpool3d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_F16: + avgpool3d_pytorch_backward_kernel<<>>( + static_cast(grad_output), static_cast(grad_input), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + case INFINI_DTYPE_BF16: + avgpool3d_pytorch_backward_kernel<__hpcc_bfloat16><<>>( + static_cast(grad_output), static_cast<__hpcc_bfloat16*>(grad_input), + batch_size, channels, input_depth, input_height, input_width, + output_depth, output_height, output_width, + kernel_d, kernel_h, kernel_w, stride_d, stride_h, stride_w, + pad_d, pad_h, pad_w); + break; + default: + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + } else { + return INFINI_STATUS_BAD_PARAM; + } + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(grad_input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(grad_output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_backward_desc, \ + hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool_backward::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + 
+#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t grad_input_desc = nullptr; + hcdnnTensorDescriptor_t grad_output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_backward_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + void calculateStrides(const std::vector &dims, std::vector &strides, + int ndim) const { + strides[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * dims[d + 1]; + } + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolBackwardInfo &info, + hcdnnDataType_t hcdnn_data_type) { + // 创建hcdnn描述符 + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_backward_desc)); + + // 构建输入、输出梯度维度(NCHW格式) + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + while (input_dims_vec.size() < 5) input_dims_vec.push_back(1); + while (output_dims_vec.size() < 5) output_dims_vec.push_back(1); + + // 计算内存步幅 + std::vector input_strides_vec(input_dims_vec.size()); + std::vector output_strides_vec(output_dims_vec.size()); + calculateStrides(input_dims_vec, input_strides_vec, input_dims_vec.size()); + calculateStrides(output_dims_vec, output_strides_vec, output_dims_vec.size()); + + // 设置张量描述符(带步幅) + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, hcdnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_input_desc, hcdnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_output_desc, hcdnn_data_type, output_dims_vec.size(), + output_dims_vec.data(), output_strides_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolBackwardInfo &info) { + // 构建池化参数 + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + while (kernel_vec.size() < 3) kernel_vec.push_back(1); + while (stride_vec.size() < 3) stride_vec.push_back(1); + while (pad_vec.size() < 3) pad_vec.push_back(0); + + // 设置平均池化反向描述符 + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + pooling_backward_desc, + HCDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, // 平均池化模式 + HCDNN_NOT_PROPAGATE_NAN, // 不传播NaN + kernel_vec.size(), + kernel_vec.data(), + pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeHcdnnContext(AvgPoolBackwardInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // 计算工作空间大小(需要存储前向输出用于反向计算) + CHECK_MCDNN(hcdnnGetTensorSizeInBytes(grad_output_desc, &workspace_size)); + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque 
&&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , input_desc(other.input_desc) + , grad_input_desc(other.grad_input_desc) + , grad_output_desc(other.grad_output_desc) + , pooling_backward_desc(other.pooling_backward_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_output_desc = nullptr; + other.pooling_backward_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolBackwardInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + if (!info.has_implicit_padding) { + auto status = opaque.initializeHcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + auto result = + AvgPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, void *stream) const { +#ifdef ENABLE_HCDNN_API + if (_info.has_implicit_padding) { + return launch_avgpool_pytorch_backward_kernel( + _info, grad_output, grad_input, _dtype, (hcStream_t)stream); + } else { + const float alpha = 1.0f, beta = 0.0f; + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + void *temp_output = workspace; + CHECK_MCDNN(hcdnnPoolingForward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->input_desc, input, + &beta, + _opaque->grad_output_desc, temp_output)); + CHECK_MCDNN(hcdnnPoolingBackward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->grad_output_desc, temp_output, + _opaque->grad_output_desc, grad_output, + _opaque->input_desc, input, + &beta, + _opaque->grad_input_desc, grad_input + )); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; + } +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool_backward::metax diff --git a/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cu b/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cu new file mode 100644 index 000000000..71fcf95d5 --- /dev/null +++ 
b/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cu @@ -0,0 +1,260 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "averagepool_backward_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_backward_desc, \ + cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::averagepool_backward::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t grad_input_desc = nullptr; + cudnnTensorDescriptor_t grad_output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_backward_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + void calculateStrides(const std::vector &dims, std::vector &strides, + int ndim) const { + strides[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * dims[d + 1]; + } + } + + infiniStatus_t createPoolingDescriptors(const AvgPoolBackwardInfo &info, + cudnnDataType_t cudnn_data_type) { + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_backward_desc)); + + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + std::vector input_strides_vec(input_dims_vec.size()); + std::vector output_strides_vec(output_dims_vec.size()); + calculateStrides(input_dims_vec, input_strides_vec, input_dims_vec.size()); + calculateStrides(output_dims_vec, output_strides_vec, + output_dims_vec.size()); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + input_desc, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_input_desc, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_output_desc, cudnn_data_type, 
output_dims_vec.size(), + output_dims_vec.data(), output_strides_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const AvgPoolBackwardInfo &info) { + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_backward_desc, CUDNN_POOLING_AVERAGE_COUNT_INCLUDE_PADDING, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(AvgPoolBackwardInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + CHECK_CUDNN(cudnnGetTensorSizeInBytes(grad_output_desc, &workspace_size)); + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_CUDNN_API + , + input_desc(other.input_desc), grad_input_desc(other.grad_input_desc), grad_output_desc(other.grad_output_desc), pooling_backward_desc(other.pooling_backward_desc) +#endif + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_output_desc = nullptr; + other.pooling_backward_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + AvgPoolBackwardInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = AvgPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, void *stream) 
const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + size_t grad_input_size = 0; + CHECK_CUDNN(cudnnGetTensorSizeInBytes(_opaque->grad_input_desc, + &grad_input_size)); + CHECK_CUDA(cudaMemset(grad_input, 0, grad_input_size)); + CHECK_CUDA(cudaMemset(workspace, 0, _workspace_size)); + + void *temp_output = workspace; + CHECK_CUDNN(cudnnPoolingForward( + handle, _opaque->pooling_backward_desc, &alpha, _opaque->input_desc, + input, &beta, _opaque->grad_output_desc, temp_output)); + + CHECK_CUDNN(cudnnPoolingBackward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->grad_output_desc, temp_output, + _opaque->grad_output_desc, grad_output, + _opaque->input_desc, input, + &beta, + _opaque->grad_input_desc, grad_input)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::averagepool_backward::nvidia diff --git a/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cuh b/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cuh new file mode 100644 index 000000000..b4fa6661e --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/nvidia/averagepool_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __AVERAGEPOOL_BACKWARD_NVIDIA_CUH__ +#define __AVERAGEPOOL_BACKWARD_NVIDIA_CUH__ + +#include "../averagepool_backward.h" + +DESCRIPTOR(nvidia) + +#endif // __AVERAGEPOOL_BACKWARD_NVIDIA_CUH__ diff --git a/src/infiniop/ops/averagepool_backward/operator.cc b/src/infiniop/ops/averagepool_backward/operator.cc new file mode 100644 index 000000000..844c68601 --- /dev/null +++ b/src/infiniop/ops/averagepool_backward/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/averagepool_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/averagepool_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/averagepool_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/averagepool_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateAvgPoolBackwardDescriptor( + infiniopHandle_t handle, + infiniopAvgPoolBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::averagepool_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + grad_output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAvgPoolBackwardWorkspaceSize( + infiniopAvgPoolBackwardDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); 
\ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopAvgPoolBackward( + infiniopAvgPoolBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *grad_output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + grad_input, \ + grad_output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAvgPoolBackwardDescriptor(infiniopAvgPoolBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/conv_backward/conv_backward.h b/src/infiniop/ops/conv_backward/conv_backward.h new file mode 100644 index 000000000..1c105af8c --- /dev/null +++ b/src/infiniop/ops/conv_backward/conv_backward.h @@ -0,0 +1,47 @@ +#ifndef __CONV_BACKWARD_H__ +#define __CONV_BACKWARD_H__ + +#include "../../operator.h" + +#define DESCRIPTOR(NAMESPACE) \ + namespace op::conv_backward::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + Descriptor( \ + infiniDtype_t dtype, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + size_t workspaceSize() const { return _workspace_size; } \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t grad_output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t weight_desc, \ + infiniopTensorDescriptor_t bias_desc, \ + void *pads, \ + void *strides, \ + void *dilations, \ + size_t groups); \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *grad_input, void *grad_weight, void *grad_bias, \ + const void *grad_output, \ + const void *input, const void *weight, \ + void *stream) const; \ + }; \ + } + +#endif // __CONV_BACKWARD_H__ diff --git a/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.cc 
b/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.cc new file mode 100644 index 000000000..e20ee140c --- /dev/null +++ b/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.cc @@ -0,0 +1,517 @@ +#include "conv_backward_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::conv_backward::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + op::conv_backward::ConvBackwardInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, + const op::conv_backward::ConvBackwardInfo &conv_info) + : handle(handle_ptr), info(conv_info) { + workspace_size = 0; + } + + // 递归函数:计算数据梯度的N维卷积反向传播 + template + void _applyDataGradient( + size_t grad_out_index, size_t weight_index, size_t grad_in_index, + size_t ndim, const GradOutData *grad_output, const WeightData *weight, + GradInData *grad_input, const size_t *grad_in_shape) const { + + if (ndim >= info.ndim + 2) { + // 到达最深层,执行实际计算 + // 始终使用float精度进行计算,避免半精度累积误差 + float grad_out_f32 = utils::cast(grad_output[grad_out_index]); + float weight_f32 = utils::cast(weight[weight_index]); + float current_grad_in = utils::cast(grad_input[grad_in_index]); + float result = current_grad_in + grad_out_f32 * weight_f32; + grad_input[grad_in_index] = utils::cast(result); + return; + } + + size_t dim_idx = ndim - 2; + size_t grad_out_dim = info.grad_output_dims[dim_idx]; + size_t weight_dim = info.weight_dims[dim_idx]; + size_t grad_in_dim = grad_in_shape[ndim]; + size_t stride = info.strides[dim_idx]; + size_t pad = info.pads[dim_idx]; + size_t dilation = info.dilations[dim_idx]; + + // 遍历输出维度 + for (size_t oh = 0; oh < grad_out_dim; ++oh) { + size_t curr_grad_out_index = grad_out_index * grad_out_dim + oh; + + // 遍历卷积核维度 + for (size_t kh = 0; kh < weight_dim; ++kh) { + size_t curr_weight_index = weight_index * weight_dim + kh; + + // 计算对应的输入位置 + int ih = static_cast(oh * stride + kh * dilation) - static_cast(pad); + + if (ih >= 0 && ih < static_cast(grad_in_dim)) { + size_t curr_grad_in_index = grad_in_index * grad_in_dim + ih; + + _applyDataGradient(curr_grad_out_index, curr_weight_index, curr_grad_in_index, + ndim + 1, grad_output, weight, grad_input, grad_in_shape); + } + } + } + } + + // 递归函数:计算权重梯度的N维卷积反向传播 + template + void _applyWeightGradient( + size_t input_index, size_t grad_out_index, size_t grad_weight_index, + size_t ndim, const InputData *input, const GradOutData *grad_output, + GradWeightData *grad_weight, const size_t *input_shape) const { + + if (ndim >= info.ndim + 2) { + // 到达最深层,执行实际计算 + // 始终使用float精度进行计算,避免半精度累积误差 + float input_f32 = utils::cast(input[input_index]); + float grad_out_f32 = utils::cast(grad_output[grad_out_index]); + float current_grad_weight = utils::cast(grad_weight[grad_weight_index]); + float result = current_grad_weight + input_f32 * grad_out_f32; + grad_weight[grad_weight_index] = utils::cast(result); + return; + } + + size_t dim_idx = ndim - 2; + size_t input_dim = input_shape[ndim]; + size_t grad_out_dim = info.grad_output_dims[dim_idx]; + size_t weight_dim = info.weight_dims[dim_idx]; + size_t stride = info.strides[dim_idx]; + size_t pad = info.pads[dim_idx]; + size_t dilation = info.dilations[dim_idx]; + + // 遍历卷积核维度 + for (size_t kh = 0; kh < weight_dim; ++kh) { + size_t curr_grad_weight_index = grad_weight_index * weight_dim + kh; + + // 遍历输出维度 + for (size_t oh = 0; oh < grad_out_dim; ++oh) { + size_t curr_grad_out_index = grad_out_index * 
grad_out_dim + oh; + + // 计算对应的输入位置 + int ih = static_cast(oh * stride + kh * dilation) - static_cast(pad); + + if (ih >= 0 && ih < static_cast(input_dim)) { + size_t curr_input_index = input_index * input_dim + ih; + + _applyWeightGradient(curr_input_index, curr_grad_out_index, curr_grad_weight_index, + ndim + 1, input, grad_output, grad_weight, input_shape); + } + } + } + } + + // 获取零值 + template + static T get_zero() { + if constexpr (std::is_same::value) { + return 0.0f; + } else if constexpr (std::is_same::value) { + return _f32_to_f16(0.0f); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(0.0f); + } else { + return T{}; + } + } + + // 计算数据梯度 (grad_input) - 使用更直接的实现避免递归 + template + void compute_data_gradient(GradInData *grad_input, const GradOutData *grad_output, + const WeightData *weight) const { + + size_t batch_size = info.batch; + size_t in_channels = info.in_channels; + size_t out_channels = info.out_channels; + size_t groups = info.groups; + size_t channels_per_group = in_channels / groups; + size_t out_channels_per_group = out_channels / groups; + + // 计算空间大小 + size_t input_spatial_size = 1; + size_t output_spatial_size = 1; + for (size_t i = 0; i < info.ndim; ++i) { + input_spatial_size *= info.input_dims[i]; + output_spatial_size *= info.grad_output_dims[i]; + } + + // 初始化为零 + size_t total_grad_input_size = batch_size * in_channels * input_spatial_size; + GradInData zero_val = get_zero(); + std::fill(grad_input, grad_input + total_grad_input_size, zero_val); + + // 对每个批次和组并行处理 +#pragma omp parallel for collapse(2) schedule(dynamic) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t g = 0; g < groups; ++g) { + // 对每个输出通道 + for (size_t oc = 0; oc < out_channels_per_group; ++oc) { + size_t abs_oc = g * out_channels_per_group + oc; + + // 对每个输入通道 + for (size_t ic = 0; ic < channels_per_group; ++ic) { + size_t abs_ic = g * channels_per_group + ic; + + // 对每个输出空间位置 + for (size_t out_spatial = 0; out_spatial < output_spatial_size; ++out_spatial) { + + // 将一维空间索引转换为多维坐标 + std::vector out_coords(info.ndim); + size_t temp = out_spatial; + for (int d = info.ndim - 1; d >= 0; --d) { + out_coords[d] = temp % info.grad_output_dims[d]; + temp /= info.grad_output_dims[d]; + } + + // 对每个卷积核空间位置 + size_t kernel_spatial_size = 1; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_spatial_size *= info.weight_dims[i]; + } + + for (size_t kernel_spatial = 0; kernel_spatial < kernel_spatial_size; ++kernel_spatial) { + + // 将一维卷积核索引转换为多维坐标 + std::vector kernel_coords(info.ndim); + temp = kernel_spatial; + for (int d = info.ndim - 1; d >= 0; --d) { + kernel_coords[d] = temp % info.weight_dims[d]; + temp /= info.weight_dims[d]; + } + + // 计算对应的输入坐标 + std::vector input_coords(info.ndim); + bool valid = true; + + for (size_t d = 0; d < info.ndim; ++d) { + input_coords[d] = static_cast(out_coords[d] * info.strides[d] + kernel_coords[d] * info.dilations[d]) - static_cast(info.pads[d]); + + if (input_coords[d] < 0 || input_coords[d] >= static_cast(info.input_dims[d])) { + valid = false; + break; + } + } + + if (valid) { + // 计算线性索引 + size_t grad_out_idx = b * out_channels * output_spatial_size + abs_oc * output_spatial_size + out_spatial; + + size_t weight_idx = abs_oc * channels_per_group * kernel_spatial_size + ic * kernel_spatial_size + kernel_spatial; + + size_t input_spatial_idx = 0; + size_t multiplier = 1; + for (int d = info.ndim - 1; d >= 0; --d) { + input_spatial_idx += input_coords[d] * multiplier; + multiplier *= info.input_dims[d]; + } + + size_t grad_in_idx = b * 
in_channels * input_spatial_size + abs_ic * input_spatial_size + input_spatial_idx; + + // 执行计算 + float grad_out_f32 = utils::cast(grad_output[grad_out_idx]); + float weight_f32 = utils::cast(weight[weight_idx]); + float current_grad_in = utils::cast(grad_input[grad_in_idx]); + float result = current_grad_in + grad_out_f32 * weight_f32; + grad_input[grad_in_idx] = utils::cast(result); + } + } + } + } + } + } + } + } + + // 计算权重梯度 (grad_weight) - 使用更直接的实现 + template + void compute_weight_gradient(GradWeightData *grad_weight, const GradOutData *grad_output, + const InputData *input) const { + + size_t batch_size = info.batch; + size_t in_channels = info.in_channels; + size_t out_channels = info.out_channels; + size_t groups = info.groups; + size_t channels_per_group = in_channels / groups; + size_t out_channels_per_group = out_channels / groups; + + // 计算空间大小 + size_t input_spatial_size = 1; + size_t output_spatial_size = 1; + size_t kernel_spatial_size = 1; + for (size_t i = 0; i < info.ndim; ++i) { + input_spatial_size *= info.input_dims[i]; + output_spatial_size *= info.grad_output_dims[i]; + kernel_spatial_size *= info.weight_dims[i]; + } + + // 初始化为零 + size_t total_weight_size = out_channels * channels_per_group * kernel_spatial_size; + GradWeightData zero_val = get_zero(); + std::fill(grad_weight, grad_weight + total_weight_size, zero_val); + + // 对每个权重元素并行处理 +#pragma omp parallel for collapse(3) schedule(dynamic) + for (size_t abs_oc = 0; abs_oc < out_channels; ++abs_oc) { + for (size_t ic = 0; ic < channels_per_group; ++ic) { + for (size_t kernel_spatial = 0; kernel_spatial < kernel_spatial_size; ++kernel_spatial) { + + size_t g = abs_oc / out_channels_per_group; + size_t abs_ic = g * channels_per_group + ic; + + // 将一维卷积核索引转换为多维坐标 + std::vector kernel_coords(info.ndim); + size_t temp = kernel_spatial; + for (int d = info.ndim - 1; d >= 0; --d) { + kernel_coords[d] = temp % info.weight_dims[d]; + temp /= info.weight_dims[d]; + } + + float accumulator = 0.0f; + + // 对所有批次和输出位置累积梯度 + for (size_t b = 0; b < batch_size; ++b) { + for (size_t out_spatial = 0; out_spatial < output_spatial_size; ++out_spatial) { + + // 将一维输出空间索引转换为多维坐标 + std::vector out_coords(info.ndim); + temp = out_spatial; + for (int d = info.ndim - 1; d >= 0; --d) { + out_coords[d] = temp % info.grad_output_dims[d]; + temp /= info.grad_output_dims[d]; + } + + // 计算对应的输入坐标 + std::vector input_coords(info.ndim); + bool valid = true; + + for (size_t d = 0; d < info.ndim; ++d) { + input_coords[d] = static_cast(out_coords[d] * info.strides[d] + kernel_coords[d] * info.dilations[d]) - static_cast(info.pads[d]); + + if (input_coords[d] < 0 || input_coords[d] >= static_cast(info.input_dims[d])) { + valid = false; + break; + } + } + + if (valid) { + // 计算线性索引 + size_t grad_out_idx = b * out_channels * output_spatial_size + abs_oc * output_spatial_size + out_spatial; + + size_t input_spatial_idx = 0; + size_t multiplier = 1; + for (int d = info.ndim - 1; d >= 0; --d) { + input_spatial_idx += input_coords[d] * multiplier; + multiplier *= info.input_dims[d]; + } + + size_t input_idx = b * in_channels * input_spatial_size + abs_ic * input_spatial_size + input_spatial_idx; + + // 累积梯度 + float input_f32 = utils::cast(input[input_idx]); + float grad_out_f32 = utils::cast(grad_output[grad_out_idx]); + accumulator += input_f32 * grad_out_f32; + } + } + } + + // 写入结果 + size_t weight_idx = abs_oc * channels_per_group * kernel_spatial_size + ic * kernel_spatial_size + kernel_spatial; + grad_weight[weight_idx] = utils::cast(accumulator); + } 
+ } + } + } + + // 计算偏置梯度 (grad_bias) + template + void compute_bias_gradient(GradBiasData *grad_bias, const GradOutData *grad_output) const { + size_t batch_size = info.batch; + size_t out_channels = info.out_channels; + + size_t output_spatial_size = 1; + for (size_t i = 0; i < info.ndim; ++i) { + output_spatial_size *= info.grad_output_dims[i]; + } + + // 并行处理每个输出通道 +#pragma omp parallel for + for (ptrdiff_t c = 0; c < static_cast(out_channels); ++c) { + float sum = 0.0f; + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t s = 0; s < output_spatial_size; ++s) { + size_t idx = b * out_channels * output_spatial_size + c * output_spatial_size + s; + sum += utils::cast(grad_output[idx]); + } + } + + grad_bias[c] = utils::cast(sum); + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + // CPU 实现的卷积反向传播 + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *grad_input, void *grad_weight, void *grad_bias, + const void *grad_output, const void *input, + const void *weight, infiniDtype_t dtype) const { + + if (!grad_output || !input || !weight) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + const float *grad_output_f32 = static_cast(grad_output); + const float *input_f32 = static_cast(input); + const float *weight_f32 = static_cast(weight); + + if (grad_input) { + float *grad_input_f32 = static_cast(grad_input); + compute_data_gradient(grad_input_f32, grad_output_f32, weight_f32); + } + + if (grad_weight) { + float *grad_weight_f32 = static_cast(grad_weight); + compute_weight_gradient(grad_weight_f32, grad_output_f32, input_f32); + } + + if (grad_bias) { + float *grad_bias_f32 = static_cast(grad_bias); + compute_bias_gradient(grad_bias_f32, grad_output_f32); + } + break; + } + + case INFINI_DTYPE_F16: { + const fp16_t *grad_output_f16 = static_cast(grad_output); + const fp16_t *input_f16 = static_cast(input); + const fp16_t *weight_f16 = static_cast(weight); + + if (grad_input) { + fp16_t *grad_input_f16 = static_cast(grad_input); + compute_data_gradient(grad_input_f16, grad_output_f16, weight_f16); + } + + if (grad_weight) { + fp16_t *grad_weight_f16 = static_cast(grad_weight); + compute_weight_gradient(grad_weight_f16, grad_output_f16, input_f16); + } + + if (grad_bias) { + fp16_t *grad_bias_f16 = static_cast(grad_bias); + compute_bias_gradient(grad_bias_f16, grad_output_f16); + } + break; + } + + case INFINI_DTYPE_BF16: { + const bf16_t *grad_output_bf16 = static_cast(grad_output); + const bf16_t *input_bf16 = static_cast(input); + const bf16_t *weight_bf16 = static_cast(weight); + + if (grad_input) { + bf16_t *grad_input_bf16 = static_cast(grad_input); + compute_data_gradient(grad_input_bf16, grad_output_bf16, weight_bf16); + } + + if (grad_weight) { + bf16_t *grad_weight_bf16 = static_cast(grad_weight); + compute_weight_gradient(grad_weight_bf16, grad_output_bf16, input_bf16); + } + + if (grad_bias) { + bf16_t *grad_bias_bf16 = static_cast(grad_bias); + 
compute_bias_gradient(grad_bias_bf16, grad_output_bf16); + } + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + void *pads, void *strides, void *dilations, + size_t groups) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto info_result = op::conv_backward::ConvBackwardInfo::create( + grad_output_desc, input_desc, weight_desc, pads, strides, dilations, groups); + CHECK_RESULT(info_result); + auto info = info_result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, void *grad_weight, + void *grad_bias, const void *grad_output, + const void *input, const void *weight, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, grad_input, grad_weight, + grad_bias, grad_output, input, weight, _dtype); +} + +} // namespace op::conv_backward::cpu diff --git a/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.h b/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.h new file mode 100644 index 000000000..6adb485bf --- /dev/null +++ b/src/infiniop/ops/conv_backward/cpu/conv_backward_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CONV_BACKWARD_CPU_H__ +#define __CONV_BACKWARD_CPU_H__ + +#include "../conv_backward.h" + +DESCRIPTOR(cpu) + +#endif // __CONV_BACKWARD_CPU_H__ diff --git a/src/infiniop/ops/conv_backward/cuda/bias_grad_kernel.cuh b/src/infiniop/ops/conv_backward/cuda/bias_grad_kernel.cuh new file mode 100644 index 000000000..63aa9967c --- /dev/null +++ b/src/infiniop/ops/conv_backward/cuda/bias_grad_kernel.cuh @@ -0,0 +1,27 @@ +#ifndef __GRAD_CUDA_H__ +#define __GRAD_CUDA_H__ + +#include + +// 特化模板:对于 bf16 类型,使用 float 进行累加以保持精度 +template +__global__ void compute_bias_grad_kernel(const T *grad_output, T *grad_bias, + int batch_size, int channels, + int spatial_size) { + int c = blockIdx.x * blockDim.x + threadIdx.x; + if (c >= channels) { + return; + } + + // 使用 float 进行累加以保持精度 + float sum = 0.0f; + for (int n = 0; n < batch_size; n++) { + for (int s = 0; s < spatial_size; s++) { + int idx = n * channels * spatial_size + c * spatial_size + s; + sum += static_cast(grad_output[idx]); + } + } + grad_bias[c] = static_cast(sum); +} + +#endif // __GRAD_CUDA_H__ diff --git a/src/infiniop/ops/conv_backward/info.h b/src/infiniop/ops/conv_backward/info.h new file mode 100644 index 000000000..2e412f0c2 --- /dev/null +++ b/src/infiniop/ops/conv_backward/info.h @@ -0,0 +1,70 @@ +#ifndef __CONV_BACKWARD_INFO_H__ +#define __CONV_BACKWARD_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::conv_backward { + +class ConvBackwardInfo { + ConvBackwardInfo() = default; + +public: + size_t ndim; + size_t batch; + size_t in_channels; + size_t out_channels; + size_t groups; + std::vector 
input_dims; + std::vector weight_dims; + std::vector grad_output_dims; + std::vector pads; + std::vector strides; + std::vector dilations; + + static utils::Result create( + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + void *pads, + void *strides, + void *dilations, + size_t groups) { + ConvBackwardInfo info; + info.ndim = input_desc->ndim() - 2; + info.batch = input_desc->dim(0); + info.in_channels = input_desc->dim(1); + info.out_channels = weight_desc->dim(0); + info.groups = groups; + // 校验维度 + if (input_desc->ndim() != weight_desc->ndim() || input_desc->ndim() != grad_output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + if (input_desc->dim(0) != grad_output_desc->dim(0) || weight_desc->dim(0) != grad_output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t i = 2; i < input_desc->ndim(); ++i) { + info.input_dims.push_back(input_desc->dim(i)); + info.weight_dims.push_back(weight_desc->dim(i)); + info.grad_output_dims.push_back(grad_output_desc->dim(i)); + } + + auto pad_ptr = reinterpret_cast(pads); + auto stride_ptr = reinterpret_cast(strides); + auto dilation_ptr = reinterpret_cast(dilations); + + for (size_t i = 0; i < info.ndim; ++i) { + info.pads.push_back(pad_ptr ? static_cast(pad_ptr[i]) : 0); + info.strides.push_back(stride_ptr ? static_cast(stride_ptr[i]) : 1); + info.dilations.push_back(dilation_ptr ? static_cast(dilation_ptr[i]) : 1); + } + return utils::Result(std::move(info)); + } +}; + +} // namespace op::conv_backward + +#endif // __CONV_BACKWARD_INFO_H__ diff --git a/src/infiniop/ops/conv_backward/metax/conv_backward_metax.h b/src/infiniop/ops/conv_backward/metax/conv_backward_metax.h new file mode 100644 index 000000000..d0fd76b9c --- /dev/null +++ b/src/infiniop/ops/conv_backward/metax/conv_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __CONV_BACKWARD_METAX_H__ +#define __CONV_BACKWARD_METAX_H__ + +#include "../conv_backward.h" + +DESCRIPTOR(metax) + +#endif // __CONV_BACKWARD_METAX_H__ diff --git a/src/infiniop/ops/conv_backward/metax/conv_backward_metax.maca b/src/infiniop/ops/conv_backward/metax/conv_backward_metax.maca new file mode 100644 index 000000000..597fc6e08 --- /dev/null +++ b/src/infiniop/ops/conv_backward/metax/conv_backward_metax.maca @@ -0,0 +1,451 @@ +#include "conv_backward_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "../cuda/bias_grad_kernel.cuh" +#include "../info.h" + +infiniStatus_t launch_bias_grad_kernel(const void *grad_output, void *grad_bias, + const int *grad_output_dims, + size_t conv_ndim, + hcdnnDataType_t data_type, + hcStream_t stream) { + // 只处理 bf16 类型 + if (data_type != HCDNN_DATA_BFLOAT16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + int batch_size = grad_output_dims[0]; + int channels = grad_output_dims[1]; + int spatial_size = 1; + + // 计算空间维度大小 + for (size_t i = 2; i < conv_ndim + 2; ++i) { + spatial_size *= grad_output_dims[i]; + } + + dim3 block(256); + dim3 grid((channels + block.x - 1) / block.x); + + // 直接调用 bf16 kernel + compute_bias_grad_kernel<__hpcc_bfloat16><<>>( + reinterpret_cast(grad_output), + reinterpret_cast<__hpcc_bfloat16 *>(grad_bias), batch_size, channels, + spatial_size); + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_HCDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_HCDNN_DESCRIPTORS() 
\ + do { \ + DESTROY_HCDNN_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(weight_desc, hcdnnDestroyFilterDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_weight_desc, hcdnnDestroyFilterDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_bias_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(conv_desc, hcdnnDestroyConvolutionDescriptor); \ + } while (0) + +namespace op::conv_backward::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + // hcdnn描述符(对应cudnn描述符) + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t grad_output_desc = nullptr; + hcdnnFilterDescriptor_t weight_desc = nullptr; + hcdnnTensorDescriptor_t grad_input_desc = nullptr; + hcdnnFilterDescriptor_t grad_weight_desc = nullptr; + hcdnnTensorDescriptor_t grad_bias_desc = nullptr; + hcdnnConvolutionDescriptor_t conv_desc = nullptr; + + // 反向数据和滤波器算法 + hcdnnConvolutionBwdDataAlgo_t bwd_data_algo; + hcdnnConvolutionBwdFilterAlgo_t bwd_filter_algo; + size_t bwd_data_workspace_size = 0; + size_t bwd_filter_workspace_size = 0; + size_t conv_ndim = 0; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t gethcdnnDataType(infiniDtype_t data_type, + hcdnnDataType_t &hcdnn_data_type) const { + switch (data_type) { + case INFINI_DTYPE_F16: + hcdnn_data_type = HCDNN_DATA_HALF; + break; + case INFINI_DTYPE_F32: + hcdnn_data_type = HCDNN_DATA_FLOAT; + break; + case INFINI_DTYPE_BF16: + hcdnn_data_type = HCDNN_DATA_BFLOAT16; + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + // 计算张量步幅(与cuDNN逻辑一致,从最后一维开始计算) + infiniStatus_t calculateStrides(int ndim, const int *dims, std::vector &strides) const { + strides.resize(ndim); + strides[ndim - 1] = 1; // 最后一维步幅为1 + for (int i = ndim - 2; i >= 0; --i) { + strides[i] = strides[i + 1] * dims[i + 1]; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createTensorAndFilterDescriptors( + const op::conv_backward::ConvBackwardInfo &info, + hcdnnDataType_t hcdnn_data_type, + infiniopTensorDescriptor_t bias_desc) { + + int ndim = static_cast(info.ndim + 2); + + std::vector input_dims = {static_cast(info.batch), static_cast(info.in_channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + } + std::vector input_strides; + CHECK_STATUS(calculateStrides(ndim, input_dims.data(), input_strides)); + + std::vector grad_output_dims = {static_cast(info.batch), static_cast(info.out_channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + grad_output_dims.push_back(static_cast(info.grad_output_dims[i])); + } + std::vector grad_output_strides; + CHECK_STATUS(calculateStrides(ndim, grad_output_dims.data(), grad_output_strides)); + + std::vector weight_dims = {static_cast(info.out_channels), static_cast(info.in_channels / info.groups)}; + for (size_t i = 0; i < info.ndim; ++i) { + weight_dims.push_back(static_cast(info.weight_dims[i])); + } + + if (info.ndim == 1) { + input_dims.push_back(1); + input_strides.push_back(1); + grad_output_dims.push_back(1); + grad_output_strides.push_back(1); + weight_dims.push_back(1); + } + + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, 
hcdnn_data_type, input_dims.size(), input_dims.data(), input_strides.data())); + + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_output_desc, hcdnn_data_type, grad_output_dims.size(), grad_output_dims.data(), grad_output_strides.data())); + + CHECK_MCDNN(hcdnnCreateFilterDescriptor(&weight_desc)); + CHECK_MCDNN(hcdnnSetFilterNdDescriptor( + weight_desc, hcdnn_data_type, HCDNN_TENSOR_NCHW, weight_dims.size(), weight_dims.data())); + + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_input_desc, hcdnn_data_type, input_dims.size(), input_dims.data(), input_strides.data())); + + CHECK_MCDNN(hcdnnCreateFilterDescriptor(&grad_weight_desc)); + CHECK_MCDNN(hcdnnSetFilterNdDescriptor( + grad_weight_desc, hcdnn_data_type, HCDNN_TENSOR_NCHW, weight_dims.size(), weight_dims.data())); + + if (bias_desc) { + int bias_ndim = (info.ndim == 1) ? 4 : ndim; + std::vector bias_dims(bias_ndim, 1); + bias_dims[1] = static_cast(bias_desc->dim(0)); + + std::vector bias_strides(bias_ndim, 1); + for (int i = bias_ndim - 2; i >= 0; --i) { + bias_strides[i] = bias_strides[i + 1] * bias_dims[i + 1]; + } + + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_bias_desc)); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_bias_desc, hcdnn_data_type, bias_ndim, bias_dims.data(), bias_strides.data())); + } + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createConvDescriptor(const op::conv_backward::ConvBackwardInfo &info, + hcdnnDataType_t hcdnn_data_type) { + int conv_dim = (info.ndim == 1) ? 2 : static_cast(info.ndim); // 1D卷积按2D处理 + std::vector pad_vec(info.pads.begin(), info.pads.end()); + std::vector stride_vec(info.strides.begin(), info.strides.end()); + std::vector dilation_vec(info.dilations.begin(), info.dilations.end()); + + if (info.ndim == 1) { + pad_vec.push_back(0); + stride_vec.push_back(1); + dilation_vec.push_back(1); + } + + CHECK_MCDNN(hcdnnCreateConvolutionDescriptor(&conv_desc)); + hcdnnDataType_t compute_type = (hcdnn_data_type == HCDNN_DATA_HALF || hcdnn_data_type == HCDNN_DATA_BFLOAT16) + ? HCDNN_DATA_FLOAT + : hcdnn_data_type; + CHECK_MCDNN(hcdnnSetConvolutionNdDescriptor( + conv_desc, conv_dim, pad_vec.data(), stride_vec.data(), + dilation_vec.data(), HCDNN_CROSS_CORRELATION, compute_type)); + CHECK_MCDNN(hcdnnSetConvolutionGroupCount(conv_desc, static_cast(info.groups))); + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext( + const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type, + infiniopTensorDescriptor_t bias_desc) { + hcdnnDataType_t hcdnn_data_type; + CHECK_STATUS(gethcdnnDataType(data_type, hcdnn_data_type)); + CHECK_STATUS(createTensorAndFilterDescriptors(info, hcdnn_data_type, bias_desc)); + CHECK_STATUS(createConvDescriptor(info, hcdnn_data_type)); + + internal->useMcdnn(nullptr, [&](hcdnnHandle_t h) { + // 1. 
查找反向数据算法 + int requested_algo_count = 8; + int returned_algo_count = 0; + hcdnnConvolutionBwdDataAlgoPerf_t bwd_data_perf[8]; + + hcdnnStatus_t status = hcdnnFindConvolutionBackwardDataAlgorithm( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + requested_algo_count, &returned_algo_count, bwd_data_perf); + + bool found = false; + if (status == HCDNN_STATUS_SUCCESS && returned_algo_count > 0) { + for (int i = 0; i < returned_algo_count; i++) { + if (bwd_data_perf[i].status == HCDNN_STATUS_SUCCESS) { + bwd_data_algo = bwd_data_perf[i].algo; + bwd_data_workspace_size = bwd_data_perf[i].memory; + found = true; + break; + } + } + } + if (!found) { + // 未找到有效算法,使用默认算法 + bwd_data_algo = HCDNN_CONVOLUTION_BWD_DATA_ALGO_1; + CHECK_MCDNN(hcdnnGetConvolutionBackwardDataWorkspaceSize( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + bwd_data_algo, &bwd_data_workspace_size)); + } + + // 2. 查找反向权重算法 + hcdnnConvolutionBwdFilterAlgoPerf_t bwd_filter_perf[8]; + status = hcdnnFindConvolutionBackwardFilterAlgorithm( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + requested_algo_count, &returned_algo_count, bwd_filter_perf); + + found = false; + if (status == HCDNN_STATUS_SUCCESS && returned_algo_count > 0) { + for (int i = 0; i < returned_algo_count; i++) { + if (bwd_filter_perf[i].status == HCDNN_STATUS_SUCCESS) { + bwd_filter_algo = bwd_filter_perf[i].algo; + bwd_filter_workspace_size = bwd_filter_perf[i].memory; + found = true; + break; + } + } + } + if (!found) { + // 未找到有效算法,使用默认算法 + bwd_filter_algo = HCDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + CHECK_MCDNN(hcdnnGetConvolutionBackwardFilterWorkspaceSize( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + bwd_filter_algo, &bwd_filter_workspace_size)); + } + return INFINI_STATUS_SUCCESS; + }); + + // 工作空间大小取两者最大值 + workspace_size = std::max(bwd_data_workspace_size, bwd_filter_workspace_size); + conv_ndim = info.ndim; + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , + input_desc(other.input_desc), grad_output_desc(other.grad_output_desc), weight_desc(other.weight_desc), grad_input_desc(other.grad_input_desc), grad_weight_desc(other.grad_weight_desc), grad_bias_desc(other.grad_bias_desc), conv_desc(other.conv_desc), bwd_data_algo(other.bwd_data_algo), bwd_filter_algo(other.bwd_filter_algo), bwd_data_workspace_size(other.bwd_data_workspace_size), bwd_filter_workspace_size(other.bwd_filter_workspace_size) + , conv_ndim(other.conv_ndim) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.grad_output_desc = nullptr; + other.weight_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_weight_desc = nullptr; + other.grad_bias_desc = nullptr; + other.conv_desc = nullptr; + other.bwd_data_algo = static_cast(0); + other.bwd_filter_algo = static_cast(0); + other.bwd_data_workspace_size = 0; + other.bwd_filter_workspace_size = 0; + other.conv_ndim = 0; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_HCDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type, infiniopTensorDescriptor_t bias_desc) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type, bias_desc); + if (status != INFINI_STATUS_SUCCESS) { + 
return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + void *pads, void *strides, void *dilations, + size_t groups) { +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto info_result = op::conv_backward::ConvBackwardInfo::create( + grad_output_desc, input_desc, weight_desc, pads, strides, dilations, groups); + CHECK_RESULT(info_result); + auto info = info_result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype, bias_desc); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, void *grad_weight, + void *grad_bias, const void *grad_output, + const void *input, const void *weight, + void *stream) const { +#ifdef ENABLE_HCDNN_API + const float alpha = 1.0f, beta = 0.0f; + auto internal = _opaque->internal; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + CHECK_STATUS(_opaque->internal->useMcdnn((hcStream_t)stream, [&](hcdnnHandle_t handle) { + if (!grad_input || !grad_weight || !grad_output || !input || !weight) { + return INFINI_STATUS_BAD_PARAM; + } + + CHECK_MCDNN(hcdnnConvolutionBackwardData( + handle, + &alpha, + _opaque->weight_desc, + weight, + _opaque->grad_output_desc, + grad_output, + _opaque->conv_desc, + _opaque->bwd_data_algo, + workspace, + _opaque->bwd_data_workspace_size, + &beta, + _opaque->grad_input_desc, + grad_input)); + + CHECK_MCDNN(hcdnnConvolutionBackwardFilter( + handle, + &alpha, + _opaque->input_desc, + input, + _opaque->grad_output_desc, + grad_output, + _opaque->conv_desc, + _opaque->bwd_filter_algo, + workspace, + _opaque->bwd_filter_workspace_size, + &beta, + _opaque->grad_weight_desc, + grad_weight)); + + if (_opaque->grad_bias_desc && grad_bias) { + hcdnnDataType_t grad_output_type; + int grad_output_nbDims; + int grad_output_dims[5], grad_output_strides[5]; + + int query_ndim = (_opaque->conv_ndim == 3) ? 
5 : 4; + + hcdnnStatus_t status = hcdnnGetTensorNdDescriptor( + _opaque->grad_output_desc, query_ndim, &grad_output_type, + &grad_output_nbDims, grad_output_dims, grad_output_strides); + if (grad_output_type == HCDNN_DATA_BFLOAT16) { + CHECK_STATUS(launch_bias_grad_kernel( + grad_output, grad_bias, grad_output_dims, _opaque->conv_ndim, + grad_output_type, (hcStream_t)stream)); + } else { + CHECK_MCDNN(hcdnnConvolutionBackwardBias( + handle, + &alpha, + _opaque->grad_output_desc, + grad_output, + &beta, + _opaque->grad_bias_desc, + grad_bias)); + } + } + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::conv_backward::metax diff --git a/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cu b/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cu new file mode 100644 index 000000000..29e810a4c --- /dev/null +++ b/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cu @@ -0,0 +1,452 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../cuda/bias_grad_kernel.cuh" +#include "../info.h" +#include "conv_backward_nvidia.cuh" + +infiniStatus_t launch_bias_grad_kernel(const void *grad_output, void *grad_bias, + const int *grad_output_dims, + size_t conv_ndim, + cudnnDataType_t data_type, + cudaStream_t stream) { + int batch_size = grad_output_dims[0]; + int channels = grad_output_dims[1]; + int spatial_size = 1; + + for (size_t i = 2; i < conv_ndim + 2; ++i) { + spatial_size *= grad_output_dims[i]; + } + + dim3 block(256); + dim3 grid((channels + block.x - 1) / block.x); + + // 直接调用 bf16 kernel + compute_bias_grad_kernel<__nv_bfloat16><<>>( + reinterpret_cast(grad_output), + reinterpret_cast<__nv_bfloat16 *>(grad_bias), batch_size, channels, + spatial_size); + + return INFINI_STATUS_SUCCESS; +} + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(weight_desc, cudnnDestroyFilterDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_weight_desc, cudnnDestroyFilterDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_bias_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(conv_desc, cudnnDestroyConvolutionDescriptor); \ + } while (0) + +namespace op::conv_backward::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t grad_output_desc = nullptr; + cudnnFilterDescriptor_t weight_desc = nullptr; + cudnnTensorDescriptor_t grad_input_desc = nullptr; + cudnnFilterDescriptor_t grad_weight_desc = nullptr; + cudnnTensorDescriptor_t grad_bias_desc = nullptr; + cudnnConvolutionDescriptor_t conv_desc = nullptr; + + cudnnConvolutionBwdDataAlgo_t bwd_data_algo; + cudnnConvolutionBwdFilterAlgo_t bwd_filter_algo; + size_t bwd_data_workspace_size = 0; + size_t bwd_filter_workspace_size = 0; + size_t conv_ndim = 0; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t 
data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16 || data_type == INFINI_DTYPE_F32 || data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + return INFINI_STATUS_SUCCESS; + } + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + infiniStatus_t calculateStrides(int ndim, const int *input_dims, + std::vector &input_strides) const { + input_strides.resize(ndim); + input_strides[ndim - 1] = 1; // 最后一维 stride = 1 + for (int i = ndim - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createTensorAndFilterDescriptors( + const op::conv_backward::ConvBackwardInfo &info, + cudnnDataType_t cudnn_data_type, infiniopTensorDescriptor_t bias_desc) { + + int ndim = static_cast(info.ndim + 2); + + // input + std::vector input_dims = {static_cast(info.batch), + static_cast(info.in_channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + } + std::vector input_strides; + CHECK_STATUS(calculateStrides(ndim, input_dims.data(), input_strides)); + + // grad_output + std::vector grad_output_dims = {static_cast(info.batch), + static_cast(info.out_channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + grad_output_dims.push_back(static_cast(info.grad_output_dims[i])); + } + std::vector grad_output_strides; + CHECK_STATUS( + calculateStrides(ndim, grad_output_dims.data(), grad_output_strides)); + + // weight + size_t in_channels_per_group = info.in_channels / info.groups; + std::vector weight_dims = {static_cast(info.out_channels), + static_cast(in_channels_per_group)}; + for (size_t i = 0; i < info.ndim; ++i) { + weight_dims.push_back(static_cast(info.weight_dims[i])); + } + + if (info.ndim == 1) { + input_dims.push_back(1); + input_strides.push_back(1); + grad_output_dims.push_back(1); + grad_output_strides.push_back(1); + weight_dims.push_back(1); + } + + // input + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor(input_desc, cudnn_data_type, + input_dims.size(), input_dims.data(), + input_strides.data())); + + // grad_output + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_output_desc, cudnn_data_type, grad_output_dims.size(), + grad_output_dims.data(), grad_output_strides.data())); + + // weight + CHECK_CUDNN(cudnnCreateFilterDescriptor(&weight_desc)); + CHECK_CUDNN(cudnnSetFilterNdDescriptor( + weight_desc, cudnn_data_type, CUDNN_TENSOR_NCHW, weight_dims.size(), + weight_dims.data())); + + // grad_input + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor(grad_input_desc, cudnn_data_type, + input_dims.size(), input_dims.data(), + input_strides.data())); + + // grad_weight + CHECK_CUDNN(cudnnCreateFilterDescriptor(&grad_weight_desc)); + CHECK_CUDNN(cudnnSetFilterNdDescriptor( + grad_weight_desc, cudnn_data_type, CUDNN_TENSOR_NCHW, + weight_dims.size(), weight_dims.data())); + + // grad_bias (optional) + if (bias_desc) { + int bias_ndim = (info.ndim == 1) ? 
4 : ndim; + + std::vector bias_dims(bias_ndim, 1); + bias_dims[1] = static_cast(bias_desc->dim(0)); // out_channels + + std::vector bias_strides(bias_ndim, 1); + for (int i = bias_ndim - 2; i >= 0; --i) { + bias_strides[i] = bias_strides[i + 1] * bias_dims[i + 1]; + } + + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_bias_desc)); + CHECK_CUDNN(cudnnSetTensorNdDescriptor(grad_bias_desc, cudnn_data_type, + bias_ndim, bias_dims.data(), + bias_strides.data())); + } + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t + createConvDescriptor(const op::conv_backward::ConvBackwardInfo &info, + cudnnDataType_t cudnn_data_type) { + int conv_dim = (info.ndim == 1) ? 2 : static_cast(info.ndim); + std::vector pad_vec(info.pads.begin(), info.pads.end()); + std::vector stride_vec(info.strides.begin(), info.strides.end()); + std::vector dilation_vec(info.dilations.begin(), info.dilations.end()); + + if (info.ndim == 1) { + pad_vec.push_back(0); + stride_vec.push_back(1); + dilation_vec.push_back(1); + } + + CHECK_CUDNN(cudnnCreateConvolutionDescriptor(&conv_desc)); + cudnnDataType_t compute_type = (cudnn_data_type == CUDNN_DATA_BFLOAT16 || cudnn_data_type == CUDNN_DATA_HALF) + ? CUDNN_DATA_FLOAT + : cudnn_data_type; + CHECK_CUDNN(cudnnSetConvolutionNdDescriptor( + conv_desc, conv_dim, pad_vec.data(), stride_vec.data(), + dilation_vec.data(), CUDNN_CROSS_CORRELATION, compute_type)); + CHECK_CUDNN(cudnnSetConvolutionGroupCount(conv_desc, + static_cast(info.groups))); + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t + initializeCudnnContext(const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type, + infiniopTensorDescriptor_t bias_desc) { + + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + CHECK_STATUS( + createTensorAndFilterDescriptors(info, cudnn_data_type, bias_desc)); + CHECK_STATUS(createConvDescriptor(info, cudnn_data_type)); + + // Query workspace size + internal->useCudnn(nullptr, [&](cudnnHandle_t h) { + // 1. 查找适合的反向数据算法 + int requestedAlgoCount = 8; + int returnedAlgoCount = 0; + cudnnConvolutionBwdDataAlgoPerf_t bwd_data_perf[8]; + + cudnnStatus_t status = cudnnFindConvolutionBackwardDataAlgorithm( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + requestedAlgoCount, &returnedAlgoCount, bwd_data_perf); + bool found = false; + if (status == CUDNN_STATUS_SUCCESS && returnedAlgoCount > 0) { + for (int i = 0; i < returnedAlgoCount; i++) { + if (bwd_data_perf[i].status == CUDNN_STATUS_SUCCESS) { + bwd_data_algo = bwd_data_perf[i].algo; + bwd_data_workspace_size = bwd_data_perf[i].memory; + found = true; + break; + } + } + if (!found) { + // 如果没找到成功的算法,用默认的 + bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + cudnnGetConvolutionBackwardDataWorkspaceSize( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + bwd_data_algo, &bwd_data_workspace_size); + } + } else { + // 查找失败,回退到默认算法 + bwd_data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; + cudnnGetConvolutionBackwardDataWorkspaceSize( + h, weight_desc, grad_output_desc, conv_desc, grad_input_desc, + bwd_data_algo, &bwd_data_workspace_size); + } + + // 2. 
查找适合的反向权重算法 + cudnnConvolutionBwdFilterAlgoPerf_t bwd_filter_perf[8]; + + status = cudnnFindConvolutionBackwardFilterAlgorithm( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + requestedAlgoCount, &returnedAlgoCount, bwd_filter_perf); + + if (status == CUDNN_STATUS_SUCCESS && returnedAlgoCount > 0) { + found = false; + for (int i = 0; i < returnedAlgoCount; i++) { + if (bwd_filter_perf[i].status == CUDNN_STATUS_SUCCESS) { + bwd_filter_algo = bwd_filter_perf[i].algo; + bwd_filter_workspace_size = bwd_filter_perf[i].memory; + found = true; + break; + } + } + if (!found) { + // 如果没找到成功的算法,用默认的 + bwd_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + cudnnGetConvolutionBackwardFilterWorkspaceSize( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + bwd_filter_algo, &bwd_filter_workspace_size); + } + } else { + bwd_filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; + cudnnGetConvolutionBackwardFilterWorkspaceSize( + h, input_desc, grad_output_desc, conv_desc, grad_weight_desc, + bwd_filter_algo, &bwd_filter_workspace_size); + } + return INFINI_STATUS_SUCCESS; + }); + workspace_size = std::max(bwd_data_workspace_size, bwd_filter_workspace_size); + + conv_ndim = info.ndim; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_CUDNN_API + , + input_desc(other.input_desc), grad_output_desc(other.grad_output_desc), + weight_desc(other.weight_desc), grad_input_desc(other.grad_input_desc), + grad_weight_desc(other.grad_weight_desc), + grad_bias_desc(other.grad_bias_desc), conv_desc(other.conv_desc), + bwd_data_algo(other.bwd_data_algo), + bwd_filter_algo(other.bwd_filter_algo), + bwd_data_workspace_size(other.bwd_data_workspace_size), + bwd_filter_workspace_size(other.bwd_filter_workspace_size), + conv_ndim(other.conv_ndim) +#endif + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.grad_output_desc = nullptr; + other.weight_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_weight_desc = nullptr; + other.grad_bias_desc = nullptr; + other.conv_desc = nullptr; + other.bwd_data_algo = static_cast(0); + other.bwd_filter_algo = static_cast(0); + other.bwd_data_workspace_size = 0; + other.bwd_filter_workspace_size = 0; + other.conv_ndim = 0; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + const op::conv_backward::ConvBackwardInfo &info, + infiniDtype_t data_type, infiniopTensorDescriptor_t bias_desc) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type, bias_desc); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + void *pads, void *strides, void *dilations, + size_t groups) { +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + 
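// --- Illustrative usage sketch, not part of the patch -------------------------
// Seen from the public C API ("infiniop/ops/conv_backward.h", as included by
// operator.cc below), a caller drives this backend by creating the descriptor,
// querying the workspace size, launching, and destroying. The helper below is
// only a sketch: the handle, tensor descriptors, device buffers, workspace and
// stream are assumed to exist already, and the element type of
// pads/strides/dilations is assumed to be size_t (the API takes them as void *).
#include <cstddef>

inline infiniStatus_t conv_backward_example(
    infiniopHandle_t handle,
    infiniopTensorDescriptor_t grad_output_desc,
    infiniopTensorDescriptor_t input_desc,
    infiniopTensorDescriptor_t weight_desc,
    infiniopTensorDescriptor_t bias_desc,
    void *grad_input, void *grad_weight, void *grad_bias,
    const void *grad_output, const void *input, const void *weight,
    void *workspace, size_t workspace_capacity, void *stream) {
    size_t pads[2] = {1, 1}, strides[2] = {1, 1}, dilations[2] = {1, 1};

    infiniopConvBackwardDescriptor_t desc = nullptr;
    infiniStatus_t status = infiniopCreateConvBackwardDescriptor(
        handle, &desc, grad_output_desc, input_desc, weight_desc, bias_desc,
        pads, strides, dilations, /*groups=*/1);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    size_t workspace_size = 0;
    status = infiniopGetConvBackwardWorkspaceSize(desc, &workspace_size);
    if (status == INFINI_STATUS_SUCCESS) {
        if (workspace_size > workspace_capacity) {
            status = INFINI_STATUS_INSUFFICIENT_WORKSPACE;
        } else {
            status = infiniopConvBackward(desc, workspace, workspace_size,
                                          grad_input, grad_weight, grad_bias,
                                          grad_output, input, weight, stream);
        }
    }

    infiniopDestroyConvBackwardDescriptor(desc);
    return status;
}
// -------------------------------------------------------------------------------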
auto info_result = op::conv_backward::ConvBackwardInfo::create( + grad_output_desc, input_desc, weight_desc, pads, strides, dilations, + groups); + CHECK_RESULT(info_result); + auto info = info_result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype, bias_desc); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, void *grad_weight, + void *grad_bias, const void *grad_output, + const void *input, const void *weight, + void *stream) const { +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + auto internal = _opaque->internal; + + return internal->useCudnn((cudaStream_t)stream, [&](cudnnHandle_t h) { + if (!grad_input || !grad_weight || !grad_output || !input || !weight) { + printf("Error: Null pointer in calculate function\n"); + return INFINI_STATUS_BAD_PARAM; + } + + CHECK_CUDNN(cudnnConvolutionBackwardData( + h, &alpha, _opaque->weight_desc, weight, _opaque->grad_output_desc, + grad_output, _opaque->conv_desc, _opaque->bwd_data_algo, workspace, + _opaque->bwd_data_workspace_size, &beta, _opaque->grad_input_desc, + grad_input)); + + CHECK_CUDNN(cudnnConvolutionBackwardFilter( + h, &alpha, _opaque->input_desc, input, _opaque->grad_output_desc, + grad_output, _opaque->conv_desc, _opaque->bwd_filter_algo, workspace, + _opaque->bwd_filter_workspace_size, &beta, _opaque->grad_weight_desc, + grad_weight)); + + // grad_bias = conv_bwd_bias(grad_output) + if (_opaque->grad_bias_desc && grad_bias) { + cudnnDataType_t grad_output_type; + int grad_output_nbDims; + int grad_output_dims[5], grad_output_strides[5]; + + int query_ndim = (_opaque->conv_ndim == 3) ? 
5 : 4; + + CHECK_CUDNN(cudnnGetTensorNdDescriptor( + _opaque->grad_output_desc, query_ndim, &grad_output_type, + &grad_output_nbDims, grad_output_dims, grad_output_strides)); + if (grad_output_type == CUDNN_DATA_BFLOAT16) { + CHECK_STATUS(launch_bias_grad_kernel( + grad_output, grad_bias, grad_output_dims, _opaque->conv_ndim, + grad_output_type, (cudaStream_t)stream)); + } else { + CHECK_CUDNN(cudnnConvolutionBackwardBias( + h, &alpha, _opaque->grad_output_desc, grad_output, &beta, + _opaque->grad_bias_desc, grad_bias)); + } + } + return INFINI_STATUS_SUCCESS; + }); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::conv_backward::nvidia diff --git a/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cuh b/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cuh new file mode 100644 index 000000000..363e979e1 --- /dev/null +++ b/src/infiniop/ops/conv_backward/nvidia/conv_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CONV_BACKWARD_NVIDIA_CUH__ +#define __CONV_BACKWARD_NVIDIA_CUH__ + +#include "../conv_backward.h" + +DESCRIPTOR(nvidia) + +#endif // __CONV_BACKWARD_NVIDIA_CUH__ diff --git a/src/infiniop/ops/conv_backward/operator.cc b/src/infiniop/ops/conv_backward/operator.cc new file mode 100644 index 000000000..f02e31cb3 --- /dev/null +++ b/src/infiniop/ops/conv_backward/operator.cc @@ -0,0 +1,135 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/conv_backward.h" + +#ifdef ENABLE_CPU_API +#include "cpu/conv_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/conv_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/conv_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateConvBackwardDescriptor( + infiniopHandle_t handle, + infiniopConvBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t bias_desc, + void *pads, + void *strides, + void *dilations, + size_t groups) { +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::conv_backward::NAMESPACE::Descriptor::create( \ + handle, reinterpret_cast(desc_ptr), \ + grad_output_desc, input_desc, weight_desc, bias_desc, pads, strides, dilations, groups) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetConvBackwardWorkspaceSize( + infiniopConvBackwardDescriptor_t desc, size_t *size) { +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopConvBackward( + infiniopConvBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + void *grad_weight, + void 
*grad_bias, + const void *grad_output, + const void *input, + const void *weight, + void *stream) { +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, grad_input, grad_weight, grad_bias, grad_output, input, weight, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyConvBackwardDescriptor(infiniopConvBackwardDescriptor_t desc) { +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc new file mode 100644 index 000000000..af97c1d09 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.cc @@ -0,0 +1,321 @@ +#include "cross_entropy_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::cross_entropy_loss::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + std::vector logits_shape; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const std::vector &shape) + : handle(handle_ptr), logits_shape(shape) { + // 计算workspace大小:需要存储per-sample loss + size_t N = logits_shape[0]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + workspace_size = N * inner_size * sizeof(float); + } + + void cross_entropy_f16_as_float(float *workspace, float *loss_result, + const fp16_t *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + // 转换F16 logits为float + size_t total_logits_size = N * C * inner_size; + std::vector float_logits(total_logits_size); + for (size_t i = 0; i < total_logits_size; ++i) { + float_logits[i] = utils::cast(logits[i]); + } + + // 使用float精度计算 + cross_entropy_cpu_float(workspace, loss_result, float_logits.data(), target); + } + + // 通用的float版本交叉熵计算 + void cross_entropy_cpu_float(float *workspace, float *loss_result, + const float *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = 
target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + max_logit = std::max(max_logit, logits[logit_idx]); + } + + // 计算exp的和(减去最大值保证数值稳定) + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + sum_exp += std::exp(logits[logit_idx] - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit = logits[target_logit_idx]; + + // 计算交叉熵损失:log_softmax[target] = logit[target] - log(sum_exp) - max_logit + // 所以 -log_softmax[target] = log(sum_exp) + max_logit - logit[target] + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失(忽略ignore_index的样本) + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + *loss_result = valid_count > 0 ? static_cast(total_loss / valid_count) : 0.0f; + } + + // 通用模板版本(用于F32和BF16) + template + void cross_entropy_cpu_generic(float *workspace, T *loss_result, + const T *logits, const int64_t *target) const { + size_t N = logits_shape[0]; + size_t C = logits_shape[1]; + size_t inner_size = 1; + for (size_t i = 2; i < logits_shape.size(); ++i) { + inner_size *= logits_shape[i]; + } + + const int64_t ignore_index = -100; + float *per_sample_loss = workspace; + + // 计算每个样本的损失 + for (size_t n = 0; n < N; ++n) { + for (size_t inner = 0; inner < inner_size; ++inner) { + size_t sample_idx = n * inner_size + inner; + int64_t t = target[sample_idx]; + + // 检查ignore_index或无效target + if (t == ignore_index || t < 0 || t >= static_cast(C)) { + per_sample_loss[sample_idx] = 0.0f; + continue; + } + + // 计算这个位置的logits基址 + size_t base_offset = n * C * inner_size + inner; + + // 数值稳定的softmax计算:先找最大值 + float max_logit = -std::numeric_limits::infinity(); + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + max_logit = std::max(max_logit, logit_val); + } + + // 计算exp的和 + float sum_exp = 0.0f; + for (size_t c = 0; c < C; ++c) { + size_t logit_idx = base_offset + c * inner_size; + float logit_val; + if constexpr (std::is_same::value) { + logit_val = utils::cast(logits[logit_idx]); + } else { + logit_val = logits[logit_idx]; + } + sum_exp += std::exp(logit_val - max_logit); + } + + // 计算目标类别的logit + size_t target_logit_idx = base_offset + static_cast(t) * inner_size; + float target_logit; + if constexpr (std::is_same::value) { + target_logit = utils::cast(logits[target_logit_idx]); + } else { + target_logit = logits[target_logit_idx]; + } + + // 计算交叉熵损失 + per_sample_loss[sample_idx] = std::log(sum_exp) + max_logit - target_logit; + } + } + + // 计算平均损失 + double total_loss = 0.0; + size_t valid_count = 0; + size_t total_samples = N * inner_size; + + for (size_t i = 0; i < total_samples; ++i) { + if (target[i] != ignore_index && target[i] >= 0 && target[i] < 
static_cast(C)) { + total_loss += static_cast(per_sample_loss[i]); + valid_count++; + } + } + + float mean_loss = valid_count > 0 ? static_cast(total_loss / valid_count) : 0.0f; + + // 转换回输出类型 + if constexpr (std::is_same::value) { + *loss_result = utils::cast(mean_loss); + } else { + *loss_result = static_cast(mean_loss); + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + logits_shape(std::move(other.logits_shape)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, const std::vector &shape) { + Opaque opaque(handle_ptr, shape); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, const void *target, + infiniDtype_t dtype) const { + if (!workspace || !loss || !logits || !target) { + return INFINI_STATUS_BAD_PARAM; + } + + if (workspace_size < this->workspace_size) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + float *workspace_ptr = static_cast(workspace); + const int64_t *target_ptr = static_cast(target); + + switch (dtype) { + case INFINI_DTYPE_F32: { + const float *logits_ptr = static_cast(logits); + float *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + case INFINI_DTYPE_F16: { + const fp16_t *logits_ptr = static_cast(logits); + fp16_t *loss_ptr = static_cast(loss); + + // F16特殊处理:使用float计算 + float temp_loss; + cross_entropy_f16_as_float(workspace_ptr, &temp_loss, logits_ptr, target_ptr); + *loss_ptr = utils::cast(temp_loss); + break; + } + + case INFINI_DTYPE_BF16: { + const bf16_t *logits_ptr = static_cast(logits); + bf16_t *loss_ptr = static_cast(loss); + cross_entropy_cpu_generic(workspace_ptr, loss_ptr, logits_ptr, target_ptr); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } + + size_t get_workspace_size() const { + return workspace_size; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig_shape = logits_desc->shape(); + std::vector logits_shape; + + if (orig_shape.size() == 1) { + logits_shape = {1, orig_shape[0]}; + } else { + logits_shape = orig_shape; + } + + if (logits_shape.size() < 2) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + auto opaque_result = Opaque::create(handle, logits_shape); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, opaque->get_workspace_size(), opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { + return _opaque->calculate(workspace, workspace_size, loss, logits, target, _dtype); +} + +} // namespace op::cross_entropy_loss::cpu diff --git a/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h 
b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h new file mode 100644 index 000000000..8afec63d0 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cpu/cross_entropy_loss_cpu.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CPU_H__ +#define __CROSS_ENTROPY_LOSS_CPU_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(cpu) + +#endif // __CROSS_ENTROPY_LOSS_CPU_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h new file mode 100644 index 000000000..dad108d78 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cross_entropy_loss.h @@ -0,0 +1,48 @@ +#ifndef __CROSS_ENTROPY_LOSS_H__ +#define __CROSS_ENTROPY_LOSS_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cross_entropy_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t loss_desc, \ + infiniopTensorDescriptor_t logits_desc, \ + infiniopTensorDescriptor_t target_desc); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *loss, \ + const void *logits, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif // __CROSS_ENTROPY_LOSS_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh b/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh new file mode 100644 index 000000000..5279011ef --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/cuda/kernel.cuh @@ -0,0 +1,67 @@ +#ifndef __CROSS_ENTROPY_KERNEL_CUH__ +#define __CROSS_ENTROPY_KERNEL_CUH__ + +#include +#include +#include + +__device__ __forceinline__ float to_float(float val) { return val; } + +__device__ __forceinline__ float to_float(half val) { + return __half2float(val); +} + +__device__ __forceinline__ float to_float(__hpcc_bfloat16 val) { + return __bfloat162float(val); +} + +template +__global__ void cross_entropy_loss_kernel(T_out *loss, const T_in *logits, + const int64_t *target, int N, int C, + long long inner_size, + int64_t ignore_index) { + + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + long long total = (long long)N * inner_size; + if (idx >= total) { + return; + } + + int n = (int)(idx / inner_size); + int inner = (int)(idx % inner_size); + + int64_t t = target[idx]; + + if (t == ignore_index) { + loss[idx] = (T_out)0.0f; + return; + } + if (t < 0 || t >= C) { + loss[idx] = (T_out)0.0f; + return; + } + + const long long base_offset = ((long long)n * C * inner_size) + inner; + + // 1. 找到 logits 中的最大值 + float max_val = -HUGE_VALF; // 使用浮点数的最大负值 + for (int c = 0; c < C; ++c) { + long long offset = base_offset + (long long)c * inner_size; + max_val = fmaxf(max_val, to_float(logits[offset])); + } + + // 2. 计算 sum(exp(x - max_val)) + float sum_exp = 0.0f; + for (int c = 0; c < C; ++c) { + long long offset = base_offset + (long long)c * inner_size; + sum_exp += expf(to_float(logits[offset]) - max_val); + } + + // 3. 
计算最终 loss + long long target_offset = base_offset + (long long)t * inner_size; + float logit_tgt = to_float(logits[target_offset]); + + loss[idx] = (T_out)(logf(sum_exp) + max_val - logit_tgt); +} + +#endif // __CROSS_ENTROPY_KERNEL_CUH__ diff --git a/src/infiniop/ops/cross_entropy_loss/info.h b/src/infiniop/ops/cross_entropy_loss/info.h new file mode 100644 index 000000000..5278bf912 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/info.h @@ -0,0 +1,36 @@ +#ifndef __CROSS_ENTROPY_LOSS_INFO_H__ +#define __CROSS_ENTROPY_LOSS_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" + +namespace op::cross_entropy_loss { + +class CrossEntropyInfo { +public: + CrossEntropyInfo() = default; + size_t batch = 0; + size_t num_classes = 0; + infiniDtype_t dtype; + + static utils::Result create( + infiniopTensorDescriptor_t loss, + infiniopTensorDescriptor_t logits, + infiniopTensorDescriptor_t target) { + + if (logits->ndim() != 2 || loss->ndim() != 1 || target->ndim() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + CrossEntropyInfo info; + info.batch = logits->dim(0); + info.num_classes = logits->dim(1); + info.dtype = logits->dtype(); + return utils::Result(std::move(info)); + } +}; + +} // namespace op::cross_entropy_loss + +#endif // __CROSS_ENTROPY_LOSS_INFO_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h new file mode 100644 index 000000000..382d555e0 --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.h @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_METAX_H__ +#define __CROSS_ENTROPY_METAX_H__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(metax) + +#endif // __CROSS_ENTROPY_METAX_H__ diff --git a/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca new file mode 100644 index 000000000..94f611e7a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/metax/cross_entropy_metax.maca @@ -0,0 +1,145 @@ +#include +#include +#include +#include +#include +#include +#include +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "cross_entropy_metax.h" + +#include "../cuda/kernel.cuh" + +namespace op::cross_entropy_loss::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector logits_shape; + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + ~Opaque() = default; +}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + if (dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto opaque = new Opaque(handle->internal()); + const auto &orig_shape = logits_desc->shape(); + + if (orig_shape.size() == 1) { + opaque->logits_shape = {1, orig_shape[0]}; + } else { + opaque->logits_shape = orig_shape; + } + + if (opaque->logits_shape.size() < 2) return INFINI_STATUS_BAD_TENSOR_SHAPE; + + const auto &s = opaque->logits_shape; + long long N = (long long)s[0]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) inner *= (long long)s[i]; + + size_t 
workspace_size = (size_t)(N * inner) * sizeof(float); + *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, size_t workspace_size, void *loss, + const void *logits, const void *target, void *stream_) const { + + const auto &shape = _opaque->logits_shape; + int N = (int)shape[0]; + int C = (int)shape[1]; + long long inner_size = 1; + for (size_t i = 2; i < shape.size(); ++i) + inner_size *= shape[i]; + + long long total = (long long)N * inner_size; + + size_t need_ws = (size_t)total * sizeof(float); + if (workspace_size < need_ws) return INFINI_STATUS_INTERNAL_ERROR; + float* per_sample_loss = reinterpret_cast(workspace); + + const int64_t *typed_target = reinterpret_cast(target); + const int64_t ignore_index = -100; + hcStream_t stream = (hcStream_t)stream_; + + dim3 blockSize(256); + dim3 gridSize((total + blockSize.x - 1) / blockSize.x); + + if (_dtype == INFINI_DTYPE_F32) { + cross_entropy_loss_kernel + <<>>( + per_sample_loss, (const float*)logits, typed_target, + N, C, inner_size, ignore_index); + } else if (_dtype == INFINI_DTYPE_F16) { + cross_entropy_loss_kernel + <<>>( + per_sample_loss, (const half*)logits, typed_target, + N, C, inner_size, ignore_index); + } else if (_dtype == INFINI_DTYPE_BF16) { + cross_entropy_loss_kernel<__hpcc_bfloat16, float> + <<>>( + per_sample_loss, (const __hpcc_bfloat16*)logits, typed_target, + N, C, inner_size, ignore_index); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (hcGetLastError() != hcSuccess) return INFINI_STATUS_INTERNAL_ERROR; + + std::vector h_loss((size_t)total); + std::vector h_target((size_t)total); + if (hcMemcpyAsync(h_loss.data(), per_sample_loss, need_ws, hcMemcpyDeviceToHost, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + if (hcMemcpyAsync(h_target.data(), typed_target, (size_t)total * sizeof(int64_t), hcMemcpyDeviceToHost, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + if (hcStreamSynchronize(stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + + double acc = 0.0; + long long count = 0; + for (long long i = 0; i < total; ++i) { + if (h_target[i] != ignore_index) { + acc += (double)h_loss[i]; + count++; + } + } + double mean = (count > 0) ? 
(acc / (double)count) : 0.0; + + if (_dtype == INFINI_DTYPE_F32) { + float v = (float)mean; + if (hcMemcpyAsync(loss, &v, sizeof(float), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } else if (_dtype == INFINI_DTYPE_F16) { + half v = __float2half((float)mean); + if (hcMemcpyAsync(loss, &v, sizeof(half), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } else if (_dtype == INFINI_DTYPE_BF16) { + __hpcc_bfloat16 v = __float2bfloat16_rn((float)mean); + if (hcMemcpyAsync(loss, &v, sizeof(__hpcc_bfloat16), hcMemcpyHostToDevice, stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + } + if (hcStreamSynchronize(stream) != hcSuccess) + return INFINI_STATUS_INTERNAL_ERROR; + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::cross_entropy_loss::metax diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu new file mode 100644 index 000000000..3d795a67a --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cu @@ -0,0 +1,217 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "cross_entropy_loss_nvidia.cuh" + +namespace op::cross_entropy_loss::nvidia { +namespace cuda { + +__device__ __forceinline__ float to_float(float v) { return v; } +__device__ __forceinline__ float to_float(double v) { return (float)v; } +__device__ __forceinline__ float to_float(half v) { return __half2float(v); } +__device__ __forceinline__ float to_float(__nv_bfloat16 v) { + return __bfloat162float(v); +} + +template +__global__ void +softmaxCrossEntropy_per_sample(T_out *__restrict__ loss, + const T_in *__restrict__ logits, + const int64_t *__restrict__ target, int N, int C, + long long inner_size, int64_t ignore_index) { + long long total = (long long)N * inner_size; + long long idx = (long long)blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= total) { + return; + } + + int n = (int)(idx / inner_size); + int inr = (int)(idx % inner_size); + + int64_t t = target[(long long)n * inner_size + inr]; + if (ignore_index != LLONG_MIN && t == ignore_index) { + loss[idx] = (T_out)0; + return; + } + if (t < 0 || t >= C) { + loss[idx] = (T_out)0; + return; + } + + const long long base = ((long long)n * C * inner_size) + inr; + + // 数值稳定 LSE:lse = log(sum exp(x - m)) + m + float m = -CUDART_INF_F; + for (int c = 0; c < C; ++c) { + m = fmaxf(m, to_float(logits[base + (long long)c * inner_size])); + } + + float sum_exp = 0.f; + for (int c = 0; c < C; ++c) { + sum_exp += expf(to_float(logits[base + (long long)c * inner_size]) - m); + } + + float lse = logf(sum_exp) + m; + float logit_t = to_float(logits[base + (long long)(int)t * inner_size]); + loss[idx] = (T_out)(lse - logit_t); +} + +} // namespace cuda + +struct Descriptor::Opaque { + std::shared_ptr internal; + std::vector logits_shape; + Opaque(std::shared_ptr p) : internal(p) {} + ~Opaque() = default; +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t /*loss_desc*/, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t /*target_desc*/) { +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) + auto handle = reinterpret_cast(handle_); + auto 
dtype = logits_desc->dtype(); + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + const auto &orig = logits_desc->shape(); + auto opaque = new Opaque(handle->internal()); + + if (orig.size() == 1) { + opaque->logits_shape = {1, orig[0]}; + } else { + opaque->logits_shape = orig; + } + + const auto &s = opaque->logits_shape; + long long N = (long long)s[0]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) { + inner *= (long long)s[i]; + } + + size_t workspace_size = (size_t)(N * inner) * sizeof(float); + *desc_ptr = new Descriptor(dtype, workspace_size, opaque, handle->device, + handle->device_id); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *loss, const void *logits, + const void *target, void *stream) const { +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) + const auto &s = _opaque->logits_shape; + int N = (int)s[0]; + int C = (int)s[1]; + long long inner = 1; + for (size_t i = 2; i < s.size(); ++i) { + inner *= (long long)s[i]; + } + long long total = (long long)N * inner; + + size_t need_ws = (size_t)total * sizeof(float); + if (workspace_size < need_ws) { + return INFINI_STATUS_INTERNAL_ERROR; + } + float *per_sample = reinterpret_cast(workspace); + + const int64_t *tgt_i64 = reinterpret_cast(target); + const int64_t ignore_index = -100; + + // 1) 写 per-sample loss -> workspace(float) + dim3 block(256); + dim3 grid((total + block.x - 1) / block.x); + cudaStream_t st = (cudaStream_t)stream; + + if (_dtype == INFINI_DTYPE_F32) { + cuda::softmaxCrossEntropy_per_sample<<>>( + per_sample, (const float *)logits, tgt_i64, N, C, inner, ignore_index); + } else if (_dtype == INFINI_DTYPE_F16) { + cuda::softmaxCrossEntropy_per_sample<<>>( + per_sample, (const half *)logits, tgt_i64, N, C, inner, ignore_index); + } else if (_dtype == INFINI_DTYPE_BF16) { + cuda::softmaxCrossEntropy_per_sample<__nv_bfloat16, float> + <<>>(per_sample, (const __nv_bfloat16 *)logits, + tgt_i64, N, C, inner, ignore_index); + } + { + auto err = cudaGetLastError(); + if (err != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } + + // 2) host 侧 mean(仅统计 target != ignore_index) + std::vector h_loss((size_t)total); + std::vector h_tgt((size_t)total); + if (cudaMemcpyAsync(h_loss.data(), per_sample, need_ws, + cudaMemcpyDeviceToHost, st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + if (cudaMemcpyAsync(h_tgt.data(), tgt_i64, (size_t)total * sizeof(int64_t), + cudaMemcpyDeviceToHost, st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + if (cudaStreamSynchronize(st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + double acc = 0.0; + long long cnt = 0; + for (long long i = 0; i < total; ++i) { + if (h_tgt[i] != ignore_index) { + acc += (double)h_loss[i]; + ++cnt; + } + } + double mean = (cnt > 0) ? 
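The per-sample term produced by the kernel is the numerically stable log-sum-exp form, loss = logsumexp(logits) - logits[target], with the row maximum subtracted before exponentiation. A host-side reference sketch of that computation (ce_per_sample is illustrative; it assumes the pointer is already offset to the base of the class axis for one (n, inner) position, with classes inner_size apart and C >= 1):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Reference for one output position: loss = logsumexp(logits) - logits[t],
    // using max-subtraction so exp never overflows for large logits.
    inline float ce_per_sample(const float *logits, int C, long long inner_size,
                               int64_t t) {
        float m = logits[0];
        for (int c = 1; c < C; ++c) {
            m = std::max(m, logits[(long long)c * inner_size]);
        }
        float sum_exp = 0.f;
        for (int c = 0; c < C; ++c) {
            sum_exp += std::exp(logits[(long long)c * inner_size] - m);
        }
        float lse = std::log(sum_exp) + m;
        return lse - logits[t * inner_size];
    }
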
(acc / (double)cnt) : 0.0; + + // 3) 把标量 mean 写回 device 的 loss 指针(按输入 dtype 写 1 个元素) + if (_dtype == INFINI_DTYPE_F32) { + float v = (float)mean; + if (cudaMemcpyAsync(loss, &v, sizeof(float), cudaMemcpyHostToDevice, st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } else if (_dtype == INFINI_DTYPE_F16) { + half v = __float2half((float)mean); + if (cudaMemcpyAsync(loss, &v, sizeof(half), cudaMemcpyHostToDevice, st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } else if (_dtype == INFINI_DTYPE_BF16) { + __nv_bfloat16 v = __float2bfloat16((float)mean); + if (cudaMemcpyAsync(loss, &v, sizeof(__nv_bfloat16), cudaMemcpyHostToDevice, + st) + != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + } + if (cudaStreamSynchronize(st) != cudaSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} +} // namespace op::cross_entropy_loss::nvidia diff --git a/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh new file mode 100644 index 000000000..843fc943d --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/nvidia/cross_entropy_loss_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __CROSS_ENTROPY_LOSS_CUDA_CUH__ +#define __CROSS_ENTROPY_LOSS_CUDA_CUH__ + +#include "../cross_entropy_loss.h" + +DESCRIPTOR(nvidia) + +#endif // __CROSS_ENTROPY_LOSS_CUDA_CUH__ diff --git a/src/infiniop/ops/cross_entropy_loss/operator.cc b/src/infiniop/ops/cross_entropy_loss/operator.cc new file mode 100644 index 000000000..e9a47558f --- /dev/null +++ b/src/infiniop/ops/cross_entropy_loss/operator.cc @@ -0,0 +1,143 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/cross_entropy_loss.h" + +#ifdef ENABLE_CPU_API +#include "cpu/cross_entropy_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/cross_entropy_loss_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/cross_entropy_metax.h" +#endif + +__C infiniStatus_t infiniopCreateCrossEntropyLossDescriptor( + infiniopHandle_t handle, + infiniopCrossEntropyLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t loss_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::cross_entropy_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast< \ + op::cross_entropy_loss::NAMESPACE::Descriptor **>(desc_ptr), \ + loss_desc, logits_desc, target_desc) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetCrossEntropyLossWorkspaceSize( + infiniopCrossEntropyLossDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc) \ + ->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
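Each backend entry in these switches is a single macro expansion; for instance, GET(INFINI_DEVICE_NVIDIA, nvidia) a few lines up expands to the equivalent of:

    case INFINI_DEVICE_NVIDIA:
        *size = reinterpret_cast<const op::cross_entropy_loss::nvidia::Descriptor *>(desc)
                    ->workspaceSize();
        return INFINI_STATUS_SUCCESS;
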
GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopCrossEntropyLoss( + infiniopCrossEntropyLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *loss, + const void *logits, + const void *target, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc) \ + ->calculate(workspace, workspace_size, loss, logits, target, \ + stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyCrossEntropyLossDescriptor( + infiniopCrossEntropyLossDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast< \ + const op::cross_entropy_loss::NAMESPACE::Descriptor *>(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc new file mode 100644 index 000000000..508dcecc6 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.cc @@ -0,0 +1,284 @@ +#include "interpolate_nearest_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include + +namespace op::interpolate_nearest::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + InterpolateNearestInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const InterpolateNearestInfo &interpolate_info) + : handle(handle_ptr), info(interpolate_info) { + workspace_size = 0; + } + + template + size_t compute_input_index_1d(size_t idx) const { + size_t temp = idx; + + // 1D插值:3D张量 (N, C, W) + size_t w = temp % info.output_size[0]; + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; + } + + // 计算2D插值的输入索引 + template + size_t compute_input_index_2d(size_t idx) const { + size_t temp = idx; + + // 2D插值:4D张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width在索引1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = 
static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; + } + + // 计算3D插值的输入索引 + template + size_t compute_input_index_3d(size_t idx) const { + size_t temp = idx; + + // 3D插值:5D张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width在索引2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height在索引1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth在索引0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = std::min(static_cast(std::floor(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = std::min(static_cast(std::floor(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = std::min(static_cast(std::floor(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; + } + + // 计算输出索引 + template + size_t compute_output_index(size_t idx) const { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } + } + + // 计算总元素数 + size_t calculate_total_elements() const { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; + } + + // 主要的插值计算函数 + template + void interpolate_nearest_cpu(T *output, const T *input) 
const { + size_t total_elements = calculate_total_elements(); + +#pragma omp parallel for schedule(static) + for (ptrdiff_t idx = 0; idx < static_cast(total_elements); ++idx) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx); + break; + default: + continue; + } + + size_t output_idx = compute_output_index(idx); + output[output_idx] = input[input_idx]; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + const InterpolateNearestInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16 && data_type != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_I8: { + int8_t *typed_output = static_cast(output); + const int8_t *typed_input = static_cast(input); + interpolate_nearest_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // 检查数据类型支持 + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16, INFINI_DTYPE_I8); + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, info, opaque->workspace_size, opaque, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::interpolate_nearest::cpu diff --git a/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h 
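The nearest-neighbor mapping used by compute_input_index_1d/2d/3d above is output index -> floor(out_idx * in_size / out_size), clamped to in_size - 1, i.e. the same asymmetric mapping PyTorch uses for mode="nearest". A small worked example (nearest_src is illustrative, not part of the patch), upsampling a width-4 row to width 9:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <cstdio>

    // Map one output coordinate to its nearest-neighbor source coordinate.
    inline size_t nearest_src(size_t out_idx, size_t in_size, size_t out_size) {
        float inv_scale = static_cast<float>(in_size) / static_cast<float>(out_size);
        size_t src = static_cast<size_t>(std::floor(out_idx * inv_scale));
        return std::min(src, in_size - 1);
    }

    int main() {
        // in_size = 4, out_size = 9 -> sources: 0 0 0 1 1 2 2 3 3
        for (size_t w = 0; w < 9; ++w) {
            std::printf("%zu ", nearest_src(w, 4, 9));
        }
        std::printf("\n");
    }
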
new file mode 100644 index 000000000..78dd3ff97 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cpu/interpolate_nearest_cpu.h @@ -0,0 +1,8 @@ +#ifndef __INTERPOLATE_NEAREST_CPU_H__ +#define __INTERPOLATE_NEAREST_CPU_H__ + +#include "../interpolate_nearest.h" + +DESCRIPTOR(cpu) + +#endif // __INTERPOLATE_NEAREST_CPU_H__ diff --git a/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh new file mode 100644 index 000000000..60c798792 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/cuda/kernel.cuh @@ -0,0 +1,168 @@ +#ifndef INTERPOLATE_NEAREST_KERNEL_CUH +#define INTERPOLATE_NEAREST_KERNEL_CUH + +#include "../info.h" +#include + +template +__device__ inline size_t +compute_input_index_1d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 1D 插值:3D 张量 (N, C, W) + size_t w = temp % info.output_size[0]; // width 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale)), + info.input_size[0] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_w * info.input_stride[2]; +} + +template +__device__ inline size_t +compute_input_index_2d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 2D 插值:4D 张量 (N, C, H, W) + size_t w = temp % info.output_size[1]; // width 在索引 1 + temp /= info.output_size[1]; + size_t h = temp % info.output_size[0]; // height 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_h = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_w = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + + size_t input_h = min(static_cast(floorf(static_cast(h) * inv_scale_h)), + info.input_size[0] - 1); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale_w)), + info.input_size[1] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_h * info.input_stride[2] + input_w * info.input_stride[3]; +} + +template +__device__ inline size_t +compute_input_index_3d(size_t idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + + // 3D 插值:5D 张量 (N, C, D, H, W) + size_t w = temp % info.output_size[2]; // width 在索引 2 + temp /= info.output_size[2]; + size_t h = temp % info.output_size[1]; // height 在索引 1 + temp /= info.output_size[1]; + size_t d = temp % info.output_size[0]; // depth 在索引 0 + temp /= info.output_size[0]; + size_t c = temp % info.channels; + size_t b = temp / info.channels; + + float inv_scale_d = static_cast(info.input_size[0]) / static_cast(info.output_size[0]); + float inv_scale_h = static_cast(info.input_size[1]) / static_cast(info.output_size[1]); + float inv_scale_w = static_cast(info.input_size[2]) / static_cast(info.output_size[2]); + + size_t input_d = min(static_cast(floorf(static_cast(d) * inv_scale_d)), + info.input_size[0] - 1); + size_t input_h = min(static_cast(floorf(static_cast(h) * inv_scale_h)), + info.input_size[1] - 1); + size_t input_w = min(static_cast(floorf(static_cast(w) * inv_scale_w)), + info.input_size[2] - 1); + + return b * info.input_stride[0] + c * info.input_stride[1] + input_d * info.input_stride[2] + input_h * info.input_stride[3] + input_w * info.input_stride[4]; +} + +template +__device__ inline size_t +compute_output_index(size_t 
idx, const InterpolateNearestInfo &info) { + size_t temp = idx; + size_t w, h, d, c, b; + + switch (info.dim) { + case INTERPOLATE_1D: { + // 3D 张量 (N, C, W) + w = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + w * info.output_stride[2]; + } + + case INTERPOLATE_2D: { + // 4D 张量 (N, C, H, W) + w = temp % info.output_size[1]; + temp /= info.output_size[1]; + h = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + h * info.output_stride[2] + w * info.output_stride[3]; + } + + case INTERPOLATE_3D: { + // 5D 张量 (N, C, D, H, W) + w = temp % info.output_size[2]; + temp /= info.output_size[2]; + h = temp % info.output_size[1]; + temp /= info.output_size[1]; + d = temp % info.output_size[0]; + temp /= info.output_size[0]; + c = temp % info.channels; + b = temp / info.channels; + return b * info.output_stride[0] + c * info.output_stride[1] + d * info.output_stride[2] + h * info.output_stride[3] + w * info.output_stride[4]; + } + + default: + return 0; + } +} + +__host__ __device__ inline size_t +calculate_total_elements(const InterpolateNearestInfo &info) { + size_t total = info.batch_size * info.channels; + switch (info.dim) { + case INTERPOLATE_1D: + total *= info.output_size[0]; // width + break; + case INTERPOLATE_2D: + total *= info.output_size[0] * info.output_size[1]; // height * width + break; + case INTERPOLATE_3D: + total *= info.output_size[0] * info.output_size[1] * info.output_size[2]; // depth * height * width + break; + } + return total; +} + +template +__global__ void interpolate_nearest_kernel(T *output, const T *input, + InterpolateNearestInfo info) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t total_elements = calculate_total_elements(info); + + if (idx < total_elements) { + size_t input_idx; + + switch (info.dim) { + case INTERPOLATE_1D: + input_idx = compute_input_index_1d(idx, info); + break; + case INTERPOLATE_2D: + input_idx = compute_input_index_2d(idx, info); + break; + case INTERPOLATE_3D: + input_idx = compute_input_index_3d(idx, info); + break; + default: + return; + } + + size_t output_idx = compute_output_index(idx, info); + output[output_idx] = input[input_idx]; + } +} + +#endif // INTERPOLATE_NEAREST_KERNEL_CUH diff --git a/src/infiniop/ops/interpolate_nearest/info.h b/src/infiniop/ops/interpolate_nearest/info.h new file mode 100644 index 000000000..162d6eb02 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/info.h @@ -0,0 +1,118 @@ +#ifndef __INTERPOLATE_NEAREST_INFO_H__ +#define __INTERPOLATE_NEAREST_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +enum InterpolateDim { + INTERPOLATE_1D = 1, // 3D 张量 (N, C, W) + INTERPOLATE_2D = 2, // 4D 张量 (N, C, H, W) + INTERPOLATE_3D = 3 // 5D 张量 (N, C, D, H, W) +}; + +struct InterpolateNearestInfo { + size_t batch_size; + size_t channels; + + // 输入和输出的空间维度大小 + size_t input_size[3]; // [depth/height/width] 根据维度使用不同数量 + size_t output_size[3]; // [depth/height/width] 根据维度使用不同数量 + + InterpolateDim dim; // 插值维度:1D, 2D, 3D + infiniDtype_t dtype; + + // 张量步长(最多支持 5D 张量) + size_t input_stride[5]; + size_t output_stride[5]; + + static infiniStatus_t create( + InterpolateNearestInfo *info, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + // 检查数据类型 + if 
(input_desc->dtype() != output_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto input_shape = input_desc->shape(); + auto output_shape = output_desc->shape(); + auto input_stride = input_desc->strides(); + auto output_stride = output_desc->strides(); + + // 根据张量维度确定插值类型 + if (input_desc->ndim() == 3 && output_desc->ndim() == 3) { + // 1D 插值:3D 张量 (N, C, W) + info->dim = INTERPOLATE_1D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // width + info->output_size[0] = output_shape[2]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 3; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 4 && output_desc->ndim() == 4) { + // 2D 插值:4D 张量 (N, C, H, W) + info->dim = INTERPOLATE_2D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // height + info->input_size[1] = input_shape[3]; // width + info->output_size[0] = output_shape[2]; // height + info->output_size[1] = output_shape[3]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 4; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else if (input_desc->ndim() == 5 && output_desc->ndim() == 5) { + // 3D 插值:5D 张量 (N, C, D, H, W) + info->dim = INTERPOLATE_3D; + info->batch_size = input_shape[0]; + info->channels = input_shape[1]; + info->input_size[0] = input_shape[2]; // depth + info->input_size[1] = input_shape[3]; // height + info->input_size[2] = input_shape[4]; // width + info->output_size[0] = output_shape[2]; // depth + info->output_size[1] = output_shape[3]; // height + info->output_size[2] = output_shape[4]; // width + + // 检查 N,C 维度匹配 + if (input_shape[0] != output_shape[0] || input_shape[1] != output_shape[1]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 复制步长 + for (int i = 0; i < 5; ++i) { + info->input_stride[i] = input_stride[i]; + info->output_stride[i] = output_stride[i]; + } + + } else { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info->dtype = input_desc->dtype(); + return INFINI_STATUS_SUCCESS; + } +}; + +#endif // __INTERPOLATE_NEAREST_INFO_H__ diff --git a/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h new file mode 100644 index 000000000..73499c2ff --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/interpolate_nearest.h @@ -0,0 +1,51 @@ +#ifndef __INTERPOLATE_NEAREST_H__ +#define __INTERPOLATE_NEAREST_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::interpolate_nearest::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + \ + InterpolateNearestInfo _info; \ + infiniDtype_t _dtype; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + InterpolateNearestInfo info, \ + size_t workspace_size, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _dtype(dtype), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + 
static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __INTERPOLATE_NEAREST_H__ diff --git a/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h new file mode 100644 index 000000000..1619dbf2f --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.h @@ -0,0 +1,8 @@ +#ifndef __INTERPOLATE_NEAREST_METAX_H__ +#define __INTERPOLATE_NEAREST_METAX_H__ + +#include "../interpolate_nearest.h" + +DESCRIPTOR(metax) + +#endif // __INTERPOLATE_NEAREST_METAX_H__ diff --git a/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca new file mode 100644 index 000000000..5cf0e5e66 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/metax/interpolate_nearest_metax.maca @@ -0,0 +1,86 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "interpolate_nearest_metax.h" +#include +#include + +#include "../cuda/kernel.cuh" + +namespace op::interpolate_nearest::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + Opaque(std::shared_ptr internal_) : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && + dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +template +inline void launch_interpolate_nearest_kernel(T *output, const T *input, InterpolateNearestInfo info, int grid_size, int block_size, hcStream_t stream) { + interpolate_nearest_kernel<<>>(output, input, info); +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream_) const { + hcStream_t stream = (hcStream_t)stream_; + + int total_elements = calculate_total_elements(_info); + int block_size = 256; + int grid_size = (total_elements + block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_F16: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__hpcc_bfloat16 *>(output); + auto typed_input = 
reinterpret_cast(input); + launch_interpolate_nearest_kernel<__hpcc_bfloat16>(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + launch_interpolate_nearest_kernel(typed_output, typed_input, _info, grid_size, block_size, stream); + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::metax diff --git a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu new file mode 100644 index 000000000..a7b63c6f4 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cu @@ -0,0 +1,93 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "../cuda/kernel.cuh" +#include "interpolate_nearest_nvidia.cuh" +#include +#include +#include + +namespace op::interpolate_nearest::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + + Opaque(std::shared_ptr internal_) + : internal(internal_) {} +}; + +Descriptor::~Descriptor() { delete _opaque; } + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + + auto handle = reinterpret_cast(handle_); + auto dtype = output_desc->dtype(); + + // Check supported data types + if (dtype != INFINI_DTYPE_F16 && dtype != INFINI_DTYPE_F32 && dtype != INFINI_DTYPE_BF16 && dtype != INFINI_DTYPE_I8) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + InterpolateNearestInfo info; + CHECK_STATUS(InterpolateNearestInfo::create(&info, output_desc, input_desc)); + + *desc_ptr = new Descriptor(dtype, info, 0, new Opaque{handle->internal()}, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + + auto cuda_stream = reinterpret_cast(stream); + + size_t total_elements = calculate_total_elements(_info); + + int block_size = 256; + int grid_size = (total_elements + block_size - 1) / block_size; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = reinterpret_cast(output); + const float *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_F16: { + half *typed_output = reinterpret_cast(output); + const half *typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<<>>( + typed_output, typed_input, _info); + } break; + + case INFINI_DTYPE_BF16: { + auto typed_output = reinterpret_cast<__nv_bfloat16 *>(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel<__nv_bfloat16> + <<>>(typed_output, typed_input, + _info); + } break; + + case INFINI_DTYPE_I8: { + auto typed_output = reinterpret_cast(output); + auto typed_input = reinterpret_cast(input); + interpolate_nearest_kernel + <<>>(typed_output, typed_input, + _info); + } break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + CHECK_CUDA(cudaGetLastError()); + CHECK_CUDA(cudaStreamSynchronize(cuda_stream)); + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::interpolate_nearest::nvidia diff --git 
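Both the METAX and NVIDIA launches above size the grid as ceil(total_elements / block) with a fixed block of 256 threads; out-of-range threads are masked by the idx < total_elements guard inside the kernel. A host-side helper mirroring that arithmetic (grid_for is illustrative):

    #include <cstddef>

    // Grid sizing used by the element-wise interpolate kernels (block = 256).
    inline unsigned int grid_for(size_t total_elements, unsigned int block = 256) {
        return static_cast<unsigned int>((total_elements + block - 1) / block);
    }
    // e.g. total_elements = 1000 -> 4 blocks of 256 threads; the last 24
    // threads fall outside the range and simply return.
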
a/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh new file mode 100644 index 000000000..aab5f7882 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/nvidia/interpolate_nearest_nvidia.cuh @@ -0,0 +1,9 @@ +#ifndef __INTERPOLATE_NEAREST_NVIDIA_CUH__ +#define __INTERPOLATE_NEAREST_NVIDIA_CUH__ + +#include "../../../devices/nvidia/nvidia_handle.h" +#include "../interpolate_nearest.h" + +DESCRIPTOR(nvidia) + +#endif // __INTERPOLATE_NEAREST_NVIDIA_CUH__ diff --git a/src/infiniop/ops/interpolate_nearest/operator.cc b/src/infiniop/ops/interpolate_nearest/operator.cc new file mode 100644 index 000000000..0a0f99ee1 --- /dev/null +++ b/src/infiniop/ops/interpolate_nearest/operator.cc @@ -0,0 +1,145 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/interpolate_nearest.h" + +#ifdef ENABLE_CPU_API +#include "cpu/interpolate_nearest_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/interpolate_nearest_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/interpolate_nearest_metax.h" +#endif + +__C infiniStatus_t infiniopCreateInterpolateNearestDescriptor( + infiniopHandle_t handle, + infiniopInterpolateNearestDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::interpolate_nearest::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetInterpolateNearestWorkspaceSize( + infiniopInterpolateNearestDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopInterpolateNearest( + infiniopInterpolateNearestDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyInterpolateNearestDescriptor( + 
infiniopInterpolateNearestDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc new file mode 100644 index 000000000..5c729e7e8 --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.cc @@ -0,0 +1,322 @@ +#include "maxpool_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include + +namespace op::maxpool::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + MaxPoolInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const MaxPoolInfo &maxpool_info) + : handle(handle_ptr), info(maxpool_info) { + // CPU实现通常不需要额外的工作空间 + workspace_size = 0; + } + + // 获取数据类型的最小值 + template + static T get_min_value() { + if constexpr (std::is_same::value) { + return -std::numeric_limits::infinity(); + } else if constexpr (std::is_same::value) { + return _f32_to_f16(-std::numeric_limits::infinity()); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(-std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } + } + + // 比较两个值的大小(处理半精度类型) + template + static bool is_greater(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else { + return a > b; + } + } + + // 1D最大池化 + template + void maxpool_1d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_width + c * input_width; + size_t output_offset = b * channels * output_width + c * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + + output[output_offset + ow] = max_val; + } + } + } + } + + // 2D最大池化 + template + void maxpool_2d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = 
info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_height * input_width + c * input_height * input_width; + size_t output_offset = b * channels * output_height * output_width + c * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + + output[output_offset + oh * output_width + ow] = max_val; + } + } + } + } + } + + // 3D最大池化 + template + void maxpool_3d(T *output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + // 并行处理每个批次和通道 +#pragma omp parallel for collapse(2) schedule(static) + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_depth * input_height * input_width + c * input_depth * input_height * input_width; + size_t output_offset = b * channels * output_depth * output_height * output_width + c * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + bool found_valid = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + T val = input[input_offset + kd * input_height * 
input_width + kh * input_width + kw]; + if (!found_valid || is_greater(val, max_val)) { + max_val = val; + found_valid = true; + } + } + } + } + } + + output[output_offset + od * output_height * output_width + oh * output_width + ow] = max_val; + } + } + } + } + } + } + + // 主要的最大池化计算函数 + template + void maxpool_cpu(T *output, const T *input) const { + switch (info.ndim) { + case 1: + maxpool_1d(output, input); + break; + case 2: + maxpool_2d(output, input); + break; + case 3: + maxpool_3d(output, input); + break; + default: + break; + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + MaxPoolInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *output, const void *input, infiniDtype_t dtype) const { + + if (!output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_output = static_cast(output); + const float *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + fp16_t *typed_output = static_cast(output); + const fp16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_output = static_cast(output); + const bf16_t *typed_input = static_cast(input); + maxpool_cpu(typed_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + return _opaque->calculate(workspace, workspace_size, output, input, _dtype); +} + +} // namespace op::maxpool::cpu diff --git a/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h new file mode 100644 index 000000000..f3ecd349d --- /dev/null +++ b/src/infiniop/ops/maxpool/cpu/maxpool_cpu.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CPU_H__ +#define __MAX_POOL_CPU_H__ + +#include "../maxpool.h" + +DESCRIPTOR(cpu) + +#endif // __MAX_POOL_CPU_H__ diff --git 
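The CPU path above treats padding by skipping out-of-range taps, so padded positions never contribute a value to the max. A compact single-row reference with the same behaviour (maxpool1d_ref is illustrative, not part of the patch):

    #include <algorithm>
    #include <cstddef>
    #include <limits>
    #include <vector>

    // Reference 1D max pooling over one (batch, channel) row, mirroring
    // maxpool_1d above: taps outside [0, in.size()) are skipped.
    inline std::vector<float> maxpool1d_ref(const std::vector<float> &in,
                                            size_t kernel, size_t stride,
                                            size_t pad, size_t out_width) {
        std::vector<float> out(out_width, -std::numeric_limits<float>::infinity());
        for (size_t ow = 0; ow < out_width; ++ow) {
            int start = static_cast<int>(ow * stride) - static_cast<int>(pad);
            for (int kw = start; kw < start + static_cast<int>(kernel); ++kw) {
                if (kw >= 0 && kw < static_cast<int>(in.size())) {
                    out[ow] = std::max(out[ow], in[static_cast<size_t>(kw)]);
                }
            }
        }
        return out;
    }
    // e.g. in = {1, 5, 2, 4}, kernel = 2, stride = 2, pad = 1, out_width = 3
    //      windows [-1,1) [1,3) [3,5)  ->  out = {1, 5, 4}
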
a/src/infiniop/ops/maxpool/info.h b/src/infiniop/ops/maxpool/info.h new file mode 100644 index 000000000..ff56fe28c --- /dev/null +++ b/src/infiniop/ops/maxpool/info.h @@ -0,0 +1,113 @@ +#ifndef __MAX_POOL_INFO_H__ +#define __MAX_POOL_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::maxpool { + +inline utils::Result calculateMaxPoolOutputSize( + size_t input_size, + size_t kernel_size, + size_t stride, + size_t padding = 0, + bool ceil_mode = false) { + + if (stride == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + if (kernel_size == 0) { + return utils::Result(INFINI_STATUS_BAD_PARAM); + } + + // 理论最大输出数 + size_t max_output = 0; + if (ceil_mode) { + max_output = (input_size + 2 * padding - kernel_size + stride - 1) / stride + 1; + } else { + max_output = (input_size + 2 * padding - kernel_size) / stride + 1; + } + + size_t valid_output = 0; + for (size_t i = 0; i < max_output; ++i) { + int64_t start = static_cast(i) * stride - padding; + int64_t end = start + kernel_size; + // 判断区间 [start, end) 和 [0, input_size) 是否有交集 + int64_t real_start = std::max(start, int64_t(0)); + int64_t real_end = std::min(end, int64_t(input_size)); + if (real_end > real_start) { + ++valid_output; + } + } + return utils::Result(valid_output); +} + +class MaxPoolInfo { + MaxPoolInfo() = default; + +public: + std::vector input_dims; + std::vector output_dims; + std::vector kernel_sizes; + std::vector strides; + std::vector pads; + bool ceil_mode; + size_t ndim; + size_t batch; + size_t channels; + + static utils::Result create( + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + MaxPoolInfo info; + + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != output_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->dim(0) != output_desc->dim(0) || input_desc->dim(1) != output_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.ndim = input_desc->ndim() - 2; // spatial dimensions + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // Get spatial dimensions + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + auto output_size = calculateMaxPoolOutputSize( + info.input_dims[i], info.kernel_sizes[i], info.strides[i], info.pads[i], info.ceil_mode); + CHECK_RESULT(output_size); + size_t expected_size = output_size.take(); + if (expected_size != output_desc->dim(i + 2)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + info.output_dims.push_back(output_desc->dim(i + 2)); + } + return utils::Result(std::move(info)); + } +}; +} // namespace op::maxpool + +#endif // __MAX_POOL_INFO_H__ diff --git a/src/infiniop/ops/maxpool/maxpool.h b/src/infiniop/ops/maxpool/maxpool.h new file mode 100644 index 000000000..5ee7703c5 --- /dev/null +++ b/src/infiniop/ops/maxpool/maxpool.h @@ -0,0 +1,53 @@ +#ifndef __MAX_POOL_H__ +#define __MAX_POOL_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::maxpool::NAMESPACE { \ 
+ class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + MaxPoolInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + MaxPoolInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __MAX_POOL_H__ diff --git a/src/infiniop/ops/maxpool/metax/maxpool_metax.cc b/src/infiniop/ops/maxpool/metax/maxpool_metax.cc new file mode 100644 index 000000000..b70286abd --- /dev/null +++ b/src/infiniop/ops/maxpool/metax/maxpool_metax.cc @@ -0,0 +1,217 @@ +#include "maxpool_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +#define DESTROY_hcdnn_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_hcdnn_DESCRIPTORS() \ + do { \ + DESTROY_hcdnn_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_hcdnn_DESCRIPTOR(pooling_desc, hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + hcdnnDataType_t hcdnn_data_type) { + // 创建输入输出张量描述符 + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_desc)); + + // 构建输入输出维度(NCHW格式) + std::vector input_dims = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims = {static_cast(info.batch), + static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims.push_back(static_cast(info.input_dims[i])); + output_dims.push_back(static_cast(info.output_dims[i])); + } + + // 1D池化补充维度 + if (info.ndim == 1) { + input_dims.push_back(1); + output_dims.push_back(1); + } + + // 计算输入输出张量的步幅 + std::vector input_strides(input_dims.size(), 1); + std::vector output_strides(output_dims.size(), 1); + for (int i = input_dims.size() - 2; i >= 0; --i) { + input_strides[i] = input_strides[i + 1] * input_dims[i + 1]; + output_strides[i] = output_strides[i + 1] * output_dims[i + 1]; + } + + // 设置张量描述符(NCHW格式) + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, hcdnn_data_type, input_dims.size(), input_dims.data(), input_strides.data())); + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + output_desc, hcdnn_data_type, 
output_dims.size(), output_dims.data(), + output_strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolInfo &info) { + // 构建池化参数 + std::vector kernel_size, strides, pads; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_size.push_back(static_cast(info.kernel_sizes[i])); + strides.push_back(static_cast(info.strides[i])); + pads.push_back(static_cast(info.pads[i])); + } + + // 1D池化补充维度 + if (info.ndim == 1) { + kernel_size.push_back(1); + strides.push_back(1); + pads.push_back(0); + } + + // 设置最大池化描述符(确定性模式) + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + pooling_desc, HCDNN_POOLING_MAX_DETERMINISTIC, // 确定性最大池化 + HCDNN_NOT_PROPAGATE_NAN, // 不传播NaN + kernel_size.size(), + kernel_size.data(), + pads.data(), + strides.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializehcdnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // 最大池化通常不需要工作空间 + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , + input_desc(other.input_desc), output_desc(other.output_desc), pooling_desc(other.pooling_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_hcdnn_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializehcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_HCDNN_API + const float alpha = 1.0f, beta = 0.0f; + + // 执行最大池化前向计算 + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + CHECK_MCDNN(hcdnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, 
input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::metax diff --git a/src/infiniop/ops/maxpool/metax/maxpool_metax.h b/src/infiniop/ops/maxpool/metax/maxpool_metax.h new file mode 100644 index 000000000..5051358de --- /dev/null +++ b/src/infiniop/ops/maxpool/metax/maxpool_metax.h @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_METAX_H__ +#define __MAX_POOL_METAX_H__ + +#include "../maxpool.h" + +DESCRIPTOR(metax) + +#endif // __MAX_POOL_METAX_CUH__ diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu new file mode 100644 index 000000000..8b94a29c1 --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cu @@ -0,0 +1,240 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "maxpool_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_desc, cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolInfo &info, + cudnnDataType_t cudnn_data_type) { + // Create CUDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_desc)); + + // Setup tensor descriptors + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + input_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptorEx( + output_desc, CUDNN_TENSOR_NCHW, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t 
setupPoolingDescriptor(const MaxPoolInfo &info) { + // Setup pooling descriptor + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(MaxPoolInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Max pooling typically doesn't need workspace + workspace_size = 0; + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , output_desc(other.output_desc) + , pooling_desc(other.pooling_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.output_desc = nullptr; + other.pooling_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolInfo::create(output_desc, input_desc, kernel_size, + strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *output, const void *input, + void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + // 打印input展平后的前十个数据 + // printf("MaxPool input (first 10 elements): "); + // const uint16_t *input_data = static_cast(input); + // for (int i = 0; i < 10; ++i) { + // // 将BF16转换为float显示 + // union { + // uint32_t bits; + // float 
value; + // } converter; + // uint16_t bf16_val = input_data[i]; + // converter.bits = static_cast(bf16_val) << 16; + // printf("%f ", converter.value); + // } + // printf("\n"); + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + CHECK_CUDNN(cudnnPoolingForward(handle, _opaque->pooling_desc, &alpha, + _opaque->input_desc, input, &beta, + _opaque->output_desc, output)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool::nvidia diff --git a/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh new file mode 100644 index 000000000..539ad5a1a --- /dev/null +++ b/src/infiniop/ops/maxpool/nvidia/maxpool_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAX_POOL_CUDA_CUH__ +#define __MAX_POOL_CUDA_CUH__ + +#include "../maxpool.h" + +DESCRIPTOR(nvidia) + +#endif // __MAX_POOL_CUDA_CUH__ diff --git a/src/infiniop/ops/maxpool/operator.cc b/src/infiniop/ops/maxpool/operator.cc new file mode 100644 index 000000000..aedfc0585 --- /dev/null +++ b/src/infiniop/ops/maxpool/operator.cc @@ -0,0 +1,155 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maxpool.h" + +#ifdef ENABLE_CPU_API +#include "cpu/maxpool_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/maxpool_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/maxpool_metax.h" +#endif + +__C infiniStatus_t infiniopCreateMaxPoolDescriptor( + infiniopHandle_t handle, + infiniopMaxPoolDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::maxpool::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxPoolWorkspaceSize( + infiniopMaxPoolDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopMaxPool( + infiniopMaxPoolDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef 
ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyMaxPoolDescriptor(infiniopMaxPoolDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.cc b/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.cc new file mode 100644 index 000000000..ee2ba3628 --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.cc @@ -0,0 +1,638 @@ +#include "maxpool_backward_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include "../../../devices/cpu/cpu_handle.h" +#include "../info.h" +#include +#include +#include +#include +#include + +namespace op::maxpool_backward::cpu { + +struct Descriptor::Opaque { + device::cpu::Handle *handle; + MaxPoolBackwardInfo info; + size_t workspace_size = 0; + +private: + Opaque(device::cpu::Handle *handle_ptr, const MaxPoolBackwardInfo &maxpool_info) + : handle(handle_ptr), info(maxpool_info) { + workspace_size = 0; + } + + // F16专用:使用float计算的最大池化反向传播 + void maxpool_backward_f16_as_float(fp16_t *grad_input, const fp16_t *grad_output, const fp16_t *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + + // 计算总的输入和输出大小 + size_t total_input_size = batch_size * channels; + size_t total_output_size = batch_size * channels; + + for (size_t i = 0; i < info.ndim; ++i) { + total_input_size *= info.input_dims[i]; + total_output_size *= info.output_dims[i]; + } + + // 分配float临时缓冲区 + std::vector float_input(total_input_size); + std::vector float_grad_output(total_output_size); + std::vector float_grad_input(total_input_size, 0.0f); + + // 转换输入数据为float + for (size_t i = 0; i < total_input_size; ++i) { + float_input[i] = utils::cast(input[i]); + } + for (size_t i = 0; i < total_output_size; ++i) { + float_grad_output[i] = utils::cast(grad_output[i]); + } + + // 使用float精度进行计算 + maxpool_backward_cpu_float(float_grad_input.data(), float_grad_output.data(), float_input.data()); + + // 转换结果回F16 + for (size_t i = 0; i < total_input_size; ++i) { + grad_input[i] = utils::cast(float_grad_input[i]); + } + } + + // Float版本的最大池化反向传播 + void maxpool_backward_cpu_float(float *grad_input, const float *grad_output, const float *input) const { + switch (info.ndim) { + case 1: + maxpool_backward_1d_float(grad_input, grad_output, input); + break; + case 2: + maxpool_backward_2d_float(grad_input, grad_output, input); + break; + case 3: + maxpool_backward_3d_float(grad_input, grad_output, input); + break; + default: + break; + } + } + + // 1D float版本 + void maxpool_backward_1d_float(float *grad_input, const float *grad_output, const float *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = 
info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + // 初始化梯度输入为零 + size_t total_input_size = batch_size * channels * input_width; + std::fill(grad_input, grad_input + total_input_size, 0.0f); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_width + c * input_width; + size_t output_offset = b * channels * output_width + c * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + float max_val = -std::numeric_limits::infinity(); + size_t max_idx = 0; + bool found_max = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + size_t real_kw = static_cast(kw); + float val = input[input_offset + real_kw]; + + if (!found_max || val > max_val || (val == max_val && real_kw < max_idx)) { + max_val = val; + max_idx = real_kw; + found_max = true; + } + } + } + + if (found_max) { + grad_input[input_offset + max_idx] += grad_output[output_offset + ow]; + } + } + } + } + } + + // 2D float版本 + void maxpool_backward_2d_float(float *grad_input, const float *grad_output, const float *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + // 初始化梯度输入为零 + size_t total_input_size = batch_size * channels * input_height * input_width; + std::fill(grad_input, grad_input + total_input_size, 0.0f); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_height * input_width + c * input_height * input_width; + size_t output_offset = b * channels * output_height * output_width + c * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float max_val = -std::numeric_limits::infinity(); + size_t max_h = 0, max_w = 0; + bool found_max = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + size_t real_kh = static_cast(kh); + size_t real_kw = static_cast(kw); + float val = input[input_offset + real_kh * input_width + real_kw]; + + size_t linear_idx = real_kh * input_width + real_kw; + size_t old_linear_idx = found_max ? 
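+                                // tie-break toward the smallest linear index so the gradient is routed to the first maximum in the window (intended to match PyTorch's convention)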
max_h * input_width + max_w : SIZE_MAX; + + if (!found_max || val > max_val || (val == max_val && linear_idx < old_linear_idx)) { + max_val = val; + max_h = real_kh; + max_w = real_kw; + found_max = true; + } + } + } + } + + if (found_max) { + size_t grad_input_idx = input_offset + max_h * input_width + max_w; + grad_input[grad_input_idx] += grad_output[output_offset + oh * output_width + ow]; + } + } + } + } + } + } + + // 3D float版本 + void maxpool_backward_3d_float(float *grad_input, const float *grad_output, const float *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + // 初始化梯度输入为零 + size_t total_input_size = batch_size * channels * input_depth * input_height * input_width; + std::fill(grad_input, grad_input + total_input_size, 0.0f); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_depth * input_height * input_width + c * input_depth * input_height * input_width; + size_t output_offset = b * channels * output_depth * output_height * output_width + c * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + float max_val = -std::numeric_limits::infinity(); + size_t max_d = 0, max_h = 0, max_w = 0; + bool found_max = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + + size_t real_kd = static_cast(kd); + size_t real_kh = static_cast(kh); + size_t real_kw = static_cast(kw); + + float val = input[input_offset + real_kd * input_height * input_width + real_kh * input_width + real_kw]; + + size_t linear_idx = real_kd * input_height * input_width + real_kh * input_width + real_kw; + size_t old_linear_idx = found_max ? 
max_d * input_height * input_width + max_h * input_width + max_w : SIZE_MAX; + + if (!found_max || val > max_val || (val == max_val && linear_idx < old_linear_idx)) { + max_val = val; + max_d = real_kd; + max_h = real_kh; + max_w = real_kw; + found_max = true; + } + } + } + } + } + + if (found_max) { + size_t grad_input_idx = input_offset + max_d * input_height * input_width + max_h * input_width + max_w; + grad_input[grad_input_idx] += grad_output[output_offset + od * output_height * output_width + oh * output_width + ow]; + } + } + } + } + } + } + } + + // 获取数据类型的最小值 + template + static T get_min_value() { + if constexpr (std::is_same::value) { + return -std::numeric_limits::infinity(); + } else if constexpr (std::is_same::value) { + return _f32_to_f16(-std::numeric_limits::infinity()); + } else if constexpr (std::is_same::value) { + return _f32_to_bf16(-std::numeric_limits::infinity()); + } else { + return std::numeric_limits::lowest(); + } + } + + // 比较两个值的大小(处理半精度类型) + template + static bool is_greater(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) > utils::cast(b); + } else { + return a > b; + } + } + + // 检查两个值是否相等(处理半精度类型) + template + static bool values_equal(const T &a, const T &b) { + if constexpr (std::is_same::value) { + return utils::cast(a) == utils::cast(b); + } else if constexpr (std::is_same::value) { + return utils::cast(a) == utils::cast(b); + } else { + return a == b; + } + } + + // 原始的通用实现(用于F32和BF16) + template + void maxpool_backward_cpu(T *grad_input, const T *grad_output, const T *input) const { + switch (info.ndim) { + case 1: + maxpool_backward_1d_generic(grad_input, grad_output, input); + break; + case 2: + maxpool_backward_2d_generic(grad_input, grad_output, input); + break; + case 3: + maxpool_backward_3d_generic(grad_input, grad_output, input); + break; + default: + break; + } + } + + template + void maxpool_backward_1d_generic(T *grad_input, const T *grad_output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_width = info.input_dims[0]; + size_t output_width = info.output_dims[0]; + size_t kernel_width = info.kernel_sizes[0]; + size_t stride_width = info.strides[0]; + size_t pad_width = info.pads[0]; + + size_t total_input_size = batch_size * channels * input_width; + std::fill(grad_input, grad_input + total_input_size, T{}); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_width + c * input_width; + size_t output_offset = b * channels * output_width + c * output_width; + + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + size_t max_idx = 0; + bool found_max = false; + + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kw = start_w; kw < end_w; ++kw) { + if (kw >= 0 && kw < static_cast(input_width)) { + size_t real_kw = static_cast(kw); + T val = input[input_offset + real_kw]; + + if (!found_max || is_greater(val, max_val) || (values_equal(val, max_val) && real_kw < max_idx)) { + max_val = val; + max_idx = real_kw; + found_max = true; + } + } + } + + if (found_max) { + if constexpr (std::is_same::value) { + float current = utils::cast(grad_input[input_offset + max_idx]); + float to_add = utils::cast(grad_output[output_offset + ow]); + grad_input[input_offset + max_idx] = 
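+                            // accumulate in float and convert back once, avoiding a lossy in-place += on the reduced-precision type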
utils::cast(current + to_add); + } else { + grad_input[input_offset + max_idx] += grad_output[output_offset + ow]; + } + } + } + } + } + } + + template + void maxpool_backward_2d_generic(T *grad_input, const T *grad_output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_height = info.input_dims[0]; + size_t input_width = info.input_dims[1]; + size_t output_height = info.output_dims[0]; + size_t output_width = info.output_dims[1]; + size_t kernel_height = info.kernel_sizes[0]; + size_t kernel_width = info.kernel_sizes[1]; + size_t stride_height = info.strides[0]; + size_t stride_width = info.strides[1]; + size_t pad_height = info.pads[0]; + size_t pad_width = info.pads[1]; + + size_t total_input_size = batch_size * channels * input_height * input_width; + std::fill(grad_input, grad_input + total_input_size, T{}); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_height * input_width + c * input_height * input_width; + size_t output_offset = b * channels * output_height * output_width + c * output_height * output_width; + + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + size_t max_h = 0, max_w = 0; + bool found_max = false; + + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + size_t real_kh = static_cast(kh); + size_t real_kw = static_cast(kw); + T val = input[input_offset + real_kh * input_width + real_kw]; + + size_t linear_idx = real_kh * input_width + real_kw; + size_t old_linear_idx = found_max ? 
max_h * input_width + max_w : SIZE_MAX; + + if (!found_max || is_greater(val, max_val) || (values_equal(val, max_val) && linear_idx < old_linear_idx)) { + max_val = val; + max_h = real_kh; + max_w = real_kw; + found_max = true; + } + } + } + } + + if (found_max) { + size_t grad_input_idx = input_offset + max_h * input_width + max_w; + if constexpr (std::is_same::value) { + float current = utils::cast(grad_input[grad_input_idx]); + float to_add = utils::cast(grad_output[output_offset + oh * output_width + ow]); + grad_input[grad_input_idx] = utils::cast(current + to_add); + } else { + grad_input[grad_input_idx] += grad_output[output_offset + oh * output_width + ow]; + } + } + } + } + } + } + } + + template + void maxpool_backward_3d_generic(T *grad_input, const T *grad_output, const T *input) const { + size_t batch_size = info.batch; + size_t channels = info.channels; + size_t input_depth = info.input_dims[0]; + size_t input_height = info.input_dims[1]; + size_t input_width = info.input_dims[2]; + size_t output_depth = info.output_dims[0]; + size_t output_height = info.output_dims[1]; + size_t output_width = info.output_dims[2]; + size_t kernel_depth = info.kernel_sizes[0]; + size_t kernel_height = info.kernel_sizes[1]; + size_t kernel_width = info.kernel_sizes[2]; + size_t stride_depth = info.strides[0]; + size_t stride_height = info.strides[1]; + size_t stride_width = info.strides[2]; + size_t pad_depth = info.pads[0]; + size_t pad_height = info.pads[1]; + size_t pad_width = info.pads[2]; + + size_t total_input_size = batch_size * channels * input_depth * input_height * input_width; + std::fill(grad_input, grad_input + total_input_size, T{}); + + for (size_t b = 0; b < batch_size; ++b) { + for (size_t c = 0; c < channels; ++c) { + size_t input_offset = b * channels * input_depth * input_height * input_width + c * input_depth * input_height * input_width; + size_t output_offset = b * channels * output_depth * output_height * output_width + c * output_depth * output_height * output_width; + + for (size_t od = 0; od < output_depth; ++od) { + for (size_t oh = 0; oh < output_height; ++oh) { + for (size_t ow = 0; ow < output_width; ++ow) { + T max_val = get_min_value(); + size_t max_d = 0, max_h = 0, max_w = 0; + bool found_max = false; + + int start_d = static_cast(od * stride_depth) - static_cast(pad_depth); + int end_d = start_d + static_cast(kernel_depth); + int start_h = static_cast(oh * stride_height) - static_cast(pad_height); + int end_h = start_h + static_cast(kernel_height); + int start_w = static_cast(ow * stride_width) - static_cast(pad_width); + int end_w = start_w + static_cast(kernel_width); + + for (int kd = start_d; kd < end_d; ++kd) { + for (int kh = start_h; kh < end_h; ++kh) { + for (int kw = start_w; kw < end_w; ++kw) { + if (kd >= 0 && kd < static_cast(input_depth) && kh >= 0 && kh < static_cast(input_height) && kw >= 0 && kw < static_cast(input_width)) { + + size_t real_kd = static_cast(kd); + size_t real_kh = static_cast(kh); + size_t real_kw = static_cast(kw); + + T val = input[input_offset + real_kd * input_height * input_width + real_kh * input_width + real_kw]; + + size_t linear_idx = real_kd * input_height * input_width + real_kh * input_width + real_kw; + size_t old_linear_idx = found_max ? 
max_d * input_height * input_width + max_h * input_width + max_w : SIZE_MAX; + + if (!found_max || is_greater(val, max_val) || (values_equal(val, max_val) && linear_idx < old_linear_idx)) { + max_val = val; + max_d = real_kd; + max_h = real_kh; + max_w = real_kw; + found_max = true; + } + } + } + } + } + + if (found_max) { + size_t grad_input_idx = input_offset + max_d * input_height * input_width + max_h * input_width + max_w; + if constexpr (std::is_same::value) { + float current = utils::cast(grad_input[grad_input_idx]); + float to_add = utils::cast(grad_output[output_offset + od * output_height * output_width + oh * output_width + ow]); + grad_input[grad_input_idx] = utils::cast(current + to_add); + } else { + grad_input[grad_input_idx] += grad_output[output_offset + od * output_height * output_width + oh * output_width + ow]; + } + } + } + } + } + } + } + } + +public: + Opaque(Opaque &&other) noexcept + : handle(other.handle), + info(std::move(other.info)), + workspace_size(other.workspace_size) { + other.handle = nullptr; + other.workspace_size = 0; + } + + ~Opaque() = default; + + static inline utils::Result + create(device::cpu::Handle *handle_ptr, + MaxPoolBackwardInfo &info, + infiniDtype_t data_type) { + if (data_type != INFINI_DTYPE_F32 && data_type != INFINI_DTYPE_F16 && data_type != INFINI_DTYPE_BF16) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + Opaque opaque(handle_ptr, info); + return utils::Result(std::move(opaque)); + } + + infiniStatus_t calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, infiniDtype_t dtype) const { + + if (!grad_input || !grad_output || !input) { + return INFINI_STATUS_BAD_PARAM; + } + + switch (dtype) { + case INFINI_DTYPE_F32: { + float *typed_grad_input = static_cast(grad_input); + const float *typed_grad_output = static_cast(grad_output); + const float *typed_input = static_cast(input); + maxpool_backward_cpu(typed_grad_input, typed_grad_output, typed_input); + break; + } + + case INFINI_DTYPE_F16: { + // F16特殊处理:转换为float计算 + fp16_t *typed_grad_input = static_cast(grad_input); + const fp16_t *typed_grad_output = static_cast(grad_output); + const fp16_t *typed_input = static_cast(input); + maxpool_backward_f16_as_float(typed_grad_input, typed_grad_output, typed_input); + break; + } + + case INFINI_DTYPE_BF16: { + bf16_t *typed_grad_input = static_cast(grad_input); + const bf16_t *typed_grad_output = static_cast(grad_output); + const bf16_t *typed_input = static_cast(input); + maxpool_backward_cpu(typed_grad_input, typed_grad_output, typed_input); + break; + } + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F32, INFINI_DTYPE_F16, INFINI_DTYPE_BF16); + + auto result = MaxPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle, info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new 
+    Opaque(opaque_result.take());
+
+    *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size,
+                               opaque, handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
+                                     void *grad_input, const void *grad_output,
+                                     const void *input, void *stream) const {
+    return _opaque->calculate(workspace, workspace_size, grad_input, grad_output, input, _dtype);
+}
+
+} // namespace op::maxpool_backward::cpu
diff --git a/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.h b/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.h
new file mode 100644
index 000000000..753a01f58
--- /dev/null
+++ b/src/infiniop/ops/maxpool_backward/cpu/maxpool_backward_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __MAXPOOL_BACKWARD_CPU_H__
+#define __MAXPOOL_BACKWARD_CPU_H__
+
+#include "../maxpool_backward.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __MAXPOOL_BACKWARD_CPU_H__
diff --git a/src/infiniop/ops/maxpool_backward/info.h b/src/infiniop/ops/maxpool_backward/info.h
new file mode 100644
index 000000000..e8d88c577
--- /dev/null
+++ b/src/infiniop/ops/maxpool_backward/info.h
@@ -0,0 +1,81 @@
+#ifndef __MAXPOOL_BACKWARD_INFO_H__
+#define __MAXPOOL_BACKWARD_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include <vector>
+
+namespace op::maxpool_backward {
+
+class MaxPoolBackwardInfo {
+    MaxPoolBackwardInfo() = default;
+
+public:
+    std::vector<size_t> input_dims;  // original input dimensions
+    std::vector<size_t> output_dims; // pooled output dimensions
+    std::vector<size_t> kernel_sizes;
+    std::vector<size_t> strides;
+    std::vector<size_t> pads;
+    bool ceil_mode;
+    size_t ndim;
+    size_t batch;
+    size_t channels;
+
+    static utils::Result<MaxPoolBackwardInfo> create(
+        infiniopTensorDescriptor_t grad_input_desc,  // gradient w.r.t. input
+        infiniopTensorDescriptor_t grad_output_desc, // gradient w.r.t.
output + infiniopTensorDescriptor_t input_desc, // original input + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + + MaxPoolBackwardInfo info; + + // Validate tensor dimensions + if (input_desc->ndim() < 3 || input_desc->ndim() > 5) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + if (input_desc->ndim() != grad_input_desc->ndim() || grad_output_desc->ndim() != grad_input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Check batch and channel dimensions match + if (input_desc->dim(0) != grad_input_desc->dim(0) || input_desc->dim(1) != grad_input_desc->dim(1) || grad_output_desc->dim(0) != grad_input_desc->dim(0) || grad_output_desc->dim(1) != grad_input_desc->dim(1)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // Check spatial dimensions consistency + for (size_t i = 2; i < input_desc->ndim(); ++i) { + if (input_desc->dim(i) != grad_input_desc->dim(i)) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + info.ndim = input_desc->ndim() - 2; // spatial dimensions + info.batch = input_desc->dim(0); + info.channels = input_desc->dim(1); + info.ceil_mode = ceil_mode; + + auto kernel_ptr = reinterpret_cast(kernel_size); + auto stride_ptr = reinterpret_cast(strides); + auto pad_ptr = reinterpret_cast(pads); + + // Store spatial dimensions and pooling parameters + for (size_t i = 0; i < info.ndim; ++i) { + info.input_dims.push_back(input_desc->dim(i + 2)); + info.output_dims.push_back(grad_output_desc->dim(i + 2)); + info.kernel_sizes.push_back(kernel_ptr[i]); + info.strides.push_back(stride_ptr[i]); + info.pads.push_back(pad_ptr[i]); + } + + return utils::Result(std::move(info)); + } +}; + +} // namespace op::maxpool_backward + +#endif // __MAXPOOL_BACKWARD_INFO_H__ diff --git a/src/infiniop/ops/maxpool_backward/maxpool_backward.h b/src/infiniop/ops/maxpool_backward/maxpool_backward.h new file mode 100644 index 000000000..459559b4a --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/maxpool_backward.h @@ -0,0 +1,55 @@ +#ifndef __MAXPOOL_BACKWARD_H__ +#define __MAXPOOL_BACKWARD_H__ + +#include "../../operator.h" +#include "info.h" + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::maxpool_backward::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + MaxPoolBackwardInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + MaxPoolBackwardInfo info, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t grad_input_desc, \ + infiniopTensorDescriptor_t grad_output_desc, \ + infiniopTensorDescriptor_t input_desc, \ + void *kernel_size, \ + void *strides, \ + void *pads, \ + bool ceil_mode); \ + \ + infiniStatus_t calculate( \ + void *workspace, size_t workspace_size, \ + void *grad_input, \ + const void *grad_output, \ + const void *input, \ + void *stream) const; \ + }; \ + } + +#endif // __MAXPOOL_BACKWARD_H__ diff --git a/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.cc b/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.cc new file mode 100644 index 000000000..0a8b1a1ee --- /dev/null 
+++ b/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.cc @@ -0,0 +1,248 @@ +#include "maxpool_backward_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" + +#define DESTROY_HCDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_HCDNN_DESCRIPTORS() \ + do { \ + DESTROY_HCDNN_DESCRIPTOR(input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_input_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(grad_output_desc, hcdnnDestroyTensorDescriptor); \ + DESTROY_HCDNN_DESCRIPTOR(pooling_backward_desc, \ + hcdnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool_backward::metax { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_HCDNN_API + hcdnnTensorDescriptor_t input_desc = nullptr; + hcdnnTensorDescriptor_t grad_input_desc = nullptr; + hcdnnTensorDescriptor_t grad_output_desc = nullptr; + hcdnnPoolingDescriptor_t pooling_backward_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_HCDNN_API + void calculateStrides(const std::vector &dims, std::vector &strides, + int ndim) const { + strides[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * dims[d + 1]; + } + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolBackwardInfo &info, + hcdnnDataType_t hcdnn_data_type) { + // 创建hcdnn描述符 + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_input_desc)); + CHECK_MCDNN(hcdnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_MCDNN(hcdnnCreatePoolingDescriptor(&pooling_backward_desc)); + + // 构建输入、输出梯度维度(NCHW格式) + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + // 1D池化补充维度 + if (info.ndim == 1) { + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + // 计算内存步幅 + std::vector input_strides_vec(input_dims_vec.size()); + std::vector output_strides_vec(output_dims_vec.size()); + calculateStrides(input_dims_vec, input_strides_vec, input_dims_vec.size()); + calculateStrides(output_dims_vec, output_strides_vec, output_dims_vec.size()); + + // 设置张量描述符(带步幅) + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + input_desc, hcdnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_input_desc, hcdnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_MCDNN(hcdnnSetTensorNdDescriptor( + grad_output_desc, hcdnn_data_type, output_dims_vec.size(), + output_dims_vec.data(), output_strides_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolBackwardInfo &info) { + // 构建池化参数 + std::vector kernel_vec, stride_vec, pad_vec; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + // 1D池化补充维度 + if (info.ndim == 1) { + kernel_vec.push_back(1); + 
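+            // pad the 1D case with a unit dimension, presumably because the Nd pooling descriptor does not accept a single spatial dim (same trick as the cuDNN path)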
stride_vec.push_back(1); + pad_vec.push_back(0); + } + + // 设置最大池化反向描述符(确定性模式) + CHECK_MCDNN(hcdnnSetPoolingNdDescriptor( + pooling_backward_desc, HCDNN_POOLING_MAX_DETERMINISTIC, // 确定性最大池化 + HCDNN_NOT_PROPAGATE_NAN, // 不传播NaN + kernel_vec.size(), + kernel_vec.data(), + pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeHcdnnContext(MaxPoolBackwardInfo &info, + infiniDtype_t data_type) { + hcdnnDataType_t hcdnn_data_type = device::metax::getHcdnnDtype(data_type); + + CHECK_STATUS(createPoolingDescriptors(info, hcdnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // 计算工作空间大小(需存储前向输出用于反向计算) + CHECK_MCDNN(hcdnnGetTensorSizeInBytes(grad_output_desc, &workspace_size)); + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) +#ifdef ENABLE_HCDNN_API + , + input_desc(other.input_desc), grad_input_desc(other.grad_input_desc), grad_output_desc(other.grad_output_desc), pooling_backward_desc(other.pooling_backward_desc) +#endif + { +#ifdef ENABLE_HCDNN_API + other.input_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_output_desc = nullptr; + other.pooling_backward_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_HCDNN_API + CLEANUP_HCDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolBackwardInfo &info, infiniDtype_t data_type) { +#ifdef ENABLE_HCDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeHcdnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_HCDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, void *stream) const { + +#ifdef ENABLE_HCDNN_API + const float alpha = 1.0f, beta = 0.0f; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + CHECK_STATUS(_opaque->internal->useMcdnn( + (hcStream_t)stream, [&](hcdnnHandle_t handle) { + void *temp_output = workspace; + CHECK_MCDNN(hcdnnPoolingForward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->input_desc, input, &beta, _opaque->grad_output_desc, 
temp_output)); + + CHECK_MCDNN(hcdnnPoolingBackward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->grad_output_desc, temp_output, // 前向输出(用于定位最大值) + _opaque->grad_output_desc, grad_output, // 输出梯度 + _opaque->input_desc, input, // 前向输入 + &beta, + _opaque->grad_input_desc, grad_input // 输入梯度(输出) + )); + return INFINI_STATUS_SUCCESS; + })); + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool_backward::metax diff --git a/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.h b/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.h new file mode 100644 index 000000000..5133090e2 --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/metax/maxpool_backward_metax.h @@ -0,0 +1,8 @@ +#ifndef __MAXPOOL_BACKWARD_METAX_H__ +#define __MAXPOOL_BACKWARD_METAX_H__ + +#include "../maxpool_backward.h" + +DESCRIPTOR(metax) + +#endif // __MAXPOOL_BACKWARD_METAX_H__ diff --git a/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cu b/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cu new file mode 100644 index 000000000..7247ffede --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cu @@ -0,0 +1,270 @@ +#include "../../../devices/nvidia/nvidia_common.cuh" +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "maxpool_backward_nvidia.cuh" + +#define DESTROY_CUDNN_DESCRIPTOR(desc_ptr, destroy_func) \ + do { \ + if (desc_ptr) { \ + destroy_func(desc_ptr); \ + desc_ptr = nullptr; \ + } \ + } while (0) + +#define CLEANUP_CUDNN_DESCRIPTORS() \ + do { \ + DESTROY_CUDNN_DESCRIPTOR(input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_input_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(grad_output_desc, cudnnDestroyTensorDescriptor); \ + DESTROY_CUDNN_DESCRIPTOR(pooling_backward_desc, \ + cudnnDestroyPoolingDescriptor); \ + } while (0) + +namespace op::maxpool_backward::nvidia { + +struct Descriptor::Opaque { + std::shared_ptr internal; + size_t workspace_size = 0; + +#ifdef ENABLE_CUDNN_API + cudnnTensorDescriptor_t input_desc = nullptr; + cudnnTensorDescriptor_t grad_input_desc = nullptr; + cudnnTensorDescriptor_t grad_output_desc = nullptr; + cudnnPoolingDescriptor_t pooling_backward_desc = nullptr; +#endif + +private: + Opaque(std::shared_ptr internal_ptr) + : internal(internal_ptr) {} + +#ifdef ENABLE_CUDNN_API + infiniStatus_t getCudnnDataType(infiniDtype_t data_type, + cudnnDataType_t &cudnn_data_type) const { + if (data_type == INFINI_DTYPE_F16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_F32) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else if (data_type == INFINI_DTYPE_BF16) { + cudnn_data_type = device::nvidia::getCudnnDtype(data_type); + } else { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + return INFINI_STATUS_SUCCESS; + } + + void calculateStrides(const std::vector &dims, std::vector &strides, + int ndim) const { + strides[ndim - 1] = 1; + for (int d = ndim - 2; d >= 0; --d) { + strides[d] = strides[d + 1] * dims[d + 1]; + } + } + + infiniStatus_t createPoolingDescriptors(const MaxPoolBackwardInfo &info, + cudnnDataType_t cudnn_data_type) { + // Create CUDNN descriptors + CHECK_CUDNN(cudnnCreateTensorDescriptor(&input_desc)); + CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_input_desc)); + 
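+        // grad_input reuses the original input's shape and strides; grad_output follows the pooled output layout (both descriptors are filled in below)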
CHECK_CUDNN(cudnnCreateTensorDescriptor(&grad_output_desc)); + CHECK_CUDNN(cudnnCreatePoolingDescriptor(&pooling_backward_desc)); + + // Setup tensor descriptors + std::vector input_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + std::vector output_dims_vec = {static_cast(info.batch), + static_cast(info.channels)}; + + for (size_t i = 0; i < info.ndim; ++i) { + input_dims_vec.push_back(static_cast(info.input_dims[i])); + output_dims_vec.push_back(static_cast(info.output_dims[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + input_dims_vec.push_back(1); + output_dims_vec.push_back(1); + } + + // Calculate memory strides + std::vector input_strides_vec(input_dims_vec.size()); + std::vector output_strides_vec(output_dims_vec.size()); + calculateStrides(input_dims_vec, input_strides_vec, input_dims_vec.size()); + calculateStrides(output_dims_vec, output_strides_vec, + output_dims_vec.size()); + + // Set tensor descriptors with strides + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + input_desc, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_input_desc, cudnn_data_type, input_dims_vec.size(), + input_dims_vec.data(), input_strides_vec.data())); + + CHECK_CUDNN(cudnnSetTensorNdDescriptor( + grad_output_desc, cudnn_data_type, output_dims_vec.size(), + output_dims_vec.data(), output_strides_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t setupPoolingDescriptor(const MaxPoolBackwardInfo &info) { + // Setup pooling descriptor + std::vector kernel_vec, stride_vec, pad_vec, pad_vec_backward; + for (size_t i = 0; i < info.ndim; ++i) { + kernel_vec.push_back(static_cast(info.kernel_sizes[i])); + stride_vec.push_back(static_cast(info.strides[i])); + pad_vec.push_back(static_cast(info.pads[i])); + } + + if (info.ndim == 1) { + // For 1D pooling, add dummy dimension + kernel_vec.push_back(1); + stride_vec.push_back(1); + pad_vec.push_back(0); + } + + CHECK_CUDNN(cudnnSetPoolingNdDescriptor( + pooling_backward_desc, CUDNN_POOLING_MAX, CUDNN_NOT_PROPAGATE_NAN, + kernel_vec.size(), kernel_vec.data(), pad_vec.data(), + stride_vec.data())); + + return INFINI_STATUS_SUCCESS; + } + + infiniStatus_t initializeCudnnContext(MaxPoolBackwardInfo &info, + infiniDtype_t data_type) { + cudnnDataType_t cudnn_data_type; + CHECK_STATUS(getCudnnDataType(data_type, cudnn_data_type)); + + CHECK_STATUS(createPoolingDescriptors(info, cudnn_data_type)); + CHECK_STATUS(setupPoolingDescriptor(info)); + + // Calculate workspace size, workspace is required for forward output + CHECK_CUDNN(cudnnGetTensorSizeInBytes(grad_output_desc, &workspace_size)); + + return INFINI_STATUS_SUCCESS; + } +#endif + +public: + Opaque(Opaque &&other) noexcept + : internal(std::move(other.internal)), + workspace_size(other.workspace_size) + // clang-format off +#ifdef ENABLE_CUDNN_API + , input_desc(other.input_desc) + , grad_input_desc(other.grad_input_desc) + , grad_output_desc(other.grad_output_desc) + , pooling_backward_desc(other.pooling_backward_desc) +#endif + // clang-format on + { +#ifdef ENABLE_CUDNN_API + other.input_desc = nullptr; + other.grad_input_desc = nullptr; + other.grad_output_desc = nullptr; + other.pooling_backward_desc = nullptr; +#endif + other.workspace_size = 0; + } + + ~Opaque() { +#ifdef ENABLE_CUDNN_API + CLEANUP_CUDNN_DESCRIPTORS(); +#endif + } + + static inline utils::Result + create(std::shared_ptr internal_ptr, + MaxPoolBackwardInfo &info, infiniDtype_t 
data_type) { +#ifdef ENABLE_CUDNN_API + Opaque opaque(internal_ptr); + auto status = opaque.initializeCudnnContext(info, data_type); + if (status != INFINI_STATUS_SUCCESS) { + return status; + } + return utils::Result(std::move(opaque)); +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif + } +}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create(infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, void *strides, void *pads, + bool ceil_mode) { + +#ifdef ENABLE_CUDNN_API + auto handle = reinterpret_cast(handle_); + auto dtype = input_desc->dtype(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = MaxPoolBackwardInfo::create(grad_input_desc, grad_output_desc, input_desc, + kernel_size, strides, pads, ceil_mode); + CHECK_RESULT(result); + auto info = result.take(); + + auto opaque_result = Opaque::create(handle->internal(), info, dtype); + CHECK_RESULT(opaque_result); + auto opaque = new Opaque(opaque_result.take()); + + *desc_ptr = new Descriptor(dtype, std::move(info), opaque->workspace_size, + opaque, handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size, + void *grad_input, const void *grad_output, + const void *input, void *stream) const { + +#ifdef ENABLE_CUDNN_API + const float alpha = 1.0f, beta = 0.0f; + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + CHECK_STATUS(_opaque->internal->useCudnn( + (cudaStream_t)stream, [&](cudnnHandle_t handle) { + size_t grad_input_size = 0; + CHECK_CUDNN(cudnnGetTensorSizeInBytes(_opaque->grad_input_desc, + &grad_input_size)); + CHECK_CUDA(cudaMemset(grad_input, 0, grad_input_size)); + CHECK_CUDA(cudaMemset(workspace, 0, _workspace_size)); + + void *temp_output = workspace; + CHECK_CUDNN(cudnnPoolingForward( + handle, _opaque->pooling_backward_desc, &alpha, _opaque->input_desc, + input, &beta, _opaque->grad_output_desc, temp_output)); + + CHECK_CUDNN(cudnnPoolingBackward( + handle, _opaque->pooling_backward_desc, &alpha, + _opaque->grad_output_desc, temp_output, _opaque->grad_output_desc, + grad_output, _opaque->input_desc, input, &beta, + _opaque->grad_input_desc, grad_input)); + return INFINI_STATUS_SUCCESS; + })); + + return INFINI_STATUS_SUCCESS; +#else + return INFINI_STATUS_NOT_IMPLEMENTED; +#endif +} + +} // namespace op::maxpool_backward::nvidia diff --git a/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cuh b/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cuh new file mode 100644 index 000000000..f83fee580 --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/nvidia/maxpool_backward_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __MAXPOOL_BACKWARD_NVIDIA_CUH__ +#define __MAXPOOL_BACKWARD_NVIDIA_CUH__ + +#include "../maxpool_backward.h" + +DESCRIPTOR(nvidia) + +#endif // __MAXPOOL_BACKWARD_NVIDIA_CUH__ diff --git a/src/infiniop/ops/maxpool_backward/operator.cc b/src/infiniop/ops/maxpool_backward/operator.cc new file mode 100644 index 000000000..386d9d3b3 --- /dev/null +++ b/src/infiniop/ops/maxpool_backward/operator.cc @@ -0,0 +1,159 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/maxpool_backward.h" + +#ifdef ENABLE_CPU_API 
+#include "cpu/maxpool_backward_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) +#include "nvidia/maxpool_backward_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/maxpool_backward_metax.h" +#endif + +__C infiniStatus_t infiniopCreateMaxPoolBackwardDescriptor( + infiniopHandle_t handle, + infiniopMaxPoolBackwardDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t grad_input_desc, + infiniopTensorDescriptor_t grad_output_desc, + infiniopTensorDescriptor_t input_desc, + void *kernel_size, + void *strides, + void *pads, + bool ceil_mode) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::maxpool_backward::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + grad_input_desc, \ + grad_output_desc, \ + input_desc, \ + kernel_size, \ + strides, \ + pads, \ + ceil_mode) + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetMaxPoolBackwardWorkspaceSize( + infiniopMaxPoolBackwardDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef GET +} + +__C infiniStatus_t infiniopMaxPoolBackward( + infiniopMaxPoolBackwardDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *grad_input, + const void *grad_output, + const void *input, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, \ + grad_input, \ + grad_output, \ + input, \ + stream) + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyMaxPoolBackwardDescriptor(infiniopMaxPoolBackwardDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} diff --git a/test/infiniop/averagepool.py b/test/infiniop/averagepool.py new file mode 100644 index 000000000..79799ca6a --- /dev/null +++ 
b/test/infiniop/averagepool.py @@ -0,0 +1,239 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from typing import Tuple +import math +from torch.nn import functional as F + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # ============ 1D Average Pooling Tests (converted to MaxPool format) ============ + # Basic cases + ((4, 8, 128), None, (3,), (1,), (0,), False), # kernel=3, stride=1, pad=0 + ((2, 16, 256), None, (5,), (2,), (2,), False), # kernel=5, stride=2, pad=2 + ((8, 4, 64), None, (7,), (3,), (1,), False), # kernel=7, stride=3, pad=1 + # ceil_mode variations + ((1, 3, 99), None, (4,), (3,), (1,), True), # kernel=4, stride=3, pad=1 + ((3, 2, 77), None, (6,), (4,), (0,), True), # kernel=6, stride=4, pad=0 + # ============ 2D Average Pooling Tests ============ + # Basic cases with square kernels + ((2, 3, 64, 64), None, (3, 3), (1, 1), (1, 1), False), + ((4, 16, 128, 128), None, (5, 5), (2, 2), (2, 2), False), + ((1, 8, 96, 96), None, (7, 7), (3, 3), (0, 0), False), + # Rectangular kernels + ((2, 4, 80, 120), None, (3, 5), (1, 2), (1, 2), False), + ((1, 6, 72, 48), None, (7, 3), (2, 1), (3, 1), False), + ((3, 2, 56, 84), None, (2, 4), (2, 3), (0, 2), False), + # ceil_mode variations + ((1, 1, 33, 33), None, (4, 4), (3, 3), (1, 1), True), + ((2, 5, 77, 89), None, (5, 3), (4, 2), (2, 1), True), + # ============ 3D Average Pooling Tests ============ + # Basic cubic kernels + ((1, 2, 32, 32, 32), None, (3, 3, 3), (1, 1, 1), (1, 1, 1), False), + ((2, 4, 48, 48, 48), None, (5, 5, 5), (2, 2, 2), (2, 2, 2), False), + ((1, 1, 64, 64, 64), None, (7, 7, 7), (3, 3, 3), (0, 0, 0), False), + # Non-cubic kernels + ((1, 3, 24, 36, 48), None, (2, 3, 4), (1, 2, 2), (0, 1, 2), False), + ((2, 2, 40, 32, 56), None, (5, 3, 7), (2, 1, 3), (2, 1, 3), False), + ((1, 1, 28, 44, 36), None, (3, 5, 2), (2, 3, 1), (1, 2, 1), False), + # ceil_mode variations + ((1, 1, 27, 27, 27), None, (4, 4, 4), (3, 3, 3), (1, 1, 1), True), + ((2, 2, 33, 45, 39), None, (5, 3, 4), (3, 2, 3), (2, 1, 1), True), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def averagepool(input_tensor, kernel_size, stride, padding, ceil_mode, output_tensor): + ndim = len(input_tensor.shape) - 2 + if ndim == 1: + result = F.avg_pool1d( + input_tensor.to(torch.float32), kernel_size[0], stride[0], padding[0], ceil_mode=ceil_mode + ) + elif ndim == 2: + result = F.avg_pool2d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + elif ndim == 3: + result = F.avg_pool3d( + input_tensor.to(torch.float32), kernel_size, stride, padding, ceil_mode=ceil_mode + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + # 将计算结果转换回原始数据类型 + output_tensor.copy_(result.to(output_tensor.dtype)) + + +def infer_output_shape(input_shape, kernel_size, stride, padding, ceil_mode): + def calc_output_size(input_size, k, s, p, ceil_mode): + return ( + math.ceil((input_size + 2 * p - k) / s + 1) + if ceil_mode + else math.floor((input_size + 2 * p - k) / s + 1) + ) + + batch, channel, 
*spatial = input_shape + output_spatial = [ + calc_output_size(spatial[i], kernel_size[i], stride[i], padding[i], ceil_mode) + for i in range(len(spatial)) + ] + return (batch, channel) + tuple(output_spatial) + + +def tuple_to_void_p(py_tuple: Tuple): + arr = (ctypes.c_uint64 * len(py_tuple))(*py_tuple) + return ctypes.cast(arr, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_shape = infer_output_shape( + input_shape, kernel_size, stride, padding, ceil_mode + ) + output_tensor = TestTensor(output_shape, None, dt=tensor_dtype, device=device) + + print( + f"Testing AvgPool on {InfiniDeviceNames[device]} with input_shape: {input_shape}, kernel_size: {kernel_size}, stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ) + + if sync: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + for tensor in [input_tensor, output_tensor]: + if tensor: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_averagepool(): + check_error( + LIBINFINIOP.infiniopAvgPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_averagepool() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Mismatch for shape {input_shape}, kernel {kernel_size}" + + if PROFILE: + profile_operation( + "PyTorch", + lambda: averagepool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + output_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lib_averagepool, device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyAvgPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file diff --git a/test/infiniop/averagepool_backward.py b/test/infiniop/averagepool_backward.py new file mode 100644 index 000000000..6dd6613ec --- /dev/null +++ b/test/infiniop/averagepool_backward.py @@ -0,0 +1,261 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + 
profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +import math +from torch.nn import functional as F +from typing import Tuple + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # ============ 1D Average Pooling Tests (converted to MaxPool format) ============ + # Basic cases + ((4, 8, 128), None, (3,), (1,), (0,), False), # kernel=3, stride=1, pad=0 + ((2, 16, 256), None, (5,), (2,), (2,), False), # kernel=5, stride=2, pad=2 + ((8, 4, 64), None, (7,), (3,), (1,), False), # kernel=7, stride=3, pad=1 + # ceil_mode variations + ((1, 3, 99), None, (4,), (3,), (1,), True), # kernel=4, stride=3, pad=1 + ((3, 2, 77), None, (6,), (4,), (0,), True), # kernel=6, stride=4, pad=0 + # ============ 2D Average Pooling Tests ============ + # Basic cases with square kernels + ((2, 3, 64, 64), None, (3, 3), (1, 1), (1, 1), False), + ((4, 16, 128, 128), None, (5, 5), (2, 2), (2, 2), False), + ((1, 8, 96, 96), None, (7, 7), (3, 3), (0, 0), False), + # Rectangular kernels + ((2, 4, 80, 120), None, (3, 5), (1, 2), (1, 2), False), + ((1, 6, 72, 48), None, (7, 3), (2, 1), (3, 1), False), + ((3, 2, 56, 84), None, (2, 4), (2, 3), (0, 2), False), + # ceil_mode variations + ((1, 1, 33, 33), None, (4, 4), (3, 3), (1, 1), True), + ((2, 5, 77, 89), None, (5, 3), (4, 2), (2, 1), True), + # ============ 3D Average Pooling Tests ============ + # Basic cubic kernels + ((1, 2, 32, 32, 32), None, (3, 3, 3), (1, 1, 1), (1, 1, 1), False), + ((2, 4, 48, 48, 48), None, (5, 5, 5), (2, 2, 2), (2, 2, 2), False), + ((1, 1, 64, 64, 64), None, (7, 7, 7), (3, 3, 3), (0, 0, 0), False), + # Non-cubic kernels + ((1, 3, 24, 36, 48), None, (2, 3, 4), (1, 2, 2), (0, 1, 2), False), + ((2, 2, 40, 32, 56), None, (5, 3, 7), (2, 1, 3), (2, 1, 3), False), + ((1, 1, 28, 44, 36), None, (3, 5, 2), (2, 3, 1), (1, 2, 1), False), + # ceil_mode variations + ((1, 1, 27, 27, 27), None, (4, 4, 4), (3, 3, 3), (1, 1, 1), True), + ((2, 2, 33, 45, 39), None, (5, 3, 4), (3, 2, 3), (2, 1, 1), True), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] + +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def averagepool_backward( + input_tensor, + grad_output_tensor, + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor, +): + input_tensor_f32 = input_tensor.to(torch.float32).detach().clone().requires_grad_(True) + grad_output_tensor_f32 = grad_output_tensor.to(torch.float32) + + ndim = len(input_tensor.shape) - 2 + if ndim == 1: + output = F.avg_pool1d( + input_tensor_f32, kernel_size[0], stride[0], padding[0], ceil_mode=ceil_mode + ) + elif ndim == 2: + output = F.avg_pool2d( + input_tensor_f32, kernel_size, stride, padding, ceil_mode=ceil_mode + ) + elif ndim == 3: + output = F.avg_pool3d( + input_tensor_f32, kernel_size, stride, padding, ceil_mode=ceil_mode + ) + else: + raise ValueError("Unsupported dimension") + + output.backward(grad_output_tensor_f32) + + # 将计算得到的梯度转换回原始数据类型,并复制到梯度输入张量中 + grad_input_tensor.copy_(input_tensor_f32.grad.to(grad_input_tensor.dtype)) + + +def infer_output_shape(input_shape, kernel_size, stride, padding, ceil_mode): + def calc_output_size(input_size, k, s, p, ceil_mode): + if ceil_mode: + return math.ceil((input_size + 2 * p - k) / s + 1) + else: + return math.floor((input_size + 2 * p - k) / s + 1) + + return (input_shape[0], input_shape[1]) + 
tuple( + calc_output_size( + input_shape[i + 2], kernel_size[i], stride[i], padding[i], ceil_mode + ) + for i in range(len(kernel_size)) + ) + + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_shape = infer_output_shape( + input_shape, kernel_size, stride, padding, ceil_mode + ) + grad_output_tensor = TestTensor( + output_shape, None, dt=tensor_dtype, device=device, scale=1.0 + ) + grad_input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing AvgPoolBackward on {InfiniDeviceNames[device]} with input: {input_shape}, kernel: {kernel_size}, stride: {stride}, pad: {padding}, ceil_mode: {ceil_mode}" + ) + print( + f"Input Tensor: {input_tensor.shape}, Grad Output Tensor: {grad_output_tensor.shape}, Grad Input Tensor: {grad_input_tensor.shape}" + ) + + averagepool_backward( + input_tensor.torch_tensor(), + grad_output_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor.torch_tensor(), + ) + + if sync: + sync() + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateAvgPoolBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input_tensor.descriptor, + grad_output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + for tensor in [input_tensor, grad_output_tensor, grad_input_tensor]: + if tensor: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetAvgPoolBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + def lib_averagepool_backward(): + check_error( + LIBINFINIOP.infiniopAvgPoolBackward( + descriptor, + workspace.data(), + workspace_size.value, + grad_input_tensor.data(), + grad_output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + lib_averagepool_backward() + + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + grad_input_tensor.actual_tensor(), + grad_input_tensor.torch_tensor(), + atol, + rtol, + ) + assert torch.allclose( + grad_input_tensor.actual_tensor(), + grad_input_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + if PROFILE: + profile_operation( + "PyTorch", + lambda: averagepool_backward( + input_tensor.torch_tensor(), + grad_output_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + "lib", lib_averagepool_backward, device, NUM_PRERUN, NUM_ITERATIONS + ) + + check_error(LIBINFINIOP.infiniopDestroyAvgPoolBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mAvgPoolBackward Test Passed!\033[0m") diff --git a/test/infiniop/conv_backward.py b/test/infiniop/conv_backward.py new file mode 
100644 index 000000000..e7a262963 --- /dev/null +++ b/test/infiniop/conv_backward.py @@ -0,0 +1,319 @@ +import torch +import ctypes +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from typing import List, Tuple +import math + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +_TEST_CASES = [ + # 1D Conv Backward Tests + # x_shape, x_stride, w_shape, w_stride, pads, strides, dilations, group + ((2, 4, 16), (64, 16, 1), (8, 4, 5), (20, 5, 1), (0,), (1,), (1,), 1), + ((2, 4, 32), (128, 32, 1), (8, 4, 3), (12, 3, 1), (1,), (2,), (1,), 1), + ((1, 2, 64), (128, 64, 1), (4, 2, 7), (14, 7, 1), (2,), (3,), (1,), 1), + # 2D Conv Backward Tests + ( + (2, 3, 10, 10), + (300, 100, 10, 1), + (6, 3, 3, 3), + (27, 9, 3, 1), + (0, 1), + (1, 2), + (2, 2), + 1, + ), + ( + (1, 2, 28, 14), + (784, 392, 14, 1), + (4, 2, 5, 3), + (30, 15, 3, 1), + (1, 2), + (3, 2), + (1, 1), + 1, + ), + # 3D Conv Backward Tests + ( + (1, 2, 5, 5, 5), + (250, 125, 25, 5, 1), + (4, 2, 2, 2, 2), + (16, 8, 4, 2, 1), + (0, 1, 1), + (1, 1, 1), + (1, 1, 1), + 1, + ), + # Grouped convolution test case + ((2, 4, 16), (64, 16, 1), (4, 2, 3), (6, 3, 1), (1,), (1,), (1,), 2), +] + + +def inferShapeStride( + x_shape: List[int], + w_shape: List[int], + pads: List[int], + strides: List[int], + dilations: List[int], +) -> Tuple[Tuple[int, ...], Tuple[int, ...]]: + assert ( + len(x_shape) + == len(w_shape) + == len(pads) + 2 + == len(dilations) + 2 + == len(strides) + 2 + ), "x and w should have the same length; pads, strides, and dilatinos should have the same length; the length of pads should be that of x - 2" + output_dims = [ + math.floor( + (x_shape[i + 2] + 2 * pads[i] - dilations[i] * (w_shape[i + 2] - 1) - 1) + / strides[i] + + 1 + ) + for i in range(len(pads)) + ] + output_shape = (x_shape[0], w_shape[0]) + tuple(output_dims) + output_strides = [1] + for s in reversed(output_shape[1:]): + output_strides.insert(0, output_strides[0] * s) + output_strides = tuple(output_strides) + return output_shape, output_strides + + +def tuple_to_void_p(py_tuple: Tuple): + array = ctypes.c_int * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + weight_shape, + weight_stride, + pads, + strides, + dilations, + groups, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + assert len(pads) == len(strides) == len(dilations) + input = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=0.01 + ) + weight = TestTensor( + weight_shape, weight_stride, dt=tensor_dtype, device=device, scale=0.01 + ) + output_shape, output_stride = inferShapeStride( + input_shape, weight_shape, pads, strides, dilations + ) + # grad_output = TestTensor(output_shape, output_stride, dt=tensor_dtype, device=device) + bias = TestTensor( + (weight.shape[0],), (1,), dt=tensor_dtype, device=device, scale=0.01 + ) + # bias = None # Disable bias for now + # 1. 
PyTorch reference backward + input_torch = input.torch_tensor().detach().clone().requires_grad_(True) + weight_torch = weight.torch_tensor().detach().clone().requires_grad_(True) + bias_torch = ( + bias.torch_tensor().detach().clone().requires_grad_(True) + if bias is not None + else None + ) + grad_output_torch = torch.randn( + output_shape, dtype=input_torch.dtype, device=input_torch.device + ) + + # Define forward function for reuse + def forward_pass(input_t, weight_t, bias_t): + if len(input_shape) == 3: + return torch.nn.functional.conv1d( + input_t, + weight_t, + bias=bias_t, + stride=strides, + padding=pads, + dilation=dilations, + groups=groups, + ) + elif len(input_shape) == 4: + return torch.nn.functional.conv2d( + input_t, + weight_t, + bias=bias_t, + stride=strides, + padding=pads, + dilation=dilations, + groups=groups, + ) + elif len(input_shape) == 5: + return torch.nn.functional.conv3d( + input_t, + weight_t, + bias=bias_t, + stride=strides, + padding=pads, + dilation=dilations, + groups=groups, + ) + else: + raise NotImplementedError("Unsupported ndim") + + # Forward + y_ref = forward_pass(input_torch, weight_torch, bias_torch) + print( + f"PyTorch output shape: {y_ref.shape}, dtype: {y_ref.dtype}, device: {y_ref.device}" + ) + y_ref.backward(grad_output_torch) + grad_input_ref = input_torch.grad + grad_weight_ref = weight_torch.grad + grad_bias_ref = bias_torch.grad if bias is not None else None + + # 2. infiniop backward + grad_output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device + ) + grad_output_tensor.actual_tensor().copy_(grad_output_torch) + grad_input = TestTensor(input_shape, input_stride, dt=tensor_dtype, device=device) + grad_weight = TestTensor( + weight_shape, weight_stride, dt=tensor_dtype, device=device + ) + grad_bias = ( + TestTensor((weight.shape[0],), (1,), dt=tensor_dtype, device=device) + if bias is not None + else None + ) + + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateConvBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_output_tensor.descriptor, + input.descriptor, + weight.descriptor, + bias.descriptor if bias is not None else None, + tuple_to_void_p(pads), + tuple_to_void_p(strides), + tuple_to_void_p(dilations), + groups, + ) + ) + + for tensor in [ + input, + grad_output_tensor, + weight, + bias, + grad_input, + grad_weight, + grad_bias, + ]: + if tensor is not None: + tensor.destroy_desc() + + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetConvBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, input.device) + + def lib_conv_backward(): + check_error( + LIBINFINIOP.infiniopConvBackward( + descriptor, + workspace.data(), + workspace_size.value, + grad_input.data(), + grad_weight.data(), + grad_bias.data() if grad_bias is not None else None, + grad_output_tensor.data(), + input.data(), + weight.data(), + None, + ) + ) + + lib_conv_backward() + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + # Compare grad_input + if DEBUG: + debug(grad_input.actual_tensor(), grad_input_ref, atol=atol, rtol=rtol) + assert torch.allclose( + grad_input.actual_tensor(), grad_input_ref, atol=atol, rtol=rtol + ) + # Compare grad_weight + if DEBUG: + debug(grad_weight.actual_tensor(), grad_weight_ref, atol=atol, rtol=rtol) + assert torch.allclose( + grad_weight.actual_tensor(), grad_weight_ref, atol=atol, rtol=rtol + ) + # Compare grad_bias + if grad_bias is not 
None: + if DEBUG: + debug(grad_bias.actual_tensor(), grad_bias_ref, atol=atol, rtol=rtol) + assert torch.allclose( + grad_bias.actual_tensor(), grad_bias_ref, atol=atol, rtol=rtol + ) + + if PROFILE: + # PyTorch backward function that recreates the computation graph each time + def torch_conv_backward(): + # Recreate tensors with gradients for each call + input_t = input.torch_tensor().detach().clone().requires_grad_(True) + weight_t = weight.torch_tensor().detach().clone().requires_grad_(True) + bias_t = ( + bias.torch_tensor().detach().clone().requires_grad_(True) + if bias is not None + else None + ) + # Forward pass + y = forward_pass(input_t, weight_t, bias_t) + # Backward pass + y.backward(grad_output_torch) + + # fmt: off + profile_operation("PyTorch", torch_conv_backward, device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lib_conv_backward, device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyConvBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mConvBackward test passed!\033[0m") diff --git a/test/infiniop/cross_entropy_loss.py b/test/infiniop/cross_entropy_loss.py new file mode 100644 index 000000000..acc5cadc4 --- /dev/null +++ b/test/infiniop/cross_entropy_loss.py @@ -0,0 +1,213 @@ +import torch +import ctypes +from ctypes import c_uint64 +import numpy as np + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + infiniopOperatorDescriptor_t, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + TestWorkspace, + InfiniDeviceEnum, +) +from torch.nn import functional as F + +_TEST_CASES = [ + # Single sample classification + ((10,), 10), + ((200,), 200), + # 2D: (N, C) - batch classification + ((4, 10), 10), + ((8, 5), 5), + ((16, 100), 100), + ((32, 1000), 1000), + ((64, 21), 21), + ((128, 50), 50), + # 3D: (N, C, d1) - sequence classification + ((4, 10, 5), 10), + # 4D: (N, C, d1, d2) - image segmentation + ((2, 8, 8, 8), 8), + # 5D: (N, C, d1, d2, d3) - 3D segmentation + ((3, 10, 10, 20, 30), 10), +] + +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16] +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def cross_entropy_loss_pytorch(logits, target): + return F.cross_entropy(logits.double(), target.long(), reduction="mean") + + +def test( + handle, + device, + input_shape, + num_classes, + tensor_dtype=InfiniDtype.F32, + sync=None, +): + # 根据输入形状确定logits和target的形状 + if len(input_shape) == 1: + # Shape (C,) - single sample classification + logits_shape = (num_classes,) + target_shape = (1,) # 修改:使用 (1,) 而不是标量 + else: + # Shape (N, C, [d1], [d2], ...) 
+ logits_shape = input_shape + target_shape = (input_shape[0],) + input_shape[2:] + + print( + f"Testing CrossEntropyLoss on {InfiniDeviceNames[device]} with logits_shape: {logits_shape}, target_shape: {target_shape}, dtype:{InfiniDtypeNames[tensor_dtype]}" + ) + + # 创建logits张量 + logits = TestTensor(logits_shape, None, dt=tensor_dtype, device=device) + + # 创建target张量 + target_torch = torch.randint( + 0, + num_classes, + target_shape, + dtype=torch.long, + device=logits.torch_tensor().device, + ) + target = TestTensor.from_torch(target_torch, dt=InfiniDtype.I64, device=device) + + # 创建loss张量 + loss = TestTensor((1,), None, dt=tensor_dtype, device=device) + + # 计算PyTorch参考损失 + if len(input_shape) == 1: + # 对于一维logits,target需要是标量 + target_scalar = target.torch_tensor()[0] + pytorch_loss = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + pytorch_loss = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + + # 将参考结果存储到loss张量 + loss.torch_tensor()[0] = pytorch_loss.to(loss.torch_tensor().dtype) + + if sync: + sync() + + # 创建算子描述符 + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateCrossEntropyLossDescriptor( + handle, + ctypes.byref(descriptor), + loss.descriptor, + logits.descriptor, + target.descriptor, + ) + ) + + # 销毁tensor的描述符以防止内核直接使用 + for tensor in [logits, target, loss]: + tensor.destroy_desc() + + # 获取工作空间大小并创建工作空间 + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetCrossEntropyLossWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, device) + + # PyTorch参考实现函数 + def torch_cross_entropy(): + if len(input_shape) == 1: + target_scalar = target.torch_tensor()[0] + result = cross_entropy_loss_pytorch(logits.torch_tensor(), target_scalar) + else: + result = cross_entropy_loss_pytorch( + logits.torch_tensor(), target.torch_tensor() + ) + loss.torch_tensor()[0] = result.to(loss.torch_tensor().dtype) + + # InfiniOP实现函数 + def lib_cross_entropy(): + check_error( + LIBINFINIOP.infiniopCrossEntropyLoss( + descriptor, + workspace.data(), + workspace_size.value, + loss.data(), + logits.data(), + target.data(), + None, + ) + ) + + # 执行InfiniOP算子 + lib_cross_entropy() + + if sync: + sync() + + # 验证结果 + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + actual_loss = loss.actual_tensor()[0] + expected_loss = loss.torch_tensor()[0] + + if DEBUG: + print(f"Expected loss: {expected_loss.item()}") + print(f"Actual loss: {actual_loss.item()}") + if target_shape: + print( + f"Target shape: {target_shape}, first few targets: {target.torch_tensor().flatten()[:5]}" + ) + else: + print(f"Target (scalar): {target.torch_tensor()[0].item()}") + debug(actual_loss, expected_loss, atol=atol, rtol=rtol) + + if not torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol): + print("--- ERROR ANALYSIS ---") + print(f"Expected: {expected_loss.item()}, Actual: {actual_loss.item()}") + print(f"Difference: {abs(actual_loss - expected_loss).item()}") + print(f"Tolerance: atol={atol}, rtol={rtol}") + + assert torch.allclose(actual_loss, expected_loss, atol=atol, rtol=rtol) + + # Profile功能 + if PROFILE: + # fmt: off + profile_operation("PyTorch", lambda: torch_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_cross_entropy(), device, NUM_PRERUN, NUM_ITERATIONS) + # fmt: on + + check_error(LIBINFINIOP.infiniopDestroyCrossEntropyLossDescriptor(descriptor)) + + +if __name__ == "__main__": + args = 
get_args() + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + print("\033[92mAll CrossEntropyLoss tests passed!\033[0m") diff --git a/test/infiniop/interpolate_nearest.py b/test/infiniop/interpolate_nearest.py new file mode 100644 index 000000000..335bcd7fd --- /dev/null +++ b/test/infiniop/interpolate_nearest.py @@ -0,0 +1,254 @@ +import torch +import ctypes +from ctypes import c_uint64 + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, output_shape, output_stride) +_TEST_CASES = [ + # 2D test cases - simplified to one line each + ((1, 1, 2, 2), None, (1, 1, 4, 4), None), # Simple contiguous case + ((1, 3, 4, 4), (48, 16, 4, 1), (1, 3, 8, 8), (192, 64, 8, 1)), # 2D upscaling 2x + ((1, 3, 8, 8), (192, 64, 8, 1), (1, 3, 4, 4), (48, 16, 4, 1)), # 2D downscaling 2x + ((2, 4, 2, 2), (16, 4, 2, 1), (2, 4, 6, 6), (144, 36, 6, 1)), # Batch upscaling + ( + (1, 1, 3, 5), + (15, 15, 5, 1), + (1, 1, 9, 10), + (90, 90, 10, 1), + ), # Different aspect ratio + ( + (4, 64, 16, 16), + (16384, 256, 16, 1), + (4, 64, 32, 32), + (65536, 1024, 32, 1), + ), # Large batch + ((1, 1, 1, 1), (1, 1, 1, 1), (1, 1, 7, 7), (49, 49, 7, 1)), # Small to large + ( + (1, 2, 3, 4), + (24, 1, 8, 2), + (1, 2, 6, 8), + (96, 1, 16, 2), + ), # Non-contiguous layout + ((2, 3, 2, 2), (32, 8, 4, 1), (2, 3, 4, 4), (128, 32, 8, 1)), # Padded strides + # 1D test cases + ((1, 3, 8), (24, 8, 1), (1, 3, 16), (48, 16, 1)), # 1D upscaling 2x + ((2, 5, 10), (50, 10, 1), (2, 5, 5), (25, 5, 1)), # 1D downscaling 2x + ((4, 2, 32), (64, 32, 1), (4, 2, 64), (128, 64, 1)), # 1D larger upscaling + # 3D test cases + ( + (1, 2, 2, 2, 2), + (16, 8, 4, 2, 1), + (1, 2, 4, 4, 4), + (128, 64, 16, 4, 1), + ), # 3D upscaling 2x + ( + (1, 1, 2, 3, 4), + (24, 24, 12, 4, 1), + (1, 1, 4, 6, 8), + (192, 192, 48, 8, 1), + ), # 3D uniform upscaling + ( + (3, 2, 5, 5, 5), + (250, 125, 25, 5, 1), + (3, 2, 3, 3, 3), + (54, 27, 9, 3, 1), + ), # 3D non-uniform scaling +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.I8] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, + InfiniDtype.I8: {"atol": 0, "rtol": 0}, +} + +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + + +def interpolate_nearest(input_tensor, output_shape, output_tensor): + """ + Perform nearest neighbor interpolation using PyTorch as reference + """ + # Extract spatial dimensions (H, W) + target_size = output_shape[2:] # Skip batch and channel dimensions + + # Use PyTorch's interpolate function with nearest mode + if input_tensor.dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float32,进行插值,再转换回原类型 + original_dtype = input_tensor.dtype + + # 转换为 float32 进行插值 + float_input = input_tensor.float() + 
result = F.interpolate(float_input, size=target_size, mode="nearest") + + # 转换回原始类型 + result = result.to(original_dtype) + else: + result = F.interpolate(input_tensor, size=target_size, mode="nearest") + + output_tensor.copy_(result) + + +def test( + handle, + device, + input_shape, + input_stride, + output_shape, + output_stride, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input and output tensors + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + output_tensor = TestTensor( + output_shape, output_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing InterpolateNearest on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, output_shape: {output_shape}, " + f"input_stride: {input_stride}, output_stride: {output_stride}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + # Compute reference result using PyTorch + interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ) + + if sync is not None: + sync() + + # Create descriptor for our interpolate_nearest operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateInterpolateNearestDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetInterpolateNearestWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_interpolate_nearest(): + check_error( + LIBINFINIOP.infiniopInterpolateNearest( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_interpolate_nearest() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + output_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Results don't match for shape {input_shape} -> {output_shape}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: interpolate_nearest( + input_tensor.torch_tensor(), output_shape, output_tensor.torch_tensor() + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_interpolate_nearest(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyInterpolateNearestDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py index e8963849c..53e7fe485 100644 --- a/test/infiniop/libinfiniop/op_register.py +++ b/test/infiniop/libinfiniop/op_register.py @@ -4,7 +4,7 @@ infiniopOperatorDescriptor_t, ) -from ctypes import c_int32, c_void_p, 
c_size_t, POINTER, c_float +from ctypes import c_int32, c_void_p, c_size_t, POINTER, c_float, c_bool class OpRegister: @@ -619,3 +619,259 @@ def softplus_(lib): ] lib.infiniopDestroySoftplusDescriptor.restype = c_int32 lib.infiniopDestroySoftplusDescriptor.argtypes = [infiniopOperatorDescriptor_t] + + +@OpRegister.operator +def avg_pool_(lib): + lib.infiniopCreateAvgPoolDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetAvgPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAvgPool.restype = c_int32 + lib.infiniopAvgPool.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyAvgPoolDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def avg_pool_backward_(lib): + lib.infiniopCreateAvgPoolBackwardDescriptor.restype = c_int32 + lib.infiniopCreateAvgPoolBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # grad_input_desc + infiniopTensorDescriptor_t, # grad_output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetAvgPoolBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetAvgPoolBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopAvgPoolBackward.restype = c_int32 + lib.infiniopAvgPoolBackward.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # grad_input + c_void_p, # grad_output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyAvgPoolBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyAvgPoolBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def cross_entropy_loss_(lib): + lib.infiniopCreateCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopCreateCrossEntropyLossDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # loss_desc + infiniopTensorDescriptor_t, # logits_desc + infiniopTensorDescriptor_t, # target_desc + ] + + lib.infiniopGetCrossEntropyLossWorkspaceSize.restype = c_int32 + lib.infiniopGetCrossEntropyLossWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopCrossEntropyLoss.restype = c_int32 + lib.infiniopCrossEntropyLoss.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # loss + c_void_p, # logits + c_void_p, # target + c_void_p, # stream + ] + + lib.infiniopDestroyCrossEntropyLossDescriptor.restype = c_int32 + lib.infiniopDestroyCrossEntropyLossDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def interpolate_nearest_(lib): + lib.infiniopCreateInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopCreateInterpolateNearestDescriptor.argtypes = [ + infiniopHandle_t, + 
POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + ] + + lib.infiniopGetInterpolateNearestWorkspaceSize.restype = c_int32 + lib.infiniopGetInterpolateNearestWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopInterpolateNearest.restype = c_int32 + lib.infiniopInterpolateNearest.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyInterpolateNearestDescriptor.restype = c_int32 + lib.infiniopDestroyInterpolateNearestDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_pool_(lib): + lib.infiniopCreateMaxPoolDescriptor.restype = c_int32 + lib.infiniopCreateMaxPoolDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetMaxPoolWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMaxPool.restype = c_int32 + lib.infiniopMaxPool.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyMaxPoolDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def max_pool_backward_(lib): + lib.infiniopCreateMaxPoolBackwardDescriptor.restype = c_int32 + lib.infiniopCreateMaxPoolBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # grad_input_desc + infiniopTensorDescriptor_t, # grad_output_desc + infiniopTensorDescriptor_t, # input_desc + c_void_p, # kernel_size + c_void_p, # strides + c_void_p, # pads + c_bool, # ceil_mode + ] + + lib.infiniopGetMaxPoolBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetMaxPoolBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopMaxPoolBackward.restype = c_int32 + lib.infiniopMaxPoolBackward.argtypes = [ + infiniopOperatorDescriptor_t, # descriptor + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # grad_input + c_void_p, # grad_output + c_void_p, # input + c_void_p, # stream + ] + + lib.infiniopDestroyMaxPoolBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyMaxPoolBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] + + +@OpRegister.operator +def conv_backward_(lib): + lib.infiniopCreateConvBackwardDescriptor.restype = c_int32 + lib.infiniopCreateConvBackwardDescriptor.argtypes = [ + infiniopHandle_t, + POINTER(infiniopOperatorDescriptor_t), + infiniopTensorDescriptor_t, # grad_output_desc + infiniopTensorDescriptor_t, # input_desc + infiniopTensorDescriptor_t, # weight_desc + infiniopTensorDescriptor_t, # bias_desc (can be None) + c_void_p, # pads + c_void_p, # strides + c_void_p, # dilations + c_size_t, # ndim + ] + + lib.infiniopGetConvBackwardWorkspaceSize.restype = c_int32 + lib.infiniopGetConvBackwardWorkspaceSize.argtypes = [ + infiniopOperatorDescriptor_t, + POINTER(c_size_t), + ] + + lib.infiniopConvBackward.restype = c_int32 + 
lib.infiniopConvBackward.argtypes = [ + infiniopOperatorDescriptor_t, + c_void_p, # workspace + c_size_t, # workspace_size + c_void_p, # grad_input + c_void_p, # grad_weight + c_void_p, # grad_bias + c_void_p, # grad_output + c_void_p, # input + c_void_p, # weight + c_void_p, # stream + ] + + lib.infiniopDestroyConvBackwardDescriptor.restype = c_int32 + lib.infiniopDestroyConvBackwardDescriptor.argtypes = [ + infiniopOperatorDescriptor_t, + ] diff --git a/test/infiniop/libinfiniop/utils.py b/test/infiniop/libinfiniop/utils.py index 510e3d2fa..455c753d2 100644 --- a/test/infiniop/libinfiniop/utils.py +++ b/test/infiniop/libinfiniop/utils.py @@ -66,10 +66,22 @@ def __init__( torch_strides.append(strides[i]) else: torch_shape.append(shape[i]) + torch_dtype = to_torch_dtype(dt) if mode == "random": - self._torch_tensor = torch.rand( - torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] - ) + if dt == InfiniDtype.I8: + self._torch_tensor = torch.randint( + -128, + 128, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) + else: + self._torch_tensor = torch.rand( + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "zeros": self._torch_tensor = torch.zeros( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] @@ -79,7 +91,13 @@ def __init__( torch_shape, dtype=to_torch_dtype(dt), device=torch_device_map[device] ) elif mode == "randint": - self._torch_tensor = torch.randint(-2000000000,2000000000, torch_shape,dtype=to_torch_dtype(dt), device=torch_device_map[device]) + self._torch_tensor = torch.randint( + -2000000000, + 2000000000, + torch_shape, + dtype=to_torch_dtype(dt), + device=torch_device_map[device], + ) elif mode == "manual": assert set_tensor is not None assert torch_shape == list(set_tensor.shape) @@ -91,9 +109,32 @@ def __init__( raise ValueError("Unsupported mode") if scale is not None: - self._torch_tensor *= scale + if torch_dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float,应用 scale,再转换回原类型 + self._torch_tensor = (self._torch_tensor.float() * scale).to( + torch_dtype + ) + else: + self._torch_tensor *= scale + if bias is not None: - self._torch_tensor += bias + if torch_dtype in [ + torch.int8, + torch.uint8, + torch.int16, + torch.int32, + torch.int64, + ]: + # 对于整数类型,先转换为 float,应用 bias,再转换回原类型 + self._torch_tensor = (self._torch_tensor.float() + bias).to(torch_dtype) + else: + self._torch_tensor += bias if strides is not None: self._data_tensor = rearrange_tensor(self._torch_tensor, torch_strides) diff --git a/test/infiniop/maxpool.py b/test/infiniop/maxpool.py new file mode 100644 index 000000000..81ddce060 --- /dev/null +++ b/test/infiniop/maxpool.py @@ -0,0 +1,242 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +# Configuration for profiling +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, kernel_size, stride, padding, ceil_mode) +_TEST_CASES = [ + # 1D max pooling cases + ((1, 3, 8), None, (2,), (2,), (0,), False), + ((2, 4, 16), None, 
(3,), (2,), (1,), False), + ((3, 2, 77), None, (6,), (4,), (3,), True), + # 2D max pooling cases + ((1, 1, 4, 4), None, (2, 2), (2, 2), (0, 0), False), + ((2, 3, 8, 8), None, (3, 3), (2, 2), (1, 1), False), + ((1, 64, 32, 32), None, (2, 2), (2, 2), (0, 0), False), + ((4, 128, 16, 16), None, (3, 3), (1, 1), (1, 1), False), + # 3D max pooling cases + ((1, 1, 4, 4, 4), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), + ((2, 2, 8, 8, 8), None, (2, 3, 3), (2, 2, 2), (0, 1, 1), False), + # Cases with ceil_mode=True + ((1, 1, 7, 7), None, (3, 3), (2, 2), (1, 1), True), + ((1, 2, 5), None, (3,), (2,), (0,), True), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def max_pool(input_tensor, kernel_size, stride, padding, ceil_mode): + """ + Perform max pooling using PyTorch as reference + """ + ndim = len(input_tensor.shape) - 2 # Spatial dimensions + + if ndim == 1: + result = F.max_pool1d( + input_tensor, + kernel_size=kernel_size[0], + stride=stride[0], + padding=padding[0], + ceil_mode=ceil_mode, + ) + elif ndim == 2: + result = F.max_pool2d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + elif ndim == 3: + result = F.max_pool3d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + + return result + + +def tuple_to_void_p(py_tuple: Tuple): + """Convert a python tuple to a ctype void pointer""" + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + + # Compute reference result using PyTorch + torch_ref_output = max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ) + + # Use PyTorch输出shape来初始化output_tensor + output_tensor = TestTensor( + torch_ref_output.shape, None, dt=tensor_dtype, device=device + ) + + print( + f"Testing MaxPool on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, kernel_size: {kernel_size}, " + f"stride: {stride}, padding: {padding}, ceil_mode: {ceil_mode}, " + f"dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + if sync is not None: + sync() + + # Create descriptor for our max pool operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxPoolDescriptor( + handle, + ctypes.byref(descriptor), + output_tensor.descriptor, + input_tensor.descriptor, + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, output_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMaxPoolWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = 
TestWorkspace(workspace_size.value, output_tensor.device) + + def lib_max_pool(): + check_error( + LIBINFINIOP.infiniopMaxPool( + descriptor, + workspace.data(), + workspace_size.value, + output_tensor.data(), + input_tensor.data(), + None, + ) + ) + + # Execute the operation + lib_max_pool() + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ) + + assert torch.allclose( + output_tensor.actual_tensor(), + torch_ref_output, + atol=atol, + rtol=rtol, + ), f"Results don't match for input_shape {input_shape}, kernel_size {kernel_size}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: max_pool( + input_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", lambda: lib_max_pool(), device, NUM_PRERUN, NUM_ITERATIONS + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyMaxPoolDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/test/infiniop/maxpool_backward.py b/test/infiniop/maxpool_backward.py new file mode 100644 index 000000000..4d8bc073c --- /dev/null +++ b/test/infiniop/maxpool_backward.py @@ -0,0 +1,315 @@ +import torch +import ctypes +from ctypes import c_uint64, c_bool + +from libinfiniop import ( + LIBINFINIOP, + TestTensor, + get_test_devices, + check_error, + test_operator, + get_args, + debug, + get_tolerance, + profile_operation, + TestWorkspace, + InfiniDtype, + InfiniDtypeNames, + InfiniDeviceNames, + infiniopOperatorDescriptor_t, +) +from enum import Enum, auto +from typing import List, Tuple +import math +from torch.nn import functional as F + +# Configuration for profiling +DEBUG = False +PROFILE = False +NUM_PRERUN = 10 +NUM_ITERATIONS = 1000 + +# Test cases: (input_shape, input_stride, kernel_size, stride, padding, ceil_mode) +_TEST_CASES = [ + # 1D MaxPool + ((1, 1, 8), None, (2,), (2,), (1,), False), + ((2, 4, 16), None, (3,), (2,), (1,), False), + ((1, 2, 5), None, (3,), (2,), (0,), True), + ((8, 16, 1024), None, (5,), (3,), (2,), False), + # 2D MaxPool + ((2, 1, 9, 4), None, (2, 2), (2, 2), (0, 0), False), + ((3, 6, 16, 8), None, (3, 3), (2, 2), (1, 1), False), + ((3, 9, 16, 32), None, (4, 3), (2, 2), (1, 1), True), + ((5, 12, 24, 40), None, (4, 5), (2, 3), (1, 1), True), + # 3D MaxPool + ((1, 1, 4, 4, 4), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), + ((2, 2, 8, 8, 8), None, (2, 3, 3), (2, 2, 2), (0, 1, 1), False), + ((1, 1, 10, 20, 30), None, (2, 3, 4), (2, 2, 3), (1, 1, 2), False), + # Large batch/channel + ((32, 64, 16, 16), None, (2, 2), (2, 2), (0, 0), False), + ((16, 32, 8, 8, 8), None, (2, 2, 2), (2, 2, 2), (0, 0, 0), False), +] + +# Data types used for testing +_TENSOR_DTYPES = [InfiniDtype.F32, InfiniDtype.BF16, InfiniDtype.F16] + +# Tolerance map for different data types +_TOLERANCE_MAP = { + InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3}, + InfiniDtype.F32: {"atol": 1e-4, "rtol": 1e-4}, + InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2}, +} + + +def max_pool_output_shape_pt( + input_shape, kernel_size, stride, padding, ceil_mode, dtype=torch.float32 +): + # Create a dummy 
tensor to get PyTorch的output shape + dummy = torch.zeros(input_shape, dtype=dtype) + ndim = len(input_shape) - 2 + if ndim == 1: + out = F.max_pool1d( + dummy, kernel_size[0], stride[0], padding[0], ceil_mode=ceil_mode + ) + elif ndim == 2: + out = F.max_pool2d(dummy, kernel_size, stride, padding, ceil_mode=ceil_mode) + elif ndim == 3: + out = F.max_pool3d(dummy, kernel_size, stride, padding, ceil_mode=ceil_mode) + else: + raise ValueError("Unsupported ndim") + return tuple(out.shape) + + +def max_pool_backward( + input_tensor, + grad_output_tensor, + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor, +): + """ + Perform max pooling backward using PyTorch as reference + """ + input_tensor = input_tensor.detach().clone().requires_grad_(True) + ndim = len(input_tensor.shape) - 2 # Spatial dimensions + + # First do forward pass to get indices + if ndim == 1: + output = F.max_pool1d( + input_tensor, + kernel_size=kernel_size[0], + stride=stride[0], + padding=padding[0], + ceil_mode=ceil_mode, + ) + elif ndim == 2: + output = F.max_pool2d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + elif ndim == 3: + output = F.max_pool3d( + input_tensor, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ceil_mode=ceil_mode, + ) + else: + raise ValueError(f"Unsupported spatial dimensions: {ndim}") + output.backward(grad_output_tensor) + grad_input_tensor.copy_(input_tensor.grad) + + +def tuple_to_void_p(py_tuple: Tuple): + """Convert a python tuple to a ctype void pointer""" + array = ctypes.c_uint64 * len(py_tuple) + data_array = array(*py_tuple) + return ctypes.cast(data_array, ctypes.c_void_p) + + +def test( + handle, + device, + input_shape, + input_stride, + kernel_size, + stride, + padding, + ceil_mode, + tensor_dtype=InfiniDtype.F16, + sync=None, +): + # Create input tensor (original input for forward pass) + input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device, scale=1.0 + ) + + # 用PyTorch得出的output shape来初始化grad_output_tensor + torch_dtype = torch.float32 # 只用于推理shape,实际TestTensor用自己的dtype + output_shape = max_pool_output_shape_pt( + input_shape, kernel_size, stride, padding, ceil_mode, dtype=torch_dtype + ) + + # Create grad_output tensor (gradient w.r.t. pooling output) + grad_output_tensor = TestTensor( + output_shape, None, dt=tensor_dtype, device=device, scale=1.0 + ) + + # Create grad_input tensor (gradient w.r.t. pooling input) + grad_input_tensor = TestTensor( + input_shape, input_stride, dt=tensor_dtype, device=device + ) + + print( + f"Testing MaxPoolBackward on {InfiniDeviceNames[device]} with " + f"input_shape: {input_shape}, output_shape: {output_shape}, " + f"kernel_size: {kernel_size}, stride: {stride}, padding: {padding}, " + f"ceil_mode: {ceil_mode}, dtype: {InfiniDtypeNames[tensor_dtype]}" + ) + + # Compute reference result using PyTorch + try: + max_pool_backward( + input_tensor.torch_tensor(), + grad_output_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor.torch_tensor(), + ) + except Exception as e: + print(f"Error during PyTorch reference computation: {e}") + raise + + if sync is not None: + sync() + + # Create descriptor for our max pool backward operator + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateMaxPoolBackwardDescriptor( + handle, + ctypes.byref(descriptor), + grad_input_tensor.descriptor, # gradient w.r.t. 
input (output of this op) + grad_output_tensor.descriptor, # gradient w.r.t. output (input to this op) + input_tensor.descriptor, # original input (for indices) + tuple_to_void_p(kernel_size), + tuple_to_void_p(stride), + tuple_to_void_p(padding), + c_bool(ceil_mode), + ) + ) + + # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel + for tensor in [input_tensor, grad_output_tensor, grad_input_tensor]: + if tensor is not None: + tensor.destroy_desc() + + # Get workspace size + workspace_size = ctypes.c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetMaxPoolBackwardWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, grad_input_tensor.device) + + def lib_max_pool_backward(): + check_error( + LIBINFINIOP.infiniopMaxPoolBackward( + descriptor, + workspace.data(), + workspace_size.value, + grad_input_tensor.data(), # output: gradient w.r.t. input + grad_output_tensor.data(), # input: gradient w.r.t. output + input_tensor.data(), # input: original input tensor + None, + ) + ) + + # Execute the operation + try: + lib_max_pool_backward() + except Exception as e: + print(f"Error during libinfiniop max pool backward operation: {e}") + raise + + # Check results + atol, rtol = get_tolerance(_TOLERANCE_MAP, tensor_dtype) + if DEBUG: + debug( + grad_input_tensor.actual_tensor(), + grad_input_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ) + + actual_result = grad_input_tensor.actual_tensor() + expected_result = grad_input_tensor.torch_tensor() + + # 检查是否有 NaN 或 Inf + if torch.isnan(actual_result).any(): + print("WARNING: Actual result contains NaN values!") + if torch.isinf(actual_result).any(): + print("WARNING: Actual result contains Inf values!") + if torch.isnan(expected_result).any(): + print("WARNING: Expected result contains NaN values!") + if torch.isinf(expected_result).any(): + print("WARNING: Expected result contains Inf values!") + + assert torch.allclose( + grad_input_tensor.actual_tensor(), + grad_input_tensor.torch_tensor(), + atol=atol, + rtol=rtol, + ), f"Results don't match for input_shape {input_shape}, kernel_size {kernel_size}" + + # Profiling workflow + if PROFILE: + profile_operation( + "PyTorch", + lambda: max_pool_backward( + input_tensor.torch_tensor(), + grad_output_tensor.torch_tensor(), + kernel_size, + stride, + padding, + ceil_mode, + grad_input_tensor.torch_tensor(), + ), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + profile_operation( + " lib", + lambda: lib_max_pool_backward(), + device, + NUM_PRERUN, + NUM_ITERATIONS, + ) + + # Clean up + check_error(LIBINFINIOP.infiniopDestroyMaxPoolBackwardDescriptor(descriptor)) + + +if __name__ == "__main__": + args = get_args() + + # Configure testing options + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") diff --git a/xmake.lua b/xmake.lua index 67add0d45..fbb5156c3 100644 --- a/xmake.lua +++ b/xmake.lua @@ -110,6 +110,16 @@ if has_config("metax-gpu") then includes("xmake/metax.lua") end +option("hcdnn") + set_default(true) + set_showmenu(true) + set_description("Whether to compile hcdnn for Metax GPU") +option_end() + +if has_config("hcdnn") then + add_defines("ENABLE_HCDNN_API") +end + -- 摩尔线程 option("moore-gpu") set_default(false)
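
Reviewer note: below is a minimal usage sketch of the AvgPool forward API added by this patch, mirroring the create → query-workspace → run → destroy sequence exercised by test/infiniop/averagepool.py. The handle, tensor descriptors, device buffers, stream, and workspace allocation are assumed to exist already and are outside this diff; the kernel/stride/pad arrays are passed as 64-bit unsigned values because the Python tests marshal them with c_uint64. This is a sketch under those assumptions, not part of the patch itself.

#include <infiniop.h>
#include <stdbool.h>
#include <stddef.h>

/* Sketch only: `handle`, `out_desc`, `in_desc`, `d_out`, `d_in`, `stream`
 * and the workspace buffer are assumed to be created/allocated elsewhere. */
infiniStatus_t run_avgpool_2d(infiniopHandle_t handle,
                              infiniopTensorDescriptor_t out_desc,
                              infiniopTensorDescriptor_t in_desc,
                              void *d_out, const void *d_in,
                              void *workspace_buf, size_t workspace_cap,
                              void *stream) {
    /* 3x3 kernel, stride 2, padding 1 — one of the configurations covered by the tests. */
    size_t kernel[2] = {3, 3}, strides[2] = {2, 2}, pads[2] = {1, 1};

    infiniopAvgPoolDescriptor_t desc;
    infiniStatus_t st = infiniopCreateAvgPoolDescriptor(
        handle, &desc, out_desc, in_desc, kernel, strides, pads, /*ceil_mode=*/false);
    if (st != INFINI_STATUS_SUCCESS) {
        return st;
    }

    size_t workspace_size = 0;
    st = infiniopGetAvgPoolWorkspaceSize(desc, &workspace_size);
    if (st != INFINI_STATUS_SUCCESS || workspace_size > workspace_cap) {
        infiniopDestroyAvgPoolDescriptor(desc);
        return (st != INFINI_STATUS_SUCCESS) ? st : INFINI_STATUS_INSUFFICIENT_WORKSPACE;
    }

    /* Launch the pooling kernel on the caller's stream. */
    st = infiniopAvgPool(desc, workspace_buf, workspace_size, d_out, d_in, stream);

    infiniopDestroyAvgPoolDescriptor(desc);
    return st;
}

The same four-step pattern applies to the other operators introduced here (AvgPoolBackward, MaxPool, MaxPoolBackward, ConvBackward, CrossEntropyLoss, InterpolateNearest); their exact argument orders are the ones registered in test/infiniop/libinfiniop/op_register.py.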