diff --git a/include/infinicore/ops/logcumsumexp.hpp b/include/infinicore/ops/logcumsumexp.hpp
new file mode 100644
index 000000000..7aaaf510a
--- /dev/null
+++ b/include/infinicore/ops/logcumsumexp.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class LogCumSumExp {
+public:
+    using schema = void (*)(Tensor, Tensor, int, bool, bool);
+
+    static void execute(Tensor y, Tensor x, int axis, bool exclusive, bool reverse);
+
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor logcumsumexp(Tensor x, int axis, bool exclusive = false, bool reverse = false);
+
+void logcumsumexp_(Tensor y, Tensor x, int axis, bool exclusive = false, bool reverse = false);
+
+} // namespace infinicore::op
\ No newline at end of file
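Reviewer note: for reference while reading the kernels further down, the semantics this header promises match the following NumPy sketch. This is not part of the patch; `logcumsumexp_ref` is a made-up name.

```python
import numpy as np

def logcumsumexp_ref(x, axis, exclusive=False, reverse=False):
    """y[k] = log(sum(exp(x[0..k]))) along `axis`; the prefix excludes k if exclusive."""
    x = np.asarray(x, dtype=np.float64)
    if reverse:
        x = np.flip(x, axis)
    y = np.logaddexp.accumulate(x, axis=axis)
    if exclusive:
        # shift by one along `axis`; the empty prefix is log(0) = -inf
        y = np.roll(y, 1, axis=axis)
        head = [slice(None)] * x.ndim
        head[axis] = 0
        y[tuple(head)] = -np.inf
    if reverse:
        y = np.flip(y, axis)
    return y
```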
diff --git a/include/infinicore/ops/logical_and.hpp b/include/infinicore/ops/logical_and.hpp
new file mode 100644
index 000000000..6cb543151
--- /dev/null
+++ b/include/infinicore/ops/logical_and.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class LogicalAnd {
+public:
+    // LogicalAnd is a binary op; the schema is (Output, Input1, Input2).
+    using schema = void (*)(Tensor, Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input1, Tensor input2);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor logical_and(Tensor input1, Tensor input2);
+void logical_and_(Tensor output, Tensor input1, Tensor input2);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/logical_not.hpp b/include/infinicore/ops/logical_not.hpp
new file mode 100644
index 000000000..68c947ce8
--- /dev/null
+++ b/include/infinicore/ops/logical_not.hpp
@@ -0,0 +1,23 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class LogicalNot {
+public:
+    // LogicalNot is a unary op; the schema is (Output, Input).
+    using schema = void (*)(Tensor, Tensor);
+
+    static void execute(Tensor output, Tensor input);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// Returns the result in a newly allocated Tensor.
+Tensor logical_not(Tensor input);
+
+// Writes the result into the given output Tensor.
+void logical_not_(Tensor output, Tensor input);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/unfold.hpp b/include/infinicore/ops/unfold.hpp
new file mode 100644
index 000000000..6dea85562
--- /dev/null
+++ b/include/infinicore/ops/unfold.hpp
@@ -0,0 +1,35 @@
+#pragma once
+
+#include <vector>
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Unfold {
+public:
+    // schema: output, input, kernel_sizes, dilations, paddings, strides
+    using schema = void (*)(Tensor, Tensor, const std::vector<int64_t> &, const std::vector<int64_t> &, const std::vector<int64_t> &, const std::vector<int64_t> &);
+
+    static void execute(Tensor output, Tensor input,
+                        const std::vector<int64_t> &kernel_sizes,
+                        const std::vector<int64_t> &dilations,
+                        const std::vector<int64_t> &paddings,
+                        const std::vector<int64_t> &strides);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// Functional API
+Tensor unfold(Tensor input,
+              std::vector<int64_t> kernel_sizes,
+              std::vector<int64_t> dilations,
+              std::vector<int64_t> paddings,
+              std::vector<int64_t> strides);
+
+void unfold_(Tensor output, Tensor input,
+             std::vector<int64_t> kernel_sizes,
+             std::vector<int64_t> dilations,
+             std::vector<int64_t> paddings,
+             std::vector<int64_t> strides);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/vander.hpp b/include/infinicore/ops/vander.hpp
new file mode 100644
index 000000000..1ee4e112e
--- /dev/null
+++ b/include/infinicore/ops/vander.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Vander {
+public:
+    // schema: output, input, N, increasing
+    using schema = void (*)(Tensor, Tensor, int64_t, bool);
+
+    static void execute(Tensor output, Tensor input, int64_t N, bool increasing);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// N defaults to 0 (meaning N = input.size(0), i.e. a square matrix)
+Tensor vander(Tensor input, int64_t N = 0, bool increasing = false);
+void vander_(Tensor output, Tensor input, int64_t N, bool increasing);
+
+} // namespace infinicore::op
\ No newline at end of file
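Reviewer note: `vander` mirrors `numpy.vander`, with `N = 0` playing the role of NumPy's `N = None` (square output). A quick sketch of the expected values:

```python
import numpy as np

x = np.array([1.0, 2.0, 3.0])

# increasing=False (the default): columns are x^(N-1), ..., x^1, x^0.
print(np.vander(x))                   # [[1. 1. 1.]
                                      #  [4. 2. 1.]
                                      #  [9. 3. 1.]]
print(np.vander(x, increasing=True))  # [[1. 1. 1.]
                                      #  [1. 2. 4.]
                                      #  [1. 3. 9.]]
```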
diff --git a/include/infiniop.h b/include/infiniop.h
index c0a09fcb4..301cae708 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -34,6 +34,9 @@
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
+#include "infiniop/ops/logcumsumexp.h"
+#include "infiniop/ops/unfold.h"
+#include "infiniop/ops/vander.h"
 #include "infiniop/tensor_descriptor.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/logcumsumexp.h b/include/infiniop/ops/logcumsumexp.h
new file mode 100644
index 000000000..a1fdc1376
--- /dev/null
+++ b/include/infiniop/ops/logcumsumexp.h
@@ -0,0 +1,30 @@
+#ifndef __INFINIOP_LOGCUMSUMEXP_API_H__
+#define __INFINIOP_LOGCUMSUMEXP_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopLogCumSumExpDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateLogCumSumExpDescriptor(infiniopHandle_t handle,
+                                                                 infiniopLogCumSumExpDescriptor_t *desc_ptr,
+                                                                 infiniopTensorDescriptor_t y,
+                                                                 infiniopTensorDescriptor_t x,
+                                                                 int axis,
+                                                                 int exclusive,
+                                                                 int reverse);
+
+/* Query the workspace size required to run LogCumSumExp. */
+__C __export infiniStatus_t infiniopGetLogCumSumExpWorkspaceSize(infiniopLogCumSumExpDescriptor_t desc,
+                                                                 size_t *size);
+
+__C __export infiniStatus_t infiniopLogCumSumExp(infiniopLogCumSumExpDescriptor_t desc,
+                                                 void *workspace,
+                                                 size_t workspace_size,
+                                                 void *y,
+                                                 const void *x,
+                                                 void *stream);
+
+/* Destroy the descriptor. */
+__C __export infiniStatus_t infiniopDestroyLogCumSumExpDescriptor(infiniopLogCumSumExpDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/unfold.h b/include/infiniop/ops/unfold.h
new file mode 100644
index 000000000..013cba214
--- /dev/null
+++ b/include/infiniop/ops/unfold.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_UNFOLD_API_H__
+#define __INFINIOP_UNFOLD_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopUnfoldDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateUnfoldDescriptor(infiniopHandle_t handle,
+                                                           infiniopUnfoldDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t output,
+                                                           infiniopTensorDescriptor_t input,
+                                                           const int *kernel_sizes,
+                                                           const int *strides,
+                                                           const int *paddings,
+                                                           const int *dilations);
+
+// Query the workspace size required to run Unfold.
+__C __export infiniStatus_t infiniopGetUnfoldWorkspaceSize(infiniopUnfoldDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopUnfold(infiniopUnfoldDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *output,
+                                           const void *input,
+                                           void *stream);
+
+__C __export infiniStatus_t infiniopDestroyUnfoldDescriptor(infiniopUnfoldDescriptor_t desc);
+
+#endif // __INFINIOP_UNFOLD_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/vander.h b/include/infiniop/ops/vander.h
new file mode 100644
index 000000000..43e560434
--- /dev/null
+++ b/include/infiniop/ops/vander.h
@@ -0,0 +1,25 @@
+#ifndef __INFINIOP_VANDER_API_H__
+#define __INFINIOP_VANDER_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopVanderDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateVanderDescriptor(infiniopHandle_t handle,
+                                                           infiniopVanderDescriptor_t *desc_ptr,
+                                                           infiniopTensorDescriptor_t output,
+                                                           infiniopTensorDescriptor_t input,
+                                                           int N,
+                                                           int increasing);
+
+__C __export infiniStatus_t infiniopGetVanderWorkspaceSize(infiniopVanderDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopVander(infiniopVanderDescriptor_t desc,
+                                           void *workspace,
+                                           size_t workspace_size,
+                                           void *output,
+                                           const void *input,
+                                           void *stream);
+
+__C __export infiniStatus_t infiniopDestroyVanderDescriptor(infiniopVanderDescriptor_t desc);
+
+#endif // __INFINIOP_VANDER_API_H__
\ No newline at end of file
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index b7288f3ac..e9869f879 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -51,6 +51,10 @@
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+from infinicore.ops.logcumsumexp import logcumsumexp
+from infinicore.ops.logical_and import logical_and
+from infinicore.ops.logical_not import logical_not
+from infinicore.ops.vander import vander
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -121,6 +125,10 @@
     "from_list",
     "from_numpy",
     "from_torch",
+    "logcumsumexp",
+    "logical_and",
+    "logical_not",
+    "vander",
     "paged_caching",
     "paged_attention",
     "paged_attention_prefill",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..8b58fb727 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,6 +6,7 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .unfold import unfold
 
 __all__ = [
     "causal_softmax",
@@ -16,5 +17,6 @@
     "linear",
     "embedding",
     "rope",
+    "unfold",
     "RopeAlgo",
 ]
diff --git a/python/infinicore/nn/functional/unfold.py b/python/infinicore/nn/functional/unfold.py
new file mode 100644
index 000000000..dc7f69528
--- /dev/null
+++ b/python/infinicore/nn/functional/unfold.py
@@ -0,0 +1,58 @@
+from typing import List, Optional, Tuple, Union
+
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def unfold(
+    input: Tensor,
+    kernel_size: Union[int, Tuple[int, ...], List[int]],
+    dilation: Union[int, Tuple[int, ...], List[int]] = 1,
+    padding: Union[int, Tuple[int, ...], List[int]] = 0,
+    stride: Union[int, Tuple[int, ...], List[int]] = 1,
+    *,
+    out: Optional[Tensor] = None,
+) -> Tensor:
+    r"""Extracts sliding local blocks from a batched input tensor.
+
+    Also known as im2col. The output tensor contains the flattened blocks.
+
+    Args:
+        input (Tensor): The input tensor.
+        kernel_size (int or tuple): The size of the sliding blocks.
+        dilation (int or tuple, optional): The stride of elements within the neighborhood. Default: 1.
+        padding (int or tuple, optional): Implicit zero padding added on both sides of the input. Default: 0.
+        stride (int or tuple, optional): The stride of the sliding blocks. Default: 1.
+        out (Tensor, optional): The output tensor.
+    """
+
+    if not input.is_contiguous():
+        input = input.contiguous()
+
+    # Expand a bare int to one value per spatial dim (assuming 2-D spatial input).
+    def _pair(x):
+        return (x, x) if isinstance(x, int) else x
+
+    k_val = _pair(kernel_size)
+    d_val = _pair(dilation)
+    p_val = _pair(padding)
+    s_val = _pair(stride)
+
+    if out is not None:
+        _infinicore.unfold_(
+            out._underlying,
+            input._underlying,
+            k_val,
+            d_val,
+            p_val,
+            s_val,
+        )
+        return out
+
+    return Tensor(
+        _infinicore.unfold(
+            input._underlying,
+            k_val,
+            d_val,
+            p_val,
+            s_val,
+        )
+    )
\ No newline at end of file
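Reviewer note: a minimal usage sketch of the new functional API (assuming `from_torch` converts a torch tensor, as exported by `infinicore/__init__.py`); the shape convention follows `torch.nn.functional.unfold`:

```python
import torch
import infinicore
from infinicore.nn import functional as F

x = infinicore.from_torch(torch.randn(1, 3, 4, 4))
y = F.unfold(x, kernel_size=2, stride=2)

# Expected shape: (N, C * kH * kW, L) = (1, 3 * 2 * 2, 4),
# the same as torch.nn.functional.unfold(torch_x, kernel_size=2, stride=2).
```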
+ """ + + if not input.is_contiguous(): + input = input.contiguous() + + # Helper to ensure parameters are iterable (assuming 2D spatial dims for single int) + def _pair(x): + return (x, x) if isinstance(x, int) else x + + k_val = _pair(kernel_size) + d_val = _pair(dilation) + p_val = _pair(padding) + s_val = _pair(stride) + + if out is not None: + _infinicore.unfold_( + out._underlying, + input._underlying, + k_val, + d_val, + p_val, + s_val + ) + return out + + return Tensor( + _infinicore.unfold( + input._underlying, + k_val, + d_val, + p_val, + s_val + ) + ) \ No newline at end of file diff --git a/python/infinicore/ops/logcumsumexp.py b/python/infinicore/ops/logcumsumexp.py new file mode 100644 index 000000000..52aa5fd09 --- /dev/null +++ b/python/infinicore/ops/logcumsumexp.py @@ -0,0 +1,23 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def logcumsumexp(input, dim, exclusive=False, reverse=False, *, out=None): + if out is None: + # 调用 C++ 绑定的非原地操作,返回新创建的 _underlying 句柄 + return Tensor(_infinicore.logcumsumexp( + input._underlying, + dim, + exclusive, + reverse + )) + + # 调用 C++ 绑定的原地/指定输出操作 + _infinicore.logcumsumexp_( + out._underlying, + input._underlying, + dim, + exclusive, + reverse + ) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/logical_and.py b/python/infinicore/ops/logical_and.py new file mode 100644 index 000000000..da0e98c72 --- /dev/null +++ b/python/infinicore/ops/logical_and.py @@ -0,0 +1,14 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def logical_and(input: Tensor, other: Tensor, *, out=None) -> Tensor: + r"""Computes the element-wise logical AND of the given input tensors.""" + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.logical_and(input, other, out=out) + + if out is None: + return Tensor(_infinicore.logical_and(input._underlying, other._underlying)) + + _infinicore.logical_and_(out._underlying, input._underlying, other._underlying) + return out \ No newline at end of file diff --git a/python/infinicore/ops/logical_not.py b/python/infinicore/ops/logical_not.py new file mode 100644 index 000000000..d6ff4a3b7 --- /dev/null +++ b/python/infinicore/ops/logical_not.py @@ -0,0 +1,17 @@ +import infinicore +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def logical_not(input: Tensor, *, out=None) -> Tensor: + r"""Computes the element-wise logical NOT of the given input tensors.""" + # 1. 如果启用了 ntops 且设备支持,调用 ntops 实现 + if infinicore.use_ntops and input.device.type in ("cuda", "musa"): + return infinicore.ntops.torch.logical_not(input, out=out) + + # 2. 如果没有提供 out,创建一个新的 Tensor 并返回 + if out is None: + return Tensor(_infinicore.logical_not(input._underlying)) + + # 3. 如果提供了 out,进行原地操作 (In-place operation) + _infinicore.logical_not_(out._underlying, input._underlying) + return out \ No newline at end of file diff --git a/python/infinicore/ops/vander.py b/python/infinicore/ops/vander.py new file mode 100644 index 000000000..907f5f35a --- /dev/null +++ b/python/infinicore/ops/vander.py @@ -0,0 +1,45 @@ +from typing import Optional +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def vander( + x: Tensor, + N: Optional[int] = None, + increasing: bool = False, + *, + out: Optional[Tensor] = None +) -> Tensor: + r"""Generates a Vandermonde matrix. + + The columns of the output matrix are powers of the input vector. 
diff --git a/src/infinicore/ops/logcumsumexp/logcumsumexp.cc b/src/infinicore/ops/logcumsumexp/logcumsumexp.cc
new file mode 100644
index 000000000..274fc6a79
--- /dev/null
+++ b/src/infinicore/ops/logcumsumexp/logcumsumexp.cc
@@ -0,0 +1,35 @@
+#include "infinicore/ops/logcumsumexp.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+// Dispatcher singleton.
+common::OpDispatcher<LogCumSumExp::schema> &LogCumSumExp::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+// Validate devices, then dispatch to the registered implementation.
+void LogCumSumExp::execute(Tensor y, Tensor x, int axis, bool exclusive, bool reverse) {
+    // Input and output must live on the same device.
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+
+    // Switch to the target device's context.
+    infinicore::context::setDevice(y->device());
+    dispatcher().lookup(y->device().getType())(y, x, axis, exclusive, reverse);
+}
+
+// Functional interface: allocates the output tensor and returns it.
+Tensor logcumsumexp(Tensor x, int axis, bool exclusive, bool reverse) {
+    // The output matches the input's shape, dtype and device.
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    logcumsumexp_(y, x, axis, exclusive, reverse);
+    return y;
+}
+
+// Explicit-output interface.
+void logcumsumexp_(Tensor y, Tensor x, int axis, bool exclusive, bool reverse) {
+    LogCumSumExp::execute(y, x, axis, exclusive, reverse);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/logcumsumexp/logcumsumexp_infiniop.cc b/src/infinicore/ops/logcumsumexp/logcumsumexp_infiniop.cc
new file mode 100644
index 000000000..a102f81b6
--- /dev/null
+++ b/src/infinicore/ops/logcumsumexp/logcumsumexp_infiniop.cc
@@ -0,0 +1,69 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/logcumsumexp.hpp"
+#include <cstddef>
+
+namespace infinicore::op::logcumsumexp_impl::infiniop {
+
+// Descriptor cache: owns the lifetime of the infiniop operator descriptors.
+thread_local common::OpCache<size_t, infiniopLogCumSumExpDescriptor_t> caches(
+    100, // capacity
+    [](infiniopLogCumSumExpDescriptor_t &desc) {
+        if (desc != nullptr) {
+            // Destroy via the C API declared in the public header.
+            INFINICORE_CHECK_ERROR(infiniopDestroyLogCumSumExpDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x, int axis, bool exclusive, bool reverse) {
+    // 1. Build the hash key. It must cover the tensor metadata as well as
+    //    the op-specific parameters (axis, exclusive, reverse).
+    size_t seed = hash_combine(y, x, axis, exclusive, reverse);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    // 2. Fetch the cache instance for the current device.
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopLogCumSumExpDescriptor_t desc = nullptr;
+
+    // 3. On a cache miss, create a new descriptor.
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateLogCumSumExpDescriptor(
+            context::getInfiniopHandle(y->device()),
+            &desc,
+            y->desc(), // output tensor descriptor
+            x->desc(), // input tensor descriptor
+            axis,
+            exclusive,
+            reverse));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 4. Query the required workspace size and allocate it.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetLogCumSumExpWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // 5. Run the operator.
+    INFINICORE_CHECK_ERROR(infiniopLogCumSumExp(
+        desc,
+        workspace->data(),
+        workspace_size,
+        y->data(),
+        x->data(),
+        context::getStream()));
+}
+
+// 6. Self-registration with the LogCumSumExp dispatcher.
+static bool registered = []() {
+    LogCumSumExp::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::logcumsumexp_impl::infiniop
\ No newline at end of file
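Reviewer note: the thread-local `OpCache` above amounts to an LRU map keyed by a hash of tensor metadata plus the op's scalar parameters. In Python terms (a sketch; `make_descriptor` stands in for `infiniopCreateLogCumSumExpDescriptor`):

```python
from functools import lru_cache

@lru_cache(maxsize=100)  # mirrors the cache capacity of 100 above
def make_descriptor(shape, strides, dtype, axis, exclusive, reverse):
    return ("descriptor", shape, strides, dtype, axis, exclusive, reverse)

d1 = make_descriptor((2, 3), (3, 1), "f32", 1, False, False)
d2 = make_descriptor((2, 3), (3, 1), "f32", 1, False, False)
assert d1 is d2  # second call is a cache hit; the descriptor is reused
```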
diff --git a/src/infinicore/ops/logical_and/logical_and.cc b/src/infinicore/ops/logical_and/logical_and.cc
new file mode 100644
index 000000000..39618ae89
--- /dev/null
+++ b/src/infinicore/ops/logical_and/logical_and.cc
@@ -0,0 +1,37 @@
+#include "infinicore/ops/logical_and.hpp"
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<LogicalAnd::schema> &LogicalAnd::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void LogicalAnd::execute(Tensor output, Tensor input1, Tensor input2) {
+    infinicore::context::setDevice(input1->device());
+
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+
+    if (func == nullptr) {
+        throw std::runtime_error("No LogicalAnd implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input1, input2);
+}
+
+Tensor logical_and(Tensor input1, Tensor input2) {
+    Shape shape = input1->shape();
+    auto output = Tensor::empty(shape, input1->dtype(), input1->device());
+
+    logical_and_(output, input1, input2);
+    return output;
+}
+
+void logical_and_(Tensor output, Tensor input1, Tensor input2) {
+    LogicalAnd::execute(output, input1, input2);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/logical_and/logical_and_cpu.cc b/src/infinicore/ops/logical_and/logical_and_cpu.cc
new file mode 100644
index 000000000..c305173a0
--- /dev/null
+++ b/src/infinicore/ops/logical_and/logical_and_cpu.cc
@@ -0,0 +1,128 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/logical_and.hpp"
+#include "infinicore/tensor.hpp"
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::logical_and_impl::cpu {
+
+void calculate(Tensor output, Tensor input1, Tensor input2) {
+    auto ndim = output->ndim();
+    auto numel = output->numel();
+    auto shapes = output->shape();
+
+    auto strides1 = input1->strides();
+    auto strides2 = input2->strides();
+    auto out_strides = output->strides();
+
+    auto dtype = input1->dtype();
+    auto dtype_size = input1->element_size();
+
+    auto out_dtype = output->dtype();
+    auto out_dtype_size = output->element_size();
+
+    // data() is treated as a byte pointer; cast explicitly so that the
+    // per-element offset arithmetic below is well defined.
+    auto input1_base = reinterpret_cast<const uint8_t *>(input1->data());
+    auto input2_base = reinterpret_cast<const uint8_t *>(input2->data());
+    auto output_base = reinterpret_cast<uint8_t *>(output->data());
+
+    std::vector<size_t> indices(ndim, 0);
+
+    for (size_t idx = 0; idx < numel; ++idx) {
+        // Strides are element strides and may be negative, so accumulate
+        // the offsets in a signed type.
+        ptrdiff_t offset1 = 0;
+        ptrdiff_t offset2 = 0;
+        ptrdiff_t out_offset = 0;
+
+        for (size_t dim = 0; dim < ndim; ++dim) {
+            offset1 += static_cast<ptrdiff_t>(indices[dim]) * strides1[dim];
+            offset2 += static_cast<ptrdiff_t>(indices[dim]) * strides2[dim];
+            out_offset += static_cast<ptrdiff_t>(indices[dim]) * out_strides[dim];
+        }
+
+        const uint8_t *in1 = input1_base + offset1 * static_cast<ptrdiff_t>(dtype_size);
+        const uint8_t *in2 = input2_base + offset2 * static_cast<ptrdiff_t>(dtype_size);
+
+        bool result = false;
+
+        // ==========================================
+        // INPUT TYPE DISPATCH
+        // ==========================================
+
+        // 1. Floating-point types
+        if (dtype == DataType::F32) {
+            auto *p1 = reinterpret_cast<const float *>(in1);
+            auto *p2 = reinterpret_cast<const float *>(in2);
+            result = (*p1 != 0.0f) && (*p2 != 0.0f);
+        } else if (dtype == DataType::F64) {
+            auto *p1 = reinterpret_cast<const double *>(in1);
+            auto *p2 = reinterpret_cast<const double *>(in2);
+            result = (*p1 != 0.0) && (*p2 != 0.0);
+        } else if (dtype == DataType::F16) {
+            auto *p1 = reinterpret_cast<const fp16_t *>(in1);
+            auto *p2 = reinterpret_cast<const fp16_t *>(in2);
+            float v1 = utils::cast<float>(*p1);
+            float v2 = utils::cast<float>(*p2);
+            result = (v1 != 0.0f) && (v2 != 0.0f);
+
+            // 2. Bool and 8-bit integers
+        } else if (dtype == DataType::BOOL || dtype == DataType::U8) {
+            auto *p1 = reinterpret_cast<const uint8_t *>(in1);
+            auto *p2 = reinterpret_cast<const uint8_t *>(in2);
+            result = (*p1 != 0) && (*p2 != 0);
+
+            // 3. 32-bit integers (fixes the failing int32 tests)
+        } else if (dtype == DataType::I32 || dtype == DataType::U32) {
+            // The nonzero test is identical for I32 and U32, so reading
+            // through int32_t is fine for both.
+            auto *p1 = reinterpret_cast<const int32_t *>(in1);
+            auto *p2 = reinterpret_cast<const int32_t *>(in2);
+            result = (*p1 != 0) && (*p2 != 0);
+
+            // 4. 64-bit integers
+        } else if (dtype == DataType::I64 || dtype == DataType::U64) {
+            auto *p1 = reinterpret_cast<const int64_t *>(in1);
+            auto *p2 = reinterpret_cast<const int64_t *>(in2);
+            result = (*p1 != 0) && (*p2 != 0);
+
+            // 5. 16-bit integers
+        } else if (dtype == DataType::I16 || dtype == DataType::U16) {
+            auto *p1 = reinterpret_cast<const int16_t *>(in1);
+            auto *p2 = reinterpret_cast<const int16_t *>(in2);
+            result = (*p1 != 0) && (*p2 != 0);
+
+        } else {
+            // I8 could be folded into the U8 path if needed; anything else
+            // is genuinely unsupported.
+            throw std::runtime_error("Unsupported data type for logical_and operation.");
+        }
+
+        // ==========================================
+        // OUTPUT TYPE DISPATCH
+        // ==========================================
+        uint8_t *out = output_base + out_offset * static_cast<ptrdiff_t>(out_dtype_size);
+        if (out_dtype == DataType::BOOL || out_dtype == DataType::U8) {
+            *reinterpret_cast<uint8_t *>(out) = result ? 1 : 0;
+        } else if (out_dtype == DataType::F32) {
+            *reinterpret_cast<float *>(out) = result ? 1.0f : 0.0f;
+        } else if (out_dtype == DataType::I32) {
+            *reinterpret_cast<int32_t *>(out) = result ? 1 : 0;
+        } else {
+            // Failing loudly beats silently leaving the output untouched.
+            throw std::runtime_error("Unsupported output data type for logical_and operation.");
+        }
+
+        // --- advance the multi-dimensional index ---
+        for (ptrdiff_t dim = static_cast<ptrdiff_t>(ndim) - 1; dim >= 0; --dim) {
+            indices[dim]++;
+            if (indices[dim] < shapes[dim]) {
+                break;
+            } else {
+                indices[dim] = 0;
+            }
+        }
+    }
+}
+
+static bool registered = []() {
+    LogicalAnd::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::logical_and_impl::cpu
\ No newline at end of file
diff --git a/src/infinicore/ops/logical_not/logical_not.cc b/src/infinicore/ops/logical_not/logical_not.cc
new file mode 100644
index 000000000..c75899739
--- /dev/null
+++ b/src/infinicore/ops/logical_not/logical_not.cc
@@ -0,0 +1,38 @@
+#include "infinicore/ops/logical_not.hpp"
+#include <stdexcept>
+
+namespace infinicore::op {
+
+common::OpDispatcher<LogicalNot::schema> &
+LogicalNot::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+void LogicalNot::execute(Tensor output, Tensor input) {
+    infinicore::context::setDevice(input->device());
+
+    auto device_type = context::getDevice().getType();
+    auto func = dispatcher().lookup(device_type);
+    if (func == nullptr) {
+        throw std::runtime_error(
+            "No LogicalNot implementation found for device type: "
+            + std::to_string(static_cast<int>(device_type)));
+    }
+
+    func(output, input);
+}
+
+Tensor logical_not(Tensor input) {
+    auto output =
+        Tensor::empty(input->shape(), input->dtype(), input->device());
+    logical_not_(output, input);
+    return output;
+}
+
+void logical_not_(Tensor output, Tensor input) {
+    LogicalNot::execute(output, input);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/logical_not/logical_not_cpu.cc b/src/infinicore/ops/logical_not/logical_not_cpu.cc
new file mode 100644
index 000000000..0b1e5e5cf
--- /dev/null
+++ b/src/infinicore/ops/logical_not/logical_not_cpu.cc
@@ -0,0 +1,169 @@
+#include "../../../utils.h"
+#include "infinicore/device.hpp"
+#include "infinicore/ops/logical_not.hpp"
+#include "infinicore/tensor.hpp"
+
+#include <cstddef>
+#include <cstdint>
+#include <cstring> // memcpy
+#include <stdexcept>
+#include <vector>
+
+namespace infinicore::op::logical_not_impl::cpu {
+
+// ---------- safe load/store (avoid strict-aliasing & alignment UB) ----------
+template <typename T>
+static inline T load_scalar(const uint8_t *p) {
+    T v;
+    std::memcpy(&v, p, sizeof(T));
+    return v;
+}
+
+template <typename T>
+static inline void store_scalar(uint8_t *p, T v) {
+    std::memcpy(p, &v, sizeof(T));
+}
+
+// ---------- read "truthiness": treat nonzero as true ----------
+static inline bool read_truth(const uint8_t *base, ptrdiff_t elem_off, DataType dtype, size_t elem_size) {
+    const uint8_t *p = base + elem_off * static_cast<ptrdiff_t>(elem_size);
+
+    switch (dtype) {
+    case DataType::BOOL:
+    case DataType::U8:
+        return load_scalar<uint8_t>(p) != 0;
+
+    case DataType::I16:
+        return load_scalar<int16_t>(p) != 0;
+    case DataType::U16:
+        return load_scalar<uint16_t>(p) != 0;
+
+    case DataType::I32:
+        return load_scalar<int32_t>(p) != 0;
+    case DataType::U32:
+        return load_scalar<uint32_t>(p) != 0;
+
+    case DataType::I64:
+        return load_scalar<int64_t>(p) != 0;
+    case DataType::U64:
+        return load_scalar<uint64_t>(p) != 0;
+
+    case DataType::F32:
+        return load_scalar<float>(p) != 0.0f;
+    case DataType::F64:
+        return load_scalar<double>(p) != 0.0;
+
+    case DataType::F16: {
+        fp16_t h = load_scalar<fp16_t>(p);
+        float fv = utils::cast<float>(h);
+        return fv != 0.0f;
+    }
+
+    default:
+        throw std::runtime_error("logical_not(cpu): unsupported input dtype.");
+    }
+}
+
+// ---------- write result (bool -> out_dtype as 0/1) ----------
+static inline void write_bool(uint8_t *base, ptrdiff_t elem_off, DataType out_dtype, size_t out_elem_size, bool b) {
+    uint8_t *p = base + elem_off * static_cast<ptrdiff_t>(out_elem_size);
+
+    switch (out_dtype) {
+    case DataType::BOOL:
+    case DataType::U8:
+        store_scalar<uint8_t>(p, b ? 1 : 0);
+        return;
+
+    case DataType::I16:
+        store_scalar<int16_t>(p, b ? 1 : 0);
+        return;
+    case DataType::U16:
+        store_scalar<uint16_t>(p, b ? 1 : 0);
+        return;
+
+    case DataType::I32:
+        store_scalar<int32_t>(p, b ? 1 : 0);
+        return;
+    case DataType::U32:
+        store_scalar<uint32_t>(p, b ? 1u : 0u);
+        return;
+
+    case DataType::I64:
+        store_scalar<int64_t>(p, b ? 1 : 0);
+        return;
+    case DataType::U64:
+        store_scalar<uint64_t>(p, b ? 1ull : 0ull);
+        return;
+
+    case DataType::F32:
+        store_scalar<float>(p, b ? 1.0f : 0.0f);
+        return;
+    case DataType::F64:
+        store_scalar<double>(p, b ? 1.0 : 0.0);
+        return;
+
+    case DataType::F16: {
+        fp16_t h = utils::cast<fp16_t>(b ? 1.0f : 0.0f);
+        store_scalar<fp16_t>(p, h);
+        return;
+    }
+
+    default:
+        throw std::runtime_error("logical_not(cpu): unsupported output dtype.");
+    }
+}
+
+void calculate(Tensor output, Tensor input) {
+    const auto ndim = output->ndim();
+    if (input->ndim() != ndim) {
+        throw std::runtime_error("logical_not(cpu): input/output ndim mismatch.");
+    }
+
+    const auto numel = output->numel();
+    const auto shape = output->shape();
+
+    // IMPORTANT: strides in TensorImpl are "element strides"
+    // (see calculate_contiguous_strides in tensor.cc).
+    const auto in_strides = input->strides();
+    const auto out_strides = output->strides();
+
+    const auto in_dtype = input->dtype();
+    const auto out_dtype = output->dtype();
+
+    const auto in_es = input->element_size();
+    const auto out_es = output->element_size();
+
+    const uint8_t *in_base = reinterpret_cast<const uint8_t *>(input->data());
+    uint8_t *out_base = reinterpret_cast<uint8_t *>(output->data());
+
+    std::vector<size_t> idx(ndim, 0);
+
+    for (size_t linear = 0; linear < numel; ++linear) {
+        // Use ptrdiff_t all the way; strides must NOT be cast to size_t.
+        ptrdiff_t in_elem_off = 0;
+        ptrdiff_t out_elem_off = 0;
+
+        for (size_t d = 0; d < ndim; ++d) {
+            in_elem_off += static_cast<ptrdiff_t>(idx[d]) * in_strides[d];
+            out_elem_off += static_cast<ptrdiff_t>(idx[d]) * out_strides[d];
+        }
+
+        // logical_not: result == (input == 0) == !truthiness
+        const bool truth = read_truth(in_base, in_elem_off, in_dtype, in_es);
+        const bool result = !truth;
+
+        write_bool(out_base, out_elem_off, out_dtype, out_es, result);
+
+        // increment the multi-dimensional index
+        for (ptrdiff_t d = static_cast<ptrdiff_t>(ndim) - 1; d >= 0; --d) {
+            idx[static_cast<size_t>(d)]++;
+            if (idx[static_cast<size_t>(d)] < shape[static_cast<size_t>(d)]) break;
+            idx[static_cast<size_t>(d)] = 0;
+        }
+    }
+}
+
+static bool registered = []() {
+    LogicalNot::dispatcher().registerDevice(Device::Type::CPU, &calculate);
+    return true;
+}();
+
+} // namespace infinicore::op::logical_not_impl::cpu
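Reviewer note: both CPU kernels above walk arbitrary-stride tensors with the same "odometer" pattern: increment the last index, carry into earlier dimensions, and recompute the element offset from the strides. The traversal in isolation:

```python
def strided_offsets(shape, strides):
    """Yield element offsets in logical (row-major) order for any stride layout."""
    idx = [0] * len(shape)
    total = 1
    for s in shape:
        total *= s
    for _ in range(total):
        yield sum(i * s for i, s in zip(idx, strides))
        for d in reversed(range(len(shape))):  # the "odometer" carry
            idx[d] += 1
            if idx[d] < shape[d]:
                break
            idx[d] = 0

# A transposed 2x3 view has element strides (1, 2):
print(list(strided_offsets((2, 3), (1, 2))))  # [0, 2, 4, 1, 3, 5]
```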
diff --git a/src/infinicore/ops/unfold/unfold.cc b/src/infinicore/ops/unfold/unfold.cc
new file mode 100644
index 000000000..b14fcbcfe
--- /dev/null
+++ b/src/infinicore/ops/unfold/unfold.cc
@@ -0,0 +1,73 @@
+#include "infinicore/ops/unfold.hpp"
+
+namespace infinicore::op {
+
+// 1. Dispatcher singleton.
+common::OpDispatcher<Unfold::schema> &Unfold::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+// 2. Execute: dispatch to the implementation registered for the current device.
+void Unfold::execute(Tensor output, Tensor input,
+                     const std::vector<int64_t> &kernel_sizes,
+                     const std::vector<int64_t> &dilations,
+                     const std::vector<int64_t> &paddings,
+                     const std::vector<int64_t> &strides) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, kernel_sizes, dilations, paddings, strides);
+}
+
+// 3. Functional interface (see the shape-formula check after this file).
+Tensor unfold(Tensor input,
+              std::vector<int64_t> kernel_sizes,
+              std::vector<int64_t> dilations,
+              std::vector<int64_t> paddings,
+              std::vector<int64_t> strides) {
+
+    // Basic dimension bookkeeping.
+    const auto &input_shape = input->shape();
+    int64_t n_dim = input->ndim();
+    int64_t spatial_dims = n_dim - 2; // (N, C, D1, D2, ...): spatial dims start at index 2
+    int64_t N = input_shape[0];
+    int64_t C = input_shape[1];
+
+    // Output dim 1: C * kernel_sizes[0] * kernel_sizes[1] * ...
+    int64_t output_dim1 = C;
+    for (auto k : kernel_sizes) {
+        output_dim1 *= k;
+    }
+
+    int64_t L = 1;
+    for (int i = 0; i < spatial_dims; ++i) {
+        int64_t input_dim = input_shape[i + 2];
+        int64_t k = kernel_sizes[i];
+        int64_t p = paddings[i];
+        int64_t d = dilations[i];
+        int64_t s = strides[i];
+
+        // out = floor((in + 2*p - d*(k-1) - 1) / s) + 1
+        int64_t output_spatial = (input_dim + 2 * p - d * (k - 1) - 1) / s + 1;
+        L *= output_spatial;
+    }
+    Shape output_shape = {
+        static_cast<size_t>(N),
+        static_cast<size_t>(output_dim1),
+        static_cast<size_t>(L)};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    unfold_(output, input, kernel_sizes, dilations, paddings, strides);
+    return output;
+}
+
+// 4. Explicit-output interface.
+void unfold_(Tensor output, Tensor input,
+             std::vector<int64_t> kernel_sizes,
+             std::vector<int64_t> dilations,
+             std::vector<int64_t> paddings,
+             std::vector<int64_t> strides) {
+    Unfold::execute(output, input, kernel_sizes, dilations, paddings, strides);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
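Reviewer note: the per-dimension output length used in `unfold` above is the standard convolution formula. A quick numeric check:

```python
def out_len(in_len, k, p=0, d=1, s=1):
    # floor((in + 2*p - d*(k-1) - 1) / s) + 1
    return (in_len + 2 * p - d * (k - 1) - 1) // s + 1

# 4x4 input, 2x2 kernel, stride 2: 2 positions per spatial dim, so L = 2 * 2 = 4.
assert out_len(4, k=2, s=2) == 2
assert out_len(4, k=3, p=1, d=2) == 2  # dilation widens the effective kernel to 5
```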
diff --git a/src/infinicore/ops/unfold/unfold_infiniop.cc b/src/infinicore/ops/unfold/unfold_infiniop.cc
new file mode 100644
index 000000000..b75aadf53
--- /dev/null
+++ b/src/infinicore/ops/unfold/unfold_infiniop.cc
@@ -0,0 +1,101 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/unfold.hpp"
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+namespace infinicore::op::unfold_impl::infiniop {
+
+// Descriptor cache.
+thread_local common::OpCache<size_t, infiniopUnfoldDescriptor_t> caches(
+    100, // capacity
+    [](infiniopUnfoldDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyUnfoldDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input,
+               const std::vector<int64_t> &kernel_sizes,
+               const std::vector<int64_t> &dilations,
+               const std::vector<int64_t> &paddings,
+               const std::vector<int64_t> &strides) {
+
+    // 1. Build the hash key. Tensors have a hash_combine overload, but the
+    //    vector parameters are hashed element by element because hash.hpp
+    //    does not support std::vector directly.
+    size_t seed = 0;
+
+    hash_combine(seed, output);
+    hash_combine(seed, input);
+
+    for (auto v : kernel_sizes) hash_combine(seed, v);
+    for (auto v : dilations) hash_combine(seed, v);
+    for (auto v : paddings) hash_combine(seed, v);
+    for (auto v : strides) hash_combine(seed, v);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopUnfoldDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // 2. Create the descriptor.
+
+        // Convert the int64_t vectors to int vectors to match the C API's
+        // `const int *` parameters.
+        auto to_int_vec = [](const std::vector<int64_t> &src) {
+            std::vector<int> dst(src.size());
+            std::transform(src.begin(), src.end(), dst.begin(),
+                           [](int64_t val) { return static_cast<int>(val); });
+            return dst;
+        };
+
+        std::vector<int> k_int = to_int_vec(kernel_sizes);
+        std::vector<int> s_int = to_int_vec(strides);
+        std::vector<int> p_int = to_int_vec(paddings);
+        std::vector<int> d_int = to_int_vec(dilations);
+
+        INFINICORE_CHECK_ERROR(infiniopCreateUnfoldDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            k_int.data(),
+            s_int.data(),
+            p_int.data(),
+            d_int.data()));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 3. Query the workspace and run.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetUnfoldWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopUnfold(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        context::getStream()));
+}
+
+// 4. Register this implementation.
+static bool registered = []() {
+    Unfold::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::unfold_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/ops/vander/vander.cc b/src/infinicore/ops/vander/vander.cc
new file mode 100644
index 000000000..5f3d8dd29
--- /dev/null
+++ b/src/infinicore/ops/vander/vander.cc
@@ -0,0 +1,36 @@
+#include "infinicore/ops/vander.hpp"
+
+namespace infinicore::op {
+
+// 1. Dispatcher singleton.
+common::OpDispatcher<Vander::schema> &Vander::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+}
+
+// 2. Execute: dispatch to the implementation for the current device.
+void Vander::execute(Tensor output, Tensor input, int64_t N, bool increasing) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, N, increasing);
+}
+
+// 3. Functional interface.
+Tensor vander(Tensor input, int64_t N, bool increasing) {
+    int64_t input_size = input->shape()[0];
+    int64_t cols = (N > 0) ? N : input_size; // N == 0 means a square matrix
+    Shape output_shape = {
+        static_cast<size_t>(input_size),
+        static_cast<size_t>(cols)};
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    vander_(output, input, N, increasing);
+    return output;
+}
+
+// 4. Explicit-output interface.
+void vander_(Tensor output, Tensor input, int64_t N, bool increasing) {
+    Vander::execute(output, input, N, increasing);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/vander/vander_infiniop.cc b/src/infinicore/ops/vander/vander_infiniop.cc
new file mode 100644
index 000000000..f0ab7b30f
--- /dev/null
+++ b/src/infinicore/ops/vander/vander_infiniop.cc
@@ -0,0 +1,70 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/vander.hpp"
+#include <cstdint>
+
+namespace infinicore::op::vander_impl::infiniop {
+
+// Descriptor cache.
+thread_local common::OpCache<size_t, infiniopVanderDescriptor_t> caches(
+    100, // capacity
+    [](infiniopVanderDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyVanderDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, int64_t N, bool increasing) {
+    // 1. Build the hash key directly from the tensors and the scalar
+    //    parameters N and increasing.
+    size_t seed = hash_combine(output, input, N, increasing);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopVanderDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // 2. Create the descriptor. N is narrowed from int64_t to int and
+        //    increasing from bool to int to match the C API signature.
+        INFINICORE_CHECK_ERROR(infiniopCreateVanderDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            static_cast<int>(N),
+            static_cast<int>(increasing)));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 3. Query the workspace and run.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetVanderWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopVander(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        context::getStream()));
+}
+
+// 4. Register this implementation.
+static bool registered = []() {
+    Vander::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::vander_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 431c3a37b..1eeb46ef4 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -18,7 +18,11 @@
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
-
+#include "ops/logcumsumexp.hpp"
+#include "ops/logical_and.hpp"
+#include "ops/logical_not.hpp"
+#include "ops/unfold.hpp"
+#include "ops/vander.hpp"
 namespace py = pybind11;
 
 namespace infinicore::ops {
@@ -38,6 +42,11 @@
     bind_rms_norm(m);
     bind_silu(m);
     bind_swiglu(m);
+    bind_logcumsumexp(m);
+    bind_logical_and(m);
+    bind_logical_not(m);
+    bind_unfold(m);
+    bind_vander(m);
     bind_rope(m);
     bind_embedding(m);
 }
diff --git a/src/infinicore/pybind11/ops/logcumsumexp.hpp b/src/infinicore/pybind11/ops/logcumsumexp.hpp
new file mode 100644
index 000000000..3359cfe56
--- /dev/null
+++ b/src/infinicore/pybind11/ops/logcumsumexp.hpp
@@ -0,0 +1,31 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/logcumsumexp.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_logcumsumexp(py::module &m) {
+    // Out-of-place binding (returns a new Tensor).
+    m.def("logcumsumexp",
+          &op::logcumsumexp,
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("exclusive") = false,
+          py::arg("reverse") = false,
+          R"doc(Computes the logarithm of the cumulative summation of the exponentiation of elements.)doc");
+
+    // Explicit-output binding.
+    m.def("logcumsumexp_",
+          &op::logcumsumexp_,
+          py::arg("out"),
+          py::arg("input"),
+          py::arg("dim"),
+          py::arg("exclusive") = false,
+          py::arg("reverse") = false,
+          R"doc(Explicit-output version of logcumsumexp.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
py::arg("other"), + R"doc(Computes the element-wise logical AND of the given input tensors.)doc"); + + // 绑定底层输出指定函数: logical_and_(output, input, other) + // 对应 Python 调用: _infinicore.logical_and_(out, input, other) + m.def("logical_and_", + &op::logical_and_, + py::arg("output"), + py::arg("input"), + py::arg("other"), + R"doc(Explicit output logical AND computation.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/logical_not.hpp b/src/infinicore/pybind11/ops/logical_not.hpp new file mode 100644 index 000000000..a400dfea6 --- /dev/null +++ b/src/infinicore/pybind11/ops/logical_not.hpp @@ -0,0 +1,26 @@ +#pragma once + +#include +#include "infinicore/ops/logical_not.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_logical_not(py::module &m) { + // Out-of-place: output = logical_not(input) + m.def("logical_not", + &op::logical_not, + py::arg("input"), + R"doc(Logical NOT of the tensor.)doc"); + + // In-place / Explicit Output: logical_not_(output, input) + // 对应 C++: void logical_not_(Tensor output, Tensor input) + m.def("logical_not_", + &op::logical_not_, + py::arg("output"), + py::arg("input"), + R"doc(In-place logical NOT computation.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/unfold.hpp b/src/infinicore/pybind11/ops/unfold.hpp new file mode 100644 index 000000000..43c1aa374 --- /dev/null +++ b/src/infinicore/pybind11/ops/unfold.hpp @@ -0,0 +1,59 @@ +#pragma once + +#include +#include +#include "infinicore/ops/unfold.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_unfold(py::module &m) { + // ------------------------------------------------------------------------- + // 1. 绑定函数式接口 (unfold) + // ------------------------------------------------------------------------- + m.def("unfold", + [](const Tensor& input, + std::vector kernel_sizes, + std::vector dilations, + std::vector paddings, + std::vector strides) { + return op::unfold(input, kernel_sizes, dilations, paddings, strides); + }, + py::arg("input"), + py::arg("kernel_sizes"), + py::arg("dilations"), + py::arg("paddings"), + py::arg("strides"), + R"doc(Extracts sliding local blocks from a batched input tensor. + + Args: + input (Tensor): The input tensor. + kernel_sizes (List[int]): The size of the sliding blocks. + dilations (List[int]): The parameter that controls the stride of elements within the neighborhood. + paddings (List[int]): Implicit zero padding to be added on both sides of input. + strides (List[int]): The stride of the sliding blocks. + )doc"); + + // ------------------------------------------------------------------------- + // 2. 绑定 in-place 接口 (unfold_) + // ------------------------------------------------------------------------- + m.def("unfold_", + [](Tensor& output, + const Tensor& input, + std::vector kernel_sizes, + std::vector dilations, + std::vector paddings, + std::vector strides) { + op::unfold_(output, input, kernel_sizes, dilations, paddings, strides); + }, + py::arg("output"), + py::arg("input"), + py::arg("kernel_sizes"), + py::arg("dilations"), + py::arg("paddings"), + py::arg("strides"), + R"doc(Explicit output Unfold operation. 
Writes the result into the output tensor.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infinicore/pybind11/ops/vander.hpp b/src/infinicore/pybind11/ops/vander.hpp new file mode 100644 index 000000000..bfbb1acf0 --- /dev/null +++ b/src/infinicore/pybind11/ops/vander.hpp @@ -0,0 +1,40 @@ +#pragma once + +#include +#include "infinicore/ops/vander.hpp" + +namespace py = pybind11; + +namespace infinicore::ops { + +inline void bind_vander(py::module &m) { + m.def("vander", + [](const Tensor& input, int64_t N, bool increasing) { + return op::vander(input, N, increasing); + }, + py::arg("input"), + py::arg("N") = 0, + py::arg("increasing") = false, + R"doc(Generates a Vandermonde matrix. + + Args: + input (Tensor): 1-D input tensor. + N (int, optional): Number of columns in the output. If 0, defaults to input size (square matrix). Default: 0. + increasing (bool, optional): Order of the powers. If True, powers increase (x^0, x^1...). Default: False. + )doc"); + + // ------------------------------------------------------------------------- + // 2. 绑定 in-place 接口 (vander_) + // ------------------------------------------------------------------------- + m.def("vander_", + [](Tensor& output, const Tensor& input, int64_t N, bool increasing) { + op::vander_(output, input, N, increasing); + }, + py::arg("output"), + py::arg("input"), + py::arg("N") = 0, + py::arg("increasing") = false, + R"doc(Explicit output Vander operation. Writes the result into the output tensor.)doc"); +} + +} // namespace infinicore::ops \ No newline at end of file diff --git a/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.cc b/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.cc new file mode 100644 index 000000000..fb208d4f2 --- /dev/null +++ b/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.cc @@ -0,0 +1,145 @@ +#include "logcumsumexp_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include +#include + +#include "../../../../utils/custom_types.h" + +namespace op::logcumsumexp::cpu { + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc, + int axis, + int exclusive, + int reverse) { + + auto handle = reinterpret_cast(handle_); + + // 注意:这里复用了你之前修改过的 Info 类,它现在包含正确的 stride 信息 + auto result = LogCumSumExpInfo::create(y_desc, x_desc, axis, exclusive, reverse); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + new Opaque(), + result.take(), + 0, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +template +void calculate_cpu_impl( + const LogCumSumExpInfo &info, + void *y, + const void *x) { + + size_t outer_size = info.outer_size(); + size_t axis_size = info.axis_size(); + size_t inner_size = info.inner_size(); + bool exclusive = info.exclusive(); + bool reverse = info.reverse(); + + // 获取 Info 中存储的物理 stride + size_t x_outer_stride = info._x_outer_stride; + size_t x_axis_stride = info._x_axis_stride; + size_t x_inner_stride = info._x_inner_stride; + + size_t y_outer_stride = info._y_outer_stride; + size_t y_axis_stride = info._y_axis_stride; + size_t y_inner_stride = info._y_inner_stride; + + auto y_ptr = reinterpret_cast(y); + auto x_ptr = reinterpret_cast(x); + + #pragma omp parallel for collapse(2) schedule(static) + for (size_t i = 0; i < outer_size; ++i) { + for (size_t 
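Reviewer note: the CPU loop above (and the GPU kernels below) maintain the pair (running max m, rescaled sum s) with the invariant m + log(s) == logsumexp of everything seen so far. A self-check against the naive formula:

```python
import math

def streaming_lse(values):
    m, s, out = float("-inf"), 0.0, []
    for v in values:
        if v > m:
            s = s * math.exp(m - v) + 1.0  # rescale the old sum to the new max
            m = v
        else:
            s += math.exp(v - m)
        out.append(m + math.log(s))
    return out

vals = [0.5, -1.0, 3.0, 2.0]
naive = [math.log(sum(math.exp(v) for v in vals[: i + 1])) for i in range(len(vals))]
assert all(abs(a - b) < 1e-12 for a, b in zip(streaming_lse(vals), naive))
```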
diff --git a/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.h b/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.h
new file mode 100644
index 000000000..ce1b2f2f1
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/cpu/logcumsumexp_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __LOGCUMSUMEXP_CPU_H__
+#define __LOGCUMSUMEXP_CPU_H__
+
+#include "../logcumsumexp.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __LOGCUMSUMEXP_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/cuda/kernel.cuh b/src/infiniop/ops/logcumsumexp/cuda/kernel.cuh
new file mode 100644
index 000000000..2bf93f27f
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/cuda/kernel.cuh
@@ -0,0 +1,124 @@
+#ifndef __LOGCUMSUMEXP_CUDA_CUH__
+#define __LOGCUMSUMEXP_CUDA_CUH__
+
+#include <cmath>
+
+#if defined ENABLE_METAX_API
+    // NOTE: the MetaX header names are reconstructed; the flattened patch lost them.
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+#include <cstddef>
+#include <cstdint>
+
+namespace op::logcumsumexp::cuda {
+
+// ============================================================
+// Numerically stable LogSumExp prefix state,
+// equivalent to log(sum(exp(x[0:i]))).
+// ============================================================
+
+struct LSEState {
+    float m; // running max
+    float s; // sum(exp(x - m))
+
+    // Mathematical identity element: log(0) = -inf
+    __device__ __forceinline__ static LSEState identity() {
+        return {-INFINITY, 0.0f};
+    }
+
+    // Prefix update.
+    __device__ __forceinline__ void update(float v) {
+        if (m == -INFINITY) {
+            // first element
+            m = v;
+            s = 1.0f;
+        } else if (v > m) {
+            // the max changed; rescale the accumulated sum
+            s = s * expf(m - v) + 1.0f;
+            m = v;
+        } else {
+            s += expf(v - m);
+        }
+    }
+
+    // Current log-sum-exp value.
+    __device__ __forceinline__ float value() const {
+        return (s == 0.0f) ? -INFINITY : (m + logf(s));
+    }
+};
+
+// ============================================================
+// Kernel: one thread owns one (outer, inner) prefix vector.
+// ============================================================
+
+template <typename T>
+__global__ void logcumsumexp_kernel(
+    T *__restrict__ y,
+    const T *__restrict__ x,
+
+    size_t outer_size,
+    size_t axis_size,
+    size_t inner_size,
+
+    size_t x_axis_stride,
+    size_t x_inner_stride,
+    size_t x_outer_stride,
+
+    size_t y_axis_stride,
+    size_t y_inner_stride,
+    size_t y_outer_stride,
+
+    bool exclusive,
+    bool reverse) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t num_vec = outer_size * inner_size;
+    if (tid >= num_vec) return;
+
+    size_t o = tid / inner_size;
+    size_t i = tid % inner_size;
+
+    // base offsets (stride-correct)
+    size_t x_base = o * x_outer_stride + i * x_inner_stride;
+    size_t y_base = o * y_outer_stride + i * y_inner_stride;
+
+    LSEState state = LSEState::identity();
+
+    for (size_t k = 0; k < axis_size; ++k) {
+        size_t kk = reverse ? (axis_size - 1 - k) : k;
+
+        size_t x_off = x_base + kk * x_axis_stride;
+        size_t y_off = y_base + kk * y_axis_stride;
+
+        float v = static_cast<float>(x[x_off]);
+
+        if (exclusive) {
+            // y[i] = log(sum(exp(x[:i])))
+            y[y_off] = static_cast<T>(state.value());
+            state.update(v);
+        } else {
+            // y[i] = log(sum(exp(x[:i+1])))
+            state.update(v);
+            y[y_off] = static_cast<T>(state.value());
+        }
+
+        // ===== debugging aid (enable when needed) =====
+        /*
+        if (o == 0 && i == 0 && k < 5) {
+            printf("[CUDA] k=%zu v=%f m=%f s=%f out=%f\n",
+                   k, v, state.m, state.s,
+                   static_cast<float>(y[y_off]));
+        }
+        */
+    }
+}
+
+} // namespace op::logcumsumexp::cuda
+
+#endif // __LOGCUMSUMEXP_CUDA_CUH__
diff --git a/src/infiniop/ops/logcumsumexp/info.h b/src/infiniop/ops/logcumsumexp/info.h
new file mode 100644
index 000000000..4a220f524
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/info.h
@@ -0,0 +1,152 @@
+#ifndef __LOGCUMSUMEXP_INFO_H__
+#define __LOGCUMSUMEXP_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::logcumsumexp {
+
+class LogCumSumExpInfo {
+    LogCumSumExpInfo() = default;
+
+public:
+    int _dtype;
+    int _axis;
+    bool _exclusive;
+    bool _reverse;
+
+    size_t _outer_size;
+    size_t _axis_size;
+    size_t _inner_size;
+
+    size_t _x_axis_stride;
+    size_t _x_inner_stride;
+    size_t _x_outer_stride;
+
+    size_t _y_axis_stride;
+    size_t _y_inner_stride;
+    size_t _y_outer_stride;
+
+    int dtype() const { return _dtype; }
+    int axis() const { return _axis; }
+    bool exclusive() const { return _exclusive; }
+    bool reverse() const { return _reverse; }
+    size_t outer_size() const { return _outer_size; }
+    size_t axis_size() const { return _axis_size; }
+    size_t inner_size() const { return _inner_size; }
+
+    LogCumSumExpInfo(
+        int dtype,
+        int axis,
+        bool exclusive,
+        bool reverse,
+        size_t outer,
+        size_t axis_len,
+        size_t inner,
+        size_t x_as,
+        size_t x_is,
+        size_t x_os,
+        size_t y_as,
+        size_t y_is,
+        size_t y_os)
+        : _dtype(dtype),
+          _axis(axis),
+          _exclusive(exclusive),
+          _reverse(reverse),
+          _outer_size(outer),
+          _axis_size(axis_len),
+          _inner_size(inner),
+          _x_axis_stride(x_as),
+          _x_inner_stride(x_is),
+          _x_outer_stride(x_os),
+          _y_axis_stride(y_as),
+          _y_inner_stride(y_is),
+          _y_outer_stride(y_os) {}
+
+    static utils::Result<LogCumSumExpInfo> create(
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x_desc,
+        int axis,
+        int exclusive,
+        int reverse) {
+
+        if (y_desc->ndim() != x_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        size_t ndim = x_desc->ndim();
+        for (size_t i = 0; i < ndim; ++i) {
+            if (y_desc->shape()[i] != x_desc->shape()[i]) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        if (y_desc->dtype() != x_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+
+        if (axis < 0 || static_cast<size_t>(axis) >= ndim) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+
+        // ================================
+        // Logical dimension sizes
+        // ================================
+        size_t outer = 1;
+        for (size_t i = 0; i < static_cast<size_t>(axis); ++i) {
+            outer *= x_desc->shape()[i];
+        }
+
+        size_t axis_len = x_desc->shape()[axis];
+
+        size_t inner = 1;
+        for (size_t i = static_cast<size_t>(axis) + 1; i < ndim; ++i) {
+            inner *= x_desc->shape()[i];
+        }
+
+        // ================================
+        // Physical stride computation (fixed)
+        // ================================
+
+        // 1. Axis stride
+        size_t x_axis_stride = static_cast<size_t>(x_desc->stride(axis));
+        size_t y_axis_stride = static_cast<size_t>(y_desc->stride(axis));
+
+        // 2. Inner stride
+        // [Key fix]
+        // The kernels treat the inner dims as one flattened 1-D range
+        // (0 .. inner-1). For a contiguous tensor that range is dense in
+        // memory, so the step between consecutive inner elements is exactly 1
+        // no matter how many logical dimensions it spans. The previous code
+        // used stride(axis + 1), which skipped data whenever the inner part
+        // covered more than one dimension.
+        size_t x_inner_stride = 1;
+        size_t y_inner_stride = 1;
+
+        // 3. Outer stride
+        // With axis == 0 there is a single outer block, so the stride is set
+        // to 0. Otherwise the stride of the dimension just before `axis`
+        // spans one whole axis+inner block.
+        size_t x_outer_stride = (axis == 0) ? 0 : static_cast<size_t>(x_desc->stride(axis - 1));
+        size_t y_outer_stride = (axis == 0) ? 0 : static_cast<size_t>(y_desc->stride(axis - 1));
+
+        return utils::Result<LogCumSumExpInfo>(LogCumSumExpInfo{
+            x_desc->dtype(),
+            axis,
+            static_cast<bool>(exclusive),
+            static_cast<bool>(reverse),
+            outer,
+            axis_len,
+            inner,
+            x_axis_stride,
+            x_inner_stride,
+            x_outer_stride,
+            y_axis_stride,
+            y_inner_stride,
+            y_outer_stride});
+    }
+};
+
+} // namespace op::logcumsumexp
+
+#endif // __LOGCUMSUMEXP_INFO_H__
\ No newline at end of file
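Reviewer note: `LogCumSumExpInfo` collapses the tensor into (outer, axis, inner) blocks; for a contiguous tensor the element offset factors as `o * stride(axis-1) + k * stride(axis) + i`, with the flattened inner index stepping by 1. Verified numerically:

```python
import numpy as np

shape, axis = (2, 3, 4, 5), 1
x = np.arange(np.prod(shape)).reshape(shape)
strides = [s // x.itemsize for s in x.strides]  # element strides: [60, 20, 5, 1]

o, k, i = 1, 2, 7  # inner index 7 flattens the trailing (4, 5) dims to (1, 2)
off = o * strides[axis - 1] + k * strides[axis] + i
assert x.flat[off] == x[1, 2, 1, 2]
```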
diff --git a/src/infiniop/ops/logcumsumexp/logcumsumexp.h b/src/infiniop/ops/logcumsumexp/logcumsumexp.h
new file mode 100644
index 000000000..51f1a01aa
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/logcumsumexp.h
@@ -0,0 +1,50 @@
+#ifndef __LOGCUMSUMEXP_H__
+#define __LOGCUMSUMEXP_H__
+
+#include "../../operator.h"
+#include "info.h" // LogCumSumExpInfo
+
+// Generates the Descriptor class for a given backend namespace
+// (e.g. cpu, nvidia, metax).
+#define DESCRIPTOR(NAMESPACE)                                         \
+    namespace op::logcumsumexp::NAMESPACE {                           \
+    class Descriptor final : public InfiniopDescriptor {              \
+        struct Opaque;                                                \
+        Opaque *_opaque;        /* backend-private implementation */  \
+        LogCumSumExpInfo _info; /* validated tensor/op metadata */    \
+        size_t _workspace_size; /* cached workspace requirement */    \
+                                                                      \
+        Descriptor(                                                   \
+            Opaque *opaque,                                           \
+            LogCumSumExpInfo info,                                    \
+            size_t workspace_size,                                    \
+            infiniDevice_t device_type,                               \
+            int device_id)                                            \
+            : InfiniopDescriptor{device_type, device_id},             \
+              _opaque(opaque),                                        \
+              _info(info),                                            \
+              _workspace_size(workspace_size) {}                      \
+                                                                      \
+    public:                                                           \
+        ~Descriptor();                                                \
+                                                                      \
+        size_t workspaceSize() const { return _workspace_size; }      \
+                                                                      \
+        static infiniStatus_t create(                                 \
+            infiniopHandle_t handle,                                  \
+            Descriptor **desc_ptr,                                    \
+            infiniopTensorDescriptor_t y_desc,                        \
+            infiniopTensorDescriptor_t x_desc,                        \
+            int axis,                                                 \
+            int exclusive,                                            \
+            int reverse);                                             \
+                                                                      \
+        infiniStatus_t calculate(                                     \
+            void *workspace,                                          \
+            size_t workspace_size,                                    \
+            void *y,                                                  \
+            const void *x,                                            \
+            void *stream) const;                                      \
+    };                                                                \
+    }
+
+#endif // __LOGCUMSUMEXP_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h
new file mode 100644
index 000000000..a681f402f
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h
@@ -0,0 +1,8 @@
+#ifndef __LOGCUMSUMEXP_METAX_API_H__
+#define __LOGCUMSUMEXP_METAX_API_H__
+
+#include "../logcumsumexp.h"
+
+DESCRIPTOR(metax)
+
+#endif // __LOGCUMSUMEXP_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h
new file mode 100644
index 000000000..a681f402f
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.h
@@ -0,0 +1,8 @@
+#ifndef __LOGCUMSUMEXP_METAX_API_H__
+#define __LOGCUMSUMEXP_METAX_API_H__
+
+#include "../logcumsumexp.h"
+
+DESCRIPTOR(metax)
+
+#endif // __LOGCUMSUMEXP_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.maca b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.maca
new file mode 100644
index 000000000..6927f5623
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/metax/logcumsumexp_metax.maca
@@ -0,0 +1,209 @@
+#include "logcumsumexp_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <maca_fp16.h>
+#include <maca_bfloat16.h>
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+
+namespace op::logcumsumexp::metax {
+
+
+struct LSEState {
+    float m; // running max
+    float s; // sum(exp(x - m))
+
+    // Mathematical identity element: log(0) = -inf
+    __device__ __forceinline__ static LSEState identity() {
+        return { -INFINITY, 0.0f };
+    }
+
+    // Prefix update rule
+    __device__ __forceinline__ void update(float v) {
+        if (m == -INFINITY) {
+            // first valid value
+            m = v;
+            s = 1.0f;
+        } else if (v > m) {
+            // a new max appeared; rescale the previous sum:
+            // s_new = s_old * exp(old_m - new_v) + 1
+            s = s * expf(m - v) + 1.0f;
+            m = v;
+        } else {
+            // accumulate one more term
+            s += expf(v - m);
+        }
+    }
+
+    // Current log-sum-exp value: m + log(s)
+    __device__ __forceinline__ float value() const {
+        return (s == 0.0f) ? -INFINITY : (m + logf(s));
+    }
+};
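Note (illustrative, not part of the patch): the rescaling in LSEState::update keeps every exp() argument non-positive, so the running sum never overflows even when the raw exp would. A host-side spot check of the recurrence against the closed form:

    // For x = (1, 100, -3, 100):
    //   log(e^1 + e^100 + e^-3 + e^100) = 100 + log(e^-99 + 2 + e^-103) ≈ 100.6931
    // (a naive float sum of exp() would overflow at e^100)
    #include <cmath>
    #include <cstdio>
    int main() {
        const float xs[4] = {1.0f, 100.0f, -3.0f, 100.0f};
        float m = -INFINITY, s = 0.0f; // same state variables as LSEState
        for (float v : xs) {
            if (m == -INFINITY) { m = v; s = 1.0f; }
            else if (v > m)     { s = s * std::exp(m - v) + 1.0f; m = v; }
            else                { s += std::exp(v - m); }
        }
        std::printf("%f\n", m + std::log(s)); // prints ~100.693147
        return 0;
    }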
+
+
+template <typename T>
+__global__ void logcumsumexp_kernel(
+    T* __restrict__ y,
+    const T* __restrict__ x,
+
+    size_t outer_size,
+    size_t axis_size,
+    size_t inner_size,
+
+    size_t x_axis_stride,
+    size_t x_inner_stride,
+    size_t x_outer_stride,
+
+    size_t y_axis_stride,
+    size_t y_inner_stride,
+    size_t y_outer_stride,
+
+    bool exclusive,
+    bool reverse
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t num_vec = outer_size * inner_size;
+    if (tid >= num_vec) return;
+
+    // Current outer (o) and inner (i) indices
+    size_t o = tid / inner_size;
+    size_t i = tid % inner_size;
+
+    // Base offsets
+    size_t x_base = o * x_outer_stride + i * x_inner_stride;
+    size_t y_base = o * y_outer_stride + i * y_inner_stride;
+
+    LSEState state = LSEState::identity();
+
+    // Scan along the axis dimension
+    for (size_t k = 0; k < axis_size; ++k) {
+        // reverse: accumulate from the back toward the front
+        size_t kk = reverse ? (axis_size - 1 - k) : k;
+
+        size_t x_off = x_base + kk * x_axis_stride;
+        size_t y_off = y_base + kk * y_axis_stride;
+
+        // Compute uniformly in float
+        float v = static_cast<float>(x[x_off]);
+
+        if (exclusive) {
+            // Exclusive: write the current state first, then update.
+            // y[0] = log(0) = -inf
+            y[y_off] = static_cast<T>(state.value());
+            state.update(v);
+        } else {
+            // Inclusive: update first, then write.
+            // y[0] = x[0]
+            state.update(v);
+            y[y_off] = static_cast<T>(state.value());
+        }
+    }
+}
+
+
+template <typename T>
+void launch_kernel(
+    void* y,
+    const void* x,
+    const LogCumSumExpInfo& info,
+    void* stream) {
+
+    auto x_ptr = reinterpret_cast<const T*>(x);
+    auto y_ptr = reinterpret_cast<T*>(y);
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    size_t outer = info.outer_size();
+    size_t axis = info.axis_size();
+    size_t inner = info.inner_size();
+
+    size_t total = outer * inner;
+    constexpr size_t block = 256;
+    size_t grid = (total + block - 1) / block;
+
+    logcumsumexp_kernel<T>
+        <<<grid, block, 0, mc_stream>>>(
+            y_ptr,
+            x_ptr,
+            outer,
+            axis,
+            inner,
+
+            info._x_axis_stride,
+            info._x_inner_stride,
+            info._x_outer_stride,
+
+            info._y_axis_stride,
+            info._y_inner_stride,
+            info._y_outer_stride,
+
+            info.exclusive(),
+            info.reverse()
+        );
+}
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor** desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int axis,
+    int exclusive,
+    int reverse) {
+
+    auto handle = reinterpret_cast<device::metax::Handle*>(handle_);
+
+    auto info_result =
+        LogCumSumExpInfo::create(y_desc, x_desc, axis, exclusive, reverse);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        /*workspace*/ 0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void* workspace,
+    size_t workspace_size,
+    void* y,
+    const void* x,
+    void* stream) const {
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<__half>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+        launch_kernel<__maca_bfloat16>(y, x, _info, stream);
+#endif
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(y, x, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::logcumsumexp::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.h b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.h
new file mode 100644
index 000000000..1d9c9699f
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.h
@@ -0,0 +1,6 @@
+#ifndef __LOGCUMSUMEXP_MOORE_API_H__
+#define __LOGCUMSUMEXP_MOORE_API_H__
+#include "../logcumsumexp.h"
+DESCRIPTOR(moore)
+
+#endif // __LOGCUMSUMEXP_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.mu b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.mu
new file mode 100644
index 000000000..4be0cfa78
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore.mu
@@ -0,0 +1,118 @@
+#include "logcumsumexp_moore.h"
+#include "logcumsumexp_moore_kernel.h"
+#include "../../../devices/moore/moore_handle.h"
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+
+namespace op::logcumsumexp::moore {
+
+// ============================================================
+// Kernel Launch
+// ============================================================
+
+template <typename T>
+void launch_kernel(
+    void* y,
+    const void* x,
+    const LogCumSumExpInfo& info,
+    void* stream) {
+
+    auto x_ptr = reinterpret_cast<const T*>(x);
+    auto y_ptr = reinterpret_cast<T*>(y);
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    size_t outer = info.outer_size();
+    size_t axis = info.axis_size();
+    size_t inner = info.inner_size();
+
+    size_t total = outer * inner;
+    constexpr size_t block = 256;
+    size_t grid = (total + block - 1) / block;
+
+    op::logcumsumexp::moore::logcumsumexp_kernel<T>
+        <<<grid, block, 0, musa_stream>>>(
+            y_ptr,
+            x_ptr,
+            outer,
+            axis,
+            inner,
+
+            info._x_axis_stride,
+            info._x_inner_stride,
+            info._x_outer_stride, // added: outer stride
+
+            info._y_axis_stride,
+            info._y_inner_stride,
+            info._y_outer_stride, // added: outer stride
+
+            info.exclusive(),
+            info.reverse()
+        );
+}
+
+// ============================================================
+// Descriptor
+// ============================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor** desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int axis,
+    int exclusive,
+    int reverse) {
+
+    auto handle = reinterpret_cast<device::moore::Handle*>(handle_);
+
+    auto info_result =
+        LogCumSumExpInfo::create(y_desc, x_desc, axis, exclusive, reverse);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        /*workspace*/ 0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void* workspace,
+    size_t workspace_size,
+    void* y,
+    const void* x,
+    void* stream) const {
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        // The Moore architecture typically uses __mt_bfloat16
+        launch_kernel<__mt_bfloat16>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(y, x, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::logcumsumexp::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore_kernel.h b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore_kernel.h
new file mode 100644
index 000000000..086a67493
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/moore/logcumsumexp_moore_kernel.h
@@ -0,0 +1,110 @@
+#ifndef __LOGCUMSUMEXP_MOORE_KERNEL_H__
+#define __LOGCUMSUMEXP_MOORE_KERNEL_H__
+
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cmath>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace op::logcumsumexp::moore {
+
+// ============================================================
+// Numerically stable LogSumExp prefix state
+// Equivalent to: log(sum(exp(x[0:i])))
+// ============================================================
+
+struct LSEState {
+    float m; // running max
+    float s; // sum(exp(x - m))
+
+    // Mathematical identity element: log(0) = -inf
+    __device__ __forceinline__ static LSEState identity() {
+        return { -INFINITY, 0.0f };
+    }
+
+    // Prefix update
+    __device__ __forceinline__ void update(float v) {
+        if (m == -INFINITY) {
+            // first element
+            m = v;
+            s = 1.0f;
+        } else if (v > m) {
+            // the max changed; rescale
+            s = s * expf(m - v) + 1.0f;
+            m = v;
+        } else {
+            s += expf(v - m);
+        }
+    }
+
+    // Current log-sum-exp value
+    __device__ __forceinline__ float value() const {
+        return (s == 0.0f) ? -INFINITY : (m + logf(s));
+    }
+};
+
+// ============================================================
+// Kernel: one thread handles one (outer, inner) prefix vector
+// ============================================================
+
+template <typename T>
+__global__ void logcumsumexp_kernel(
+    T* __restrict__ y,
+    const T* __restrict__ x,
+
+    size_t outer_size,
+    size_t axis_size,
+    size_t inner_size,
+
+    size_t x_axis_stride,
+    size_t x_inner_stride,
+    size_t x_outer_stride,
+
+    size_t y_axis_stride,
+    size_t y_inner_stride,
+    size_t y_outer_stride,
+
+    bool exclusive,
+    bool reverse
+) {
+    size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t num_vec = outer_size * inner_size;
+    if (tid >= num_vec) return;
+
+    size_t o = tid / inner_size;
+    size_t i = tid % inner_size;
+
+    // base offsets (stride-aware)
+    size_t x_base = o * x_outer_stride + i * x_inner_stride;
+    size_t y_base = o * y_outer_stride + i * y_inner_stride;
+
+    LSEState state = LSEState::identity();
+
+    for (size_t k = 0; k < axis_size; ++k) {
+        size_t kk = reverse ? (axis_size - 1 - k) : k;
+
+        size_t x_off = x_base + kk * x_axis_stride;
+        size_t y_off = y_base + kk * y_axis_stride;
+
+        // Note: static_cast turns half/__mt_bfloat16 into float here. The MUSA
+        // compiler generally supports this conversion; if a particular type does
+        // not, replace it with __half2float(val) or the matching conversion intrinsic.
+        float v = static_cast<float>(x[x_off]);
+
+        if (exclusive) {
+            // y[i] = log(sum(exp(x[:i])))
+            y[y_off] = static_cast<T>(state.value());
+            state.update(v);
+        } else {
+            // y[i] = log(sum(exp(x[:i+1])))
+            state.update(v);
+            y[y_off] = static_cast<T>(state.value());
+        }
+    }
+}
+
+} // namespace op::logcumsumexp::moore
+
+#endif // __LOGCUMSUMEXP_MOORE_KERNEL_H__
\ No newline at end of file
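Note (illustrative): for a length-2 input x = (x0, x1) along the scan axis, the modes above produce

    inclusive:            y = (x0, log(exp(x0) + exp(x1)))
    exclusive:            y = (-inf, x0)
    reverse (inclusive):  y = (log(exp(x0) + exp(x1)), x1)

which matches the write-then-update vs update-then-write ordering in the kernel loop.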
diff --git a/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cu b/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cu
new file mode 100644
index 000000000..967be0bf0
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cu
@@ -0,0 +1,113 @@
+#include "logcumsumexp_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cuda_runtime.h>
+
+namespace op::logcumsumexp::nvidia {
+
+// ============================================================
+// Kernel Launch
+// ============================================================
+
+template <typename T>
+void launch_kernel(
+    void* y,
+    const void* x,
+    const LogCumSumExpInfo& info,
+    void* stream) {
+
+    auto x_ptr = reinterpret_cast<const T*>(x);
+    auto y_ptr = reinterpret_cast<T*>(y);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    size_t outer = info.outer_size();
+    size_t axis = info.axis_size();
+    size_t inner = info.inner_size();
+
+    size_t total = outer * inner;
+    constexpr size_t block = 256;
+    size_t grid = (total + block - 1) / block;
+
+    op::logcumsumexp::cuda::logcumsumexp_kernel<T>
+        <<<grid, block, 0, cuda_stream>>>(
+            y_ptr,
+            x_ptr,
+            outer,
+            axis,
+            inner,
+
+            info._x_axis_stride,
+            info._x_inner_stride,
+            info._x_outer_stride, // added: outer stride
+
+            info._y_axis_stride,
+            info._y_inner_stride,
+            info._y_outer_stride, // added: outer stride
+
+            info.exclusive(),
+            info.reverse()
+        );
+}
+
+// ============================================================
+// Descriptor
+// ============================================================
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor** desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    int axis,
+    int exclusive,
+    int reverse) {
+
+    auto info_result =
+        LogCumSumExpInfo::create(y_desc, x_desc, axis, exclusive, reverse);
+    if (!info_result) return info_result.status();
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        /*workspace*/ 0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void* workspace,
+    size_t workspace_size,
+    void* y,
+    const void* x,
+    void* stream) const {
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<nv_bfloat16>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(y, x, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(y, x, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::logcumsumexp::nvidia
diff --git a/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cuh b/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cuh
new file mode 100644
index 000000000..0032a0621
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/nvidia/logcumsumexp_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __LOGCUMSUMEXP_NVIDIA_CUH__
+#define __LOGCUMSUMEXP_NVIDIA_CUH__
+
+#include "../logcumsumexp.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __LOGCUMSUMEXP_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/logcumsumexp/operator.cc b/src/infiniop/ops/logcumsumexp/operator.cc
new file mode 100644
index 000000000..07379676a
--- /dev/null
+++ b/src/infiniop/ops/logcumsumexp/operator.cc
@@ -0,0 +1,183 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/logcumsumexp.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/logcumsumexp_cpu.h"
+#endif
+
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/logcumsumexp_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/logcumsumexp_metax.h"
+#endif
+
+#ifdef ENABLE_MOORE_API
+#include "moore/logcumsumexp_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateLogCumSumExpDescriptor(
+    infiniopHandle_t handle,
+    infiniopLogCumSumExpDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y,
+    infiniopTensorDescriptor_t x,
+    int axis,
+    int exclusive,
+    int reverse) {
+
+    #define CREATE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                      \
+        return op::logcumsumexp::NAMESPACE::Descriptor::create(                     \
+            handle,                                                                 \
+            reinterpret_cast<op::logcumsumexp::NAMESPACE::Descriptor **>(desc_ptr), \
+            y,                                                                      \
+            x,                                                                      \
+            axis,                                                                   \
+            exclusive,                                                              \
+            reverse)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetLogCumSumExpWorkspaceSize(infiniopLogCumSumExpDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE)                                                                    \
+    case CASE:                                                                                      \
+        *size = reinterpret_cast<op::logcumsumexp::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Run the computation
+// =======================================================================
+__C infiniStatus_t infiniopLogCumSumExp(
+    infiniopLogCumSumExpDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                         \
+        return reinterpret_cast<const op::logcumsumexp::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, x, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+// =======================================================================
+// 4. Destroy the descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyLogCumSumExpDescriptor(infiniopLogCumSumExpDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE)                                              \
+    case CASE:                                                                   \
+        delete reinterpret_cast<op::logcumsumexp::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
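Note (sketch, not part of the patch): the intended C-API call sequence for this operator. It assumes an already-created handle, tensor descriptors y_desc/x_desc, device buffers d_y/d_x, and a backend stream; all error-status checks are elided and the wrapper function name is hypothetical:

    void run_logcumsumexp(infiniopHandle_t handle,
                          infiniopTensorDescriptor_t y_desc,
                          infiniopTensorDescriptor_t x_desc,
                          void *d_y, const void *d_x, void *stream) {
        infiniopLogCumSumExpDescriptor_t desc;
        infiniopCreateLogCumSumExpDescriptor(handle, &desc, y_desc, x_desc,
                                             /*axis=*/1, /*exclusive=*/0, /*reverse=*/0);
        size_t ws_size = 0;
        infiniopGetLogCumSumExpWorkspaceSize(desc, &ws_size); // 0 for all backends in this patch
        infiniopLogCumSumExp(desc, /*workspace=*/nullptr, ws_size, d_y, d_x, stream);
        infiniopDestroyLogCumSumExpDescriptor(desc);
    }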
diff --git a/src/infiniop/ops/unfold/cpu/unfold_cpu.cc b/src/infiniop/ops/unfold/cpu/unfold_cpu.cc
new file mode 100644
index 000000000..53708fb76
--- /dev/null
+++ b/src/infiniop/ops/unfold/cpu/unfold_cpu.cc
@@ -0,0 +1,158 @@
+#include "unfold_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cmath>
+#include <cstdint>
+#include <vector>
+
+#include "../../../../utils/custom_types.h"
+
+namespace op::unfold::cpu {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    // Call the static create method from UnfoldInfo
+    auto result = UnfoldInfo::infer(out_desc, input_desc, kernel_sizes, strides, paddings, dilations);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0, // No workspace needed for CPU
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Core Calculation Implementation
+template <typename T>
+void calculate_cpu_impl(
+    const UnfoldInfo &info,
+    void *output,
+    const void *input) {
+
+    // 1. Retrieve parameters from vectors
+    // Note: This implementation assumes 2D spatial dimensions (NCHW).
+    // If the vector size != 2, this logic needs adaptation.
+    if (info._kernel_sizes.size() < 2) return;
+
+    int64_t batch = info._N;
+    int64_t in_c = info._C_in;
+
+    // Spatial Input Dimensions
+    int64_t in_h = info._input_spatial_shape[0];
+    int64_t in_w = info._input_spatial_shape[1];
+
+    // Spatial Output Dimensions
+    int64_t out_h = info._output_spatial_shape[0];
+    int64_t out_w = info._output_spatial_shape[1];
+
+    // Kernel / Stride / Pad / Dilation
+    int64_t k_h = info._kernel_sizes[0];
+    int64_t k_w = info._kernel_sizes[1];
+    int64_t stride_h = info._strides[0];
+    int64_t stride_w = info._strides[1];
+    int64_t pad_h = info._paddings[0];
+    int64_t pad_w = info._paddings[1];
+    int64_t dil_h = info._dilations[0];
+    int64_t dil_w = info._dilations[1];
+
+    // 2. Prepare pointers
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+
+    // 3. Helper variables
+    int64_t L = info._L;
+    int64_t out_c_dim = info._C_out;
+
+    // 4. Parallel Execution
+    // Collapsing Batch and Input Channel dimensions
+    #pragma omp parallel for collapse(2) schedule(static)
+    for (int64_t n = 0; n < batch; ++n) {
+        for (int64_t c = 0; c < in_c; ++c) {
+
+            int64_t in_batch_offset = n * in_c * in_h * in_w;
+            int64_t out_batch_offset = n * out_c_dim * L;
+
+            // Loop over kernel window (Unfolding to channel dimension)
+            for (int64_t kh = 0; kh < k_h; ++kh) {
+                for (int64_t kw = 0; kw < k_w; ++kw) {
+
+                    int64_t out_c_idx = c * k_h * k_w + kh * k_w + kw;
+
+                    // Loop over output spatial locations (Flattened L)
+                    for (int64_t oh = 0; oh < out_h; ++oh) {
+                        for (int64_t ow = 0; ow < out_w; ++ow) {
+
+                            // Mapping Logic
+                            int64_t h_in = oh * stride_h - pad_h + kh * dil_h;
+                            int64_t w_in = ow * stride_w - pad_w + kw * dil_w;
+
+                            int64_t out_idx = out_batch_offset + out_c_idx * L + (oh * out_w + ow);
+
+                            // Boundary Check & Assignment
+                            if (h_in >= 0 && h_in < in_h && w_in >= 0 && w_in < in_w) {
+                                int64_t in_idx = in_batch_offset + c * in_h * in_w + h_in * in_w + w_in;
+                                out_ptr[out_idx] = in_ptr[in_idx];
+                            } else {
+                                // Padding with zero
+                                out_ptr[out_idx] = utils::cast<T>(0.0f);
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype_val();
+
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::unfold::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/cpu/unfold_cpu.h b/src/infiniop/ops/unfold/cpu/unfold_cpu.h
new file mode 100644
index 000000000..0f37e3bc3
--- /dev/null
+++ b/src/infiniop/ops/unfold/cpu/unfold_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __UNFOLD_CPU_H__
+#define __UNFOLD_CPU_H__
+
+#include "../unfold.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __UNFOLD_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/cuda/kernel.cuh b/src/infiniop/ops/unfold/cuda/kernel.cuh
new file mode 100644
index 000000000..ca9315ea8
--- /dev/null
+++ b/src/infiniop/ops/unfold/cuda/kernel.cuh
@@ -0,0 +1,76 @@
+#ifndef __UNFOLD_CUDA_CUH__
+#define __UNFOLD_CUDA_CUH__
+
+#include <cstddef>
+#if defined ENABLE_METAX_API
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+    using nv_bfloat162 = __maca_bfloat162;
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+#include <cstdint>
+
+namespace op::unfold::cuda {
+template <typename T>
+__global__ void unfold_kernel(
+    T * __restrict__ output,      // [N, C_out, L]
+    const T * __restrict__ input, // [N, C, H, W]
+    // dimension parameters
+    int C, int H, int W,          // input dims
+    int out_h, int out_w,         // output spatial dims
+    // operator parameters
+    int k_h, int k_w,             // kernel size
+    int pad_h, int pad_w,         // padding
+    int stride_h, int stride_w,   // stride
+    int dil_h, int dil_w,         // dilation
+    // total work
+    size_t total_elements) {
+
+    // Flat indexing: each thread produces one output element
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < total_elements) {
+        // --------------------------------------------------------
+        // 1. Reverse mapping: recover the logical coordinates from the linear idx.
+        //    The output is logically [N, (C * kH * kW), (out_h * out_w)].
+        // --------------------------------------------------------
+        int L = out_h * out_w;
+        int kernel_area = k_h * k_w;
+        int C_col = C * kernel_area; // output (column) channel count
+        int l_idx = idx % L;
+        size_t temp = idx / L;
+        int c_col_idx = temp % C_col;
+        int n_idx = temp / C_col;
+
+        // spatial coordinates (h_out, w_out)
+        int w_out = l_idx % out_w;
+        int h_out = l_idx / out_w;
+
+        // channel coordinate -> (c_in, kh, kw)
+        int kw = c_col_idx % k_w;
+        int temp_k = c_col_idx / k_w;
+        int kh = temp_k % k_h;
+        int c_in = temp_k / k_h;
+        int h_in = h_out * stride_h - pad_h + kh * dil_h;
+        int w_in = w_out * stride_w - pad_w + kw * dil_w;
+        T val;
+
+        if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W) {
+            // input linear index: [n, c, h, w]
+            size_t in_idx = ((static_cast<size_t>(n_idx) * C + c_in) * H + h_in) * W + w_in;
+            val = input[in_idx];
+        } else {
+            // padding region: fill with 0
+            val = static_cast<T>(0.0f);
+        }
+
+        output[idx] = val;
+    }
+}
+
+} // namespace op::unfold::cuda
+
+#endif // __UNFOLD_CUDA_CUH__
\ No newline at end of file
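Note (illustrative): a worked instance of the reverse mapping above, with C = 3, k_h = k_w = 2, out_h = out_w = 4, so L = 16 and C_col = 12. For idx = 100:

    // l_idx     = 100 % 16 = 4  ->  (h_out, w_out) = (1, 0)
    // temp      = 100 / 16 = 6
    // c_col_idx = 6 % 12   = 6  ->  kw = 6 % 2 = 0, kh = (6 / 2) % 2 = 1, c_in = (6 / 2) / 2 = 1
    // n_idx     = 6 / 12   = 0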
diff --git a/src/infiniop/ops/unfold/info.h b/src/infiniop/ops/unfold/info.h
new file mode 100644
index 000000000..59bbe6712
--- /dev/null
+++ b/src/infiniop/ops/unfold/info.h
@@ -0,0 +1,133 @@
+#ifndef __OPS_UNFOLD_INFO_H__
+#define __OPS_UNFOLD_INFO_H__
+
+// referenced via the same relative paths as the other ops
+#include "../../../utils.h"
+#include "../../tensor.h"
+
+#include <vector>
+#include <cstdint>
+
+namespace op::unfold {
+
+struct UnfoldInfo {
+public:
+    int _dtype;
+
+    // spatial parameters
+    std::vector<int> _kernel_sizes;
+    std::vector<int> _strides;
+    std::vector<int> _paddings;
+    std::vector<int> _dilations;
+
+    // cached shape info
+    size_t _N;     // Batch
+    size_t _C_in;  // Input Channels
+    size_t _C_out; // Output Channels
+    size_t _L;     // Output Spatial Length
+
+    // cached input spatial dims (H_in, W_in)
+    std::vector<int64_t> _input_spatial_shape;
+    // cached output spatial dims (H_out, W_out)
+    std::vector<int64_t> _output_spatial_shape;
+
+    // the default constructor must be public
+    UnfoldInfo() = default;
+
+    // Constructor
+    UnfoldInfo(int dtype,
+               std::vector<int> kernel_sizes,
+               std::vector<int> strides,
+               std::vector<int> paddings,
+               std::vector<int> dilations,
+               size_t N, size_t C_in, size_t C_out, size_t L,
+               std::vector<int64_t> in_spatial,
+               std::vector<int64_t> out_spatial)
+        : _dtype(dtype),
+          _kernel_sizes(std::move(kernel_sizes)),
+          _strides(std::move(strides)),
+          _paddings(std::move(paddings)),
+          _dilations(std::move(dilations)),
+          _N(N), _C_in(C_in), _C_out(C_out), _L(L),
+          _input_spatial_shape(std::move(in_spatial)),
+          _output_spatial_shape(std::move(out_spatial)) {}
+
+    // Getters
+    int dtype_val() const { return _dtype; }
+
+    // matches the call sites in the .cc files
+    static utils::Result<UnfoldInfo> infer(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t input_desc,
+        const int *kernel_sizes,
+        const int *strides,
+        const int *paddings,
+        const int *dilations) {
+
+        // 1. Check the rank (assumes ndim() is a method)
+        if (input_desc->ndim() < 3) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        int ndim = input_desc->ndim();
+        int spatial_dims = ndim - 2;
+
+        // 2. Read parameters
+        size_t N = input_desc->shape()[0];
+        size_t C_in = input_desc->shape()[1];
+
+        std::vector<int> k_vec(kernel_sizes, kernel_sizes + spatial_dims);
+        std::vector<int> s_vec(strides, strides + spatial_dims);
+        std::vector<int> p_vec(paddings, paddings + spatial_dims);
+        std::vector<int> d_vec(dilations, dilations + spatial_dims);
+
+        // 3. Compute the shapes
+        size_t kernel_prod = 1;
+        for (int k : k_vec) kernel_prod *= k;
+        size_t C_out = C_in * kernel_prod;
+
+        size_t L = 1;
+        std::vector<int64_t> in_spatial;
+        std::vector<int64_t> out_spatial;
+
+        for (int i = 0; i < spatial_dims; ++i) {
+            int64_t in_dim = input_desc->shape()[i + 2];
+            in_spatial.push_back(in_dim);
+
+            int k = k_vec[i];
+            int s = s_vec[i];
+            int p = p_vec[i];
+            int d = d_vec[i];
+
+            int64_t numerator = in_dim + 2 * p - d * (k - 1) - 1;
+            if (numerator < 0) numerator = -s;
+
+            int64_t out_dim = (numerator / s) + 1;
+            if (out_dim <= 0) return INFINI_STATUS_BAD_PARAM;
+
+            out_spatial.push_back(out_dim);
+            L *= out_dim;
+        }
+
+        // 4. Validate the output descriptor
+        if (out_desc->ndim() == 3) {
+            if (out_desc->shape()[0] != N ||
+                out_desc->shape()[1] != C_out ||
+                out_desc->shape()[2] != L) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+        }
+
+        // Construct the Result explicitly
+        return utils::Result<UnfoldInfo>(UnfoldInfo(
+            input_desc->dtype(),
+            std::move(k_vec), std::move(s_vec), std::move(p_vec), std::move(d_vec),
+            N, C_in, C_out, L,
+            std::move(in_spatial), std::move(out_spatial)
+        ));
+    }
+};
+
+} // namespace op::unfold
+
+#endif // __OPS_UNFOLD_INFO_H__
\ No newline at end of file
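Note (illustrative): a worked instance of the output-size formula in infer(). With in_dim = 5, k = 3, p = 1, d = 1, s = 2:

    numerator = 5 + 2*1 - 1*(3 - 1) - 1 = 4
    out_dim   = 4 / 2 + 1 = 3

which agrees with the usual convolution output size floor((in + 2p - d(k-1) - 1) / s) + 1.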
diff --git a/src/infiniop/ops/unfold/metax/unfold_metax.h b/src/infiniop/ops/unfold/metax/unfold_metax.h
new file mode 100644
index 000000000..07a3aad1a
--- /dev/null
+++ b/src/infiniop/ops/unfold/metax/unfold_metax.h
@@ -0,0 +1,7 @@
+#ifndef __UNFOLD_METAX_API_H__
+#define __UNFOLD_METAX_API_H__
+
+#include "../unfold.h"
+DESCRIPTOR(metax)
+
+#endif // __UNFOLD_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/metax/unfold_metax.maca b/src/infiniop/ops/unfold/metax/unfold_metax.maca
new file mode 100644
index 000000000..01981add3
--- /dev/null
+++ b/src/infiniop/ops/unfold/metax/unfold_metax.maca
@@ -0,0 +1,216 @@
+#include "unfold_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+// ==================================================================
+// 1. MACA type compatibility
+// ==================================================================
+#if defined(__MACA__) || defined(__MACACC__)
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+    using nv_bfloat16 = __maca_bfloat16;
+    using nv_bfloat162 = __maca_bfloat162;
+#endif
+
+namespace op::unfold::metax {
+
+// ==================================================================
+// 2. Kernel implementation
+// ==================================================================
+template <typename T>
+__global__ void unfold_kernel(
+    T * __restrict__ output,      // [N, C_out, L]
+    const T * __restrict__ input, // [N, C, H, W]
+    // dimension parameters
+    int C, int H, int W,          // input dims
+    int out_h, int out_w,         // output spatial dims
+    // operator parameters
+    int k_h, int k_w,             // kernel size
+    int pad_h, int pad_w,         // padding
+    int stride_h, int stride_w,   // stride
+    int dil_h, int dil_w,         // dilation
+    // total work
+    size_t total_elements) {
+
+    // Flat indexing: each thread produces one output element
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < total_elements) {
+        // --------------------------------------------------------
+        // 1. Reverse mapping: recover the logical coordinates from the linear idx.
+        //    The output is logically [N, (C * kH * kW), (out_h * out_w)].
+        // --------------------------------------------------------
+        int L = out_h * out_w;
+        int kernel_area = k_h * k_w;
+        int C_col = C * kernel_area; // output (column) channel count
+
+        // order the div/mod operations to keep them cheap
+        int l_idx = idx % L;
+        size_t temp = idx / L;
+        int c_col_idx = temp % C_col;
+        int n_idx = temp / C_col;
+
+        // spatial coordinates (h_out, w_out)
+        int w_out = l_idx % out_w;
+        int h_out = l_idx / out_w;
+
+        // channel coordinate -> (c_in, kh, kw)
+        int kw = c_col_idx % k_w;
+        int temp_k = c_col_idx / k_w;
+        int kh = temp_k % k_h;
+        int c_in = temp_k / k_h;
+
+        // corresponding input coordinates
+        int h_in = h_out * stride_h - pad_h + kh * dil_h;
+        int w_in = w_out * stride_w - pad_w + kw * dil_w;
+
+        T val;
+
+        // boundary check and assignment
+        if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W) {
+            // input linear index: [n, c, h, w]
+            size_t in_idx = ((static_cast<size_t>(n_idx) * C + c_in) * H + h_in) * W + w_in;
+            val = input[in_idx];
+        } else {
+            // padding region: fill with 0
+            val = static_cast<T>(0.0f);
+        }
+
+        output[idx] = val;
+    }
+}
+
+// ==================================================================
+// 3. Kernel Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const UnfoldInfo& info,
+    void *stream) {
+
+    // 1. Prepare pointers
+    // [fix] corrected a typo here (autoin_ptr -> auto in_ptr)
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    // 2. Unpack parameters from the Info vectors
+    if (info._kernel_sizes.size() < 2) {
+        return;
+    }
+
+    int C = info._C_in;
+    int H = info._input_spatial_shape[0];
+    int W = info._input_spatial_shape[1];
+
+    int out_h = info._output_spatial_shape[0];
+    int out_w = info._output_spatial_shape[1];
+
+    int k_h = info._kernel_sizes[0];
+    int k_w = info._kernel_sizes[1];
+    int pad_h = info._paddings[0];
+    int pad_w = info._paddings[1];
+    int stride_h = info._strides[0];
+    int stride_w = info._strides[1];
+    int dil_h = info._dilations[0];
+    int dil_w = info._dilations[1];
+
+    // 3. Compute the grid
+    size_t out_channels = info._C_out;
+    size_t out_spatial = info._L;
+    size_t total_elements = info._N * out_channels * out_spatial;
+
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // 4. Launch the kernel
+    unfold_kernel<T>
+        <<<grid_size, block_size, 0, mc_stream>>>(
+            out_ptr, in_ptr,
+            C, H, W,
+            out_h, out_w,
+            k_h, k_w,
+            pad_h, pad_w,
+            stride_h, stride_w,
+            dil_h, dil_w,
+            total_elements
+        );
+}
+
+// ==================================================================
+// 4. Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+
+    // 1. Create and validate the Info
+    auto result = UnfoldInfo::infer(out_desc, input_desc, kernel_sizes, strides, paddings, dilations);
+    if (!result) return result.status();
+
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype_val();
+
+    // 3. Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<__half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+#if defined(__MACA__) || defined(__MACACC__)
+        launch_kernel<__maca_bfloat16>(output, input, _info, stream);
+#endif
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::unfold::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/moore/unfold_moore.h b/src/infiniop/ops/unfold/moore/unfold_moore.h
new file mode 100644
index 000000000..a61cf097e
--- /dev/null
+++ b/src/infiniop/ops/unfold/moore/unfold_moore.h
@@ -0,0 +1,8 @@
+#ifndef __UNFOLD_MOORE_H__
+#define __UNFOLD_MOORE_H__
+
+#include "../unfold.h"
+
+DESCRIPTOR(moore)
+
+#endif // __UNFOLD_MOORE_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/moore/unfold_moore.mu b/src/infiniop/ops/unfold/moore/unfold_moore.mu
new file mode 100644
index 000000000..59db55568
--- /dev/null
+++ b/src/infiniop/ops/unfold/moore/unfold_moore.mu
@@ -0,0 +1,140 @@
+#include "unfold_moore.h"
+#include "unfold_moore_kernel.h"
+#include "../../../devices/moore/moore_handle.h"
+#include <musa_runtime.h>
+#include <musa_bf16.h>
+
+namespace op::unfold::moore {
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const UnfoldInfo& info,
+    void *stream) {
+
+    // 1. Prepare pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    // 2. Unpack parameters from the Info vectors
+    // Note: the current kernel only supports 2D spatial input (NCHW); take the first two dims.
+    if (info._kernel_sizes.size() < 2) {
+        // handle as an error case, or simply return
+        return;
+    }
+
+    int C = info._C_in;
+    int H = info._input_spatial_shape[0];
+    int W = info._input_spatial_shape[1];
+
+    int out_h = info._output_spatial_shape[0];
+    int out_w = info._output_spatial_shape[1];
+
+    int k_h = info._kernel_sizes[0];
+    int k_w = info._kernel_sizes[1];
+    int pad_h = info._paddings[0];
+    int pad_w = info._paddings[1];
+    int stride_h = info._strides[0];
+    int stride_w = info._strides[1];
+    int dil_h = info._dilations[0];
+    int dil_w = info._dilations[1];
+
+    // 3. Compute the grid
+    // output channels = C * kH * kW
+    size_t out_channels = info._C_out;
+    size_t out_spatial = info._L;
+    size_t total_elements = info._N * out_channels * out_spatial;
+
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // 4. Launch the Moore kernel
+    op::unfold::moore::unfold_kernel<T>
+        <<<grid_size, block_size, 0, musa_stream>>>(
+            out_ptr, in_ptr,
+            C, H, W,
+            out_h, out_w,
+            k_h, k_w,
+            pad_h, pad_w,
+            stride_h, stride_w,
+            dil_h, dil_w,
+            total_elements
+        );
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    // 1. Create and validate the Info (via the new infer interface)
+    auto result = UnfoldInfo::infer(out_desc, input_desc, kernel_sizes, strides, paddings, dilations);
+    if (!result) return result.status();
+
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    // uses the new getter
+    auto dtype = _info.dtype_val();
+
+    // 3. Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__mt_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::unfold::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/moore/unfold_moore_kernel.h b/src/infiniop/ops/unfold/moore/unfold_moore_kernel.h
new file mode 100644
index 000000000..3f0945f9e
--- /dev/null
+++ b/src/infiniop/ops/unfold/moore/unfold_moore_kernel.h
@@ -0,0 +1,80 @@
+#ifndef __UNFOLD_MOORE_KERNEL_H__
+#define __UNFOLD_MOORE_KERNEL_H__
+
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+namespace op::unfold::moore {
+
+template <typename T>
+__global__ void unfold_kernel(
+    T * __restrict__ output,      // [N, C_out, L]
+    const T * __restrict__ input, // [N, C, H, W]
+    // dimension parameters
+    int C, int H, int W,          // input dims
+    int out_h, int out_w,         // output spatial dims
+    // operator parameters
+    int k_h, int k_w,             // kernel size
+    int pad_h, int pad_w,         // padding
+    int stride_h, int stride_w,   // stride
+    int dil_h, int dil_w,         // dilation
+    // total work
+    size_t total_elements) {
+
+    // Flat indexing: each thread produces one output element
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < total_elements) {
+        // --------------------------------------------------------
+        // 1. Reverse mapping: recover the logical coordinates from the linear idx.
+        //    The output is logically [N, (C * kH * kW), (out_h * out_w)].
+        // --------------------------------------------------------
+        int L = out_h * out_w;
+        int kernel_area = k_h * k_w;
+        int C_col = C * kernel_area; // output (column) channel count
+
+        int l_idx = idx % L;
+        size_t temp = idx / L;
+        int c_col_idx = temp % C_col;
+        int n_idx = temp / C_col;
+
+        // spatial coordinates (h_out, w_out)
+        int w_out = l_idx % out_w;
+        int h_out = l_idx / out_w;
+
+        // channel coordinate -> (c_in, kh, kw)
+        int kw = c_col_idx % k_w;
+        int temp_k = c_col_idx / k_w;
+        int kh = temp_k % k_h;
+        int c_in = temp_k / k_h;
+
+        // coordinates on the input feature map
+        int h_in = h_out * stride_h - pad_h + kh * dil_h;
+        int w_in = w_out * stride_w - pad_w + kw * dil_w;
+
+        T val;
+
+        if (h_in >= 0 && h_in < H && w_in >= 0 && w_in < W) {
+            // input linear index: [n, c, h, w]
+            size_t in_idx = ((static_cast<size_t>(n_idx) * C + c_in) * H + h_in) * W + w_in;
+            val = input[in_idx];
+        } else {
+            if constexpr (std::is_same_v<T, half>) {
+                val = __float2half(0.0f);
+            } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+                val = __float2bfloat16(0.0f);
+            } else {
+                val = static_cast<T>(0.0f);
+            }
+        }
+
+        output[idx] = val;
+    }
+}
+
+} // namespace op::unfold::moore
+
+#endif // __UNFOLD_MOORE_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cu b/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cu
new file mode 100644
index 000000000..b08542a0d
--- /dev/null
+++ b/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cu
@@ -0,0 +1,140 @@
+#include "unfold_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+
+namespace op::unfold::nvidia {
+
+// ==================================================================
+// Kernel Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const UnfoldInfo& info,
+    void *stream) {
+
+    // 1. Prepare pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // 2. Unpack parameters from the Info vectors
+    // Note: the current CUDA kernel only supports 2D spatial input (NCHW); take the first two dims.
+    if (info._kernel_sizes.size() < 2) {
+        // handle as an error case, or simply return
+        return;
+    }
+
+    int C = info._C_in;
+    int H = info._input_spatial_shape[0];
+    int W = info._input_spatial_shape[1];
+
+    int out_h = info._output_spatial_shape[0];
+    int out_w = info._output_spatial_shape[1];
+
+    int k_h = info._kernel_sizes[0];
+    int k_w = info._kernel_sizes[1];
+    int pad_h = info._paddings[0];
+    int pad_w = info._paddings[1];
+    int stride_h = info._strides[0];
+    int stride_w = info._strides[1];
+    int dil_h = info._dilations[0];
+    int dil_w = info._dilations[1];
+
+    // 3. Compute the grid
+    // output channels = C * kH * kW
+    size_t out_channels = info._C_out;
+    size_t out_spatial = info._L;
+    size_t total_elements = info._N * out_channels * out_spatial;
+
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // 4. Launch the CUDA kernel
+    op::unfold::cuda::unfold_kernel<T>
+        <<<grid_size, block_size, 0, cuda_stream>>>(
+            out_ptr, in_ptr,
+            C, H, W,
+            out_h, out_w,
+            k_h, k_w,
+            pad_h, pad_w,
+            stride_h, stride_w,
+            dil_h, dil_w,
+            total_elements
+        );
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+
+    // 1. Create and validate the Info (via the new infer interface)
+    auto result = UnfoldInfo::infer(out_desc, input_desc, kernel_sizes, strides, paddings, dilations);
+    if (!result) return result.status();
+
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    // uses the new getter
+    auto dtype = _info.dtype_val();
+
+    // 3. Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<nv_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::unfold::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cuh b/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cuh
new file mode 100644
index 000000000..7246da912
--- /dev/null
+++ b/src/infiniop/ops/unfold/nvidia/unfold_nvidia.cuh
@@ -0,0 +1,7 @@
+#ifndef __UNFOLD_NVIDIA_CUH__
+#define __UNFOLD_NVIDIA_CUH__
+
+#include "../unfold.h"
+DESCRIPTOR(nvidia)
+
+#endif // __UNFOLD_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/operator.cc b/src/infiniop/ops/unfold/operator.cc
new file mode 100644
index 000000000..d78346016
--- /dev/null
+++ b/src/infiniop/ops/unfold/operator.cc
@@ -0,0 +1,185 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/unfold.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/unfold_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/unfold_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/unfold_metax.h"
+#endif
+
+#ifdef ENABLE_MOORE_API
+#include "moore/unfold_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateUnfoldDescriptor(
+    infiniopHandle_t handle,
+    infiniopUnfoldDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    const int *kernel_sizes,
+    const int *strides,
+    const int *paddings,
+    const int *dilations) {
+
+    // Local macro to keep the multi-backend dispatch concise
+    #define CREATE(CASE, NAMESPACE)                                           \
+    case CASE:                                                                \
+        return op::unfold::NAMESPACE::Descriptor::create(                     \
+            handle,                                                           \
+            reinterpret_cast<op::unfold::NAMESPACE::Descriptor **>(desc_ptr), \
+            output,                                                           \
+            input,                                                            \
+            kernel_sizes,                                                     \
+            strides,                                                          \
+            paddings,                                                         \
+            dilations)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetUnfoldWorkspaceSize(infiniopUnfoldDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE)                                                              \
+    case CASE:                                                                                \
+        *size = reinterpret_cast<op::unfold::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Run the computation (Calculate)
+// =======================================================================
+__C infiniStatus_t infiniopUnfold(
+    infiniopUnfoldDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE)                                           \
+    case CASE:                                                                   \
+        return reinterpret_cast<const op::unfold::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, output, input, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+// =======================================================================
+// 4. Destroy the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyUnfoldDescriptor(infiniopUnfoldDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE)                                        \
+    case CASE:                                                             \
+        delete reinterpret_cast<op::unfold::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/unfold/unfold.h b/src/infiniop/ops/unfold/unfold.h
new file mode 100644
index 000000000..fe9241bbc
--- /dev/null
+++ b/src/infiniop/ops/unfold/unfold.h
@@ -0,0 +1,49 @@
+#ifndef __UNFOLD_H__
+#define __UNFOLD_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the matching UnfoldInfo definition
+#define DESCRIPTOR(NAMESPACE)                                             \
+    namespace op::unfold::NAMESPACE {                                     \
+    class Descriptor final : public InfiniopDescriptor {                  \
+        struct Opaque;                                                    \
+        Opaque *_opaque;                                                  \
+        UnfoldInfo _info;                                                 \
+        size_t _workspace_size;                                           \
+                                                                          \
+        Descriptor(                                                       \
+            Opaque *opaque,                                               \
+            UnfoldInfo info,                                              \
+            size_t workspace_size,                                        \
+            infiniDevice_t device_type,                                   \
+            int device_id)                                                \
+            : InfiniopDescriptor{device_type, device_id},                 \
+              _opaque(opaque),                                            \
+              _info(info),                                                \
+              _workspace_size(workspace_size) {}                          \
+                                                                          \
+    public:                                                               \
+        ~Descriptor();                                                    \
+                                                                          \
+        size_t workspaceSize() const { return _workspace_size; }          \
+                                                                          \
+        static infiniStatus_t create(                                     \
+            infiniopHandle_t handle,                                      \
+            Descriptor **desc_ptr,                                        \
+            infiniopTensorDescriptor_t out_desc,                          \
+            infiniopTensorDescriptor_t input_desc,                        \
+            const int *kernel_sizes,                                      \
+            const int *strides,                                           \
+            const int *paddings,                                          \
+            const int *dilations);                                        \
+                                                                          \
+        infiniStatus_t calculate(                                         \
+            void *workspace,                                              \
+            size_t workspace_size,                                        \
+            void *output,                                                 \
+            const void *input,                                            \
+            void *stream) const;                                          \
+    };                                                                    \
+    }
+
+#endif // __UNFOLD_H__
\ No newline at end of file
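Note (sketch, not part of the patch): the corresponding call sequence for unfold, assuming a 4-D NCHW input descriptor, a matching [N, C*kH*kW, L] output descriptor, and device buffers; the wrapper name and the 3x3 parameters are illustrative and error checks are elided:

    void run_unfold(infiniopHandle_t handle,
                    infiniopTensorDescriptor_t out_desc,
                    infiniopTensorDescriptor_t in_desc,
                    void *d_out, const void *d_in, void *stream) {
        const int kernel_sizes[2] = {3, 3};
        const int strides[2]      = {1, 1};
        const int paddings[2]     = {1, 1};
        const int dilations[2]    = {1, 1};
        infiniopUnfoldDescriptor_t desc;
        infiniopCreateUnfoldDescriptor(handle, &desc, out_desc, in_desc,
                                       kernel_sizes, strides, paddings, dilations);
        size_t ws_size = 0;
        infiniopGetUnfoldWorkspaceSize(desc, &ws_size); // 0 in this patch
        infiniopUnfold(desc, /*workspace=*/nullptr, ws_size, d_out, d_in, stream);
        infiniopDestroyUnfoldDescriptor(desc);
    }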
diff --git a/src/infiniop/ops/vander/cpu/vander_cpu.cc b/src/infiniop/ops/vander/cpu/vander_cpu.cc
new file mode 100644
index 000000000..fd57a2cb6
--- /dev/null
+++ b/src/infiniop/ops/vander/cpu/vander_cpu.cc
@@ -0,0 +1,115 @@
+#include "vander_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cmath>
+#include <cstddef>
+#include <cstdint>
+#include <vector>
+
+#include "../../../../utils/custom_types.h"
+
+namespace op::vander::cpu {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    int N,
+    int increasing) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+
+    // Validate and build the metadata via Info::create
+    auto result = VanderInfo::create(out_desc, input_desc, N, increasing);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0, // the CPU implementation normally needs no extra workspace
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename T>
+void calculate_cpu_impl(
+    const VanderInfo &info,
+    void *output,
+    const void *input) {
+
+    size_t rows = info.rows(); // input size
+    size_t cols = info.cols(); // output columns
+    bool increasing = info.increasing();
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+
+    // Process each row (each element of the input vector) in parallel
+    #pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < rows; ++i) {
+        // Convert the input to float for the accumulation, avoiding the precision
+        // loss of repeated fp16/bf16 multiplies
+        float x = utils::cast<float>(in_ptr[i]);
+
+        // Optimization: a running product instead of the pow function
+        // x^0 = 1.0
+        float val = 1.0f;
+
+        if (increasing) {
+            // order: x^0, x^1, x^2 ...
+            for (size_t j = 0; j < cols; ++j) {
+                out_ptr[i * cols + j] = utils::cast<T>(val);
+                val *= x;
+            }
+        } else {
+            // order: ... x^2, x^1, x^0
+            // fill from the last column (x^0) backwards
+            for (int64_t j = static_cast<int64_t>(cols) - 1; j >= 0; --j) {
+                out_ptr[i * cols + j] = utils::cast<T>(val);
+                val *= x;
+            }
+        }
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::vander::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/cpu/vander_cpu.h b/src/infiniop/ops/vander/cpu/vander_cpu.h
new file mode 100644
index 000000000..1940ab023
--- /dev/null
+++ b/src/infiniop/ops/vander/cpu/vander_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __VANDER_CPU_H__
+#define __VANDER_CPU_H__
+
+#include "../vander.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __VANDER_CPU_H__
\ No newline at end of file
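Note (illustrative): for input x = (1, 2, 3) and cols = 3 the loops above produce

    increasing = true        increasing = false
    1  1  1                  1  1  1
    1  2  4                  4  2  1
    1  3  9                  9  3  1

i.e. column j holds x^j in increasing mode and x^(cols-1-j) otherwise.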
diff --git a/src/infiniop/ops/vander/cuda/kernel.cuh b/src/infiniop/ops/vander/cuda/kernel.cuh
new file mode 100644
index 000000000..33ced35f3
--- /dev/null
+++ b/src/infiniop/ops/vander/cuda/kernel.cuh
@@ -0,0 +1,48 @@
+#ifndef __VANDER_CUDA_CUH__
+#define __VANDER_CUDA_CUH__
+
+#include <cstddef>
+#if defined ENABLE_METAX_API
+    #include <maca_fp16.h>
+    #include <maca_bfloat16.h>
+    using nv_bfloat162 = __maca_bfloat162;
+#else
+    #include <cuda_fp16.h>
+    #include <cuda_bf16.h>
+#endif
+
+#include <cstdint>
+#include <cmath>
+
+namespace op::vander::cuda {
+
+// ==================================================================
+// Core kernel
+// ==================================================================
+template <typename T>
+__global__ void vander_kernel(
+    T * __restrict__ output,      // [rows, cols]
+    const T * __restrict__ input, // [rows]
+    size_t rows,
+    size_t cols,
+    bool increasing) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total_elements = rows * cols;
+
+    if (idx < total_elements) {
+        size_t row = idx / cols;
+        size_t col = idx % cols;
+
+        // Load the input (threads of different cols in the same row read the same
+        // input[row]; L1-cache friendly)
+        float x = static_cast<float>(input[row]);
+        float power = increasing ? static_cast<float>(col)
+                                 : static_cast<float>(cols - 1 - col);
+        float res = powf(x, power);
+
+        output[idx] = static_cast<T>(res);
+    }
+}
+
+} // namespace op::vander::cuda
+
+#endif // __VANDER_CUDA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/info.h b/src/infiniop/ops/vander/info.h
new file mode 100644
index 000000000..18823aa2b
--- /dev/null
+++ b/src/infiniop/ops/vander/info.h
@@ -0,0 +1,71 @@
+#ifndef __VANDER_INFO_H__
+#define __VANDER_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::vander {
+
+class VanderInfo {
+    VanderInfo() = default;
+
+public:
+    int _dtype;       // data type
+    bool _increasing; // power order (false: decreasing, true: increasing)
+
+    // cached shape info
+    size_t _rows; // input vector length (N)
+    size_t _cols; // output matrix column count (M)
+
+    int dtype() const { return _dtype; }
+    bool increasing() const { return _increasing; }
+    size_t rows() const { return _rows; }
+    size_t cols() const { return _cols; }
+
+    // Constructor
+    VanderInfo(int dtype, bool increasing, size_t rows, size_t cols)
+        : _dtype(dtype), _increasing(increasing), _rows(rows), _cols(cols) {}
+
+    static utils::Result<VanderInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        infiniopTensorDescriptor_t input_desc,
+        int N,              // user-specified column count; defaults to the input length if <= 0
+        int increasing) {   // the C API passes an int; converted to bool internally
+
+        // 1. Check the input shape
+        // Input must be a 1-D vector: (rows)
+        if (input_desc->ndim() != 1) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        size_t rows = input_desc->shape()[0];
+        size_t cols = (N > 0) ? static_cast<size_t>(N) : rows;
+
+        // 2. Check the output shape
+        // Output must be a 2-D matrix: (rows, cols)
+        if (out_desc->ndim() != 2) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        if (out_desc->shape()[0] != rows) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        if (out_desc->shape()[1] != cols) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        if (out_desc->dtype() != input_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        return utils::Result<VanderInfo>(VanderInfo{
+            input_desc->dtype(),           // _dtype
+            static_cast<bool>(increasing), // _increasing
+            rows,                          // _rows
+            cols                           // _cols
+        });
+    }
+};
+
+} // namespace op::vander
+
+#endif // __VANDER_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/metax/vander_metax.h b/src/infiniop/ops/vander/metax/vander_metax.h
new file mode 100644
index 000000000..0f5557dcb
--- /dev/null
+++ b/src/infiniop/ops/vander/metax/vander_metax.h
@@ -0,0 +1,8 @@
+#ifndef __VANDER_METAX_API_H__
+#define __VANDER_METAX_API_H__
+
+#include "../vander.h"
+
+DESCRIPTOR(metax)
+
+#endif // __VANDER_METAX_API_H__
\ No newline at end of file
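Note (sketch, not part of the patch): the GPU kernels here call powf once per output element, while the CPU path uses a running product. A per-thread running product is a possible alternative when exact integer-power semantics matter; the variable names mirror vander_kernel:

    // inside the idx < total_elements branch, replacing the powf call:
    size_t e_count = increasing ? col : (cols - 1 - col);
    float res = 1.0f;
    for (size_t e = 0; e < e_count; ++e) {
        res *= x; // repeated multiply: exact for integer exponents, O(cols) worst case
    }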
diff --git a/src/infiniop/ops/vander/metax/vander_metax.maca b/src/infiniop/ops/vander/metax/vander_metax.maca
new file mode 100644
index 000000000..58e44f7f3
--- /dev/null
+++ b/src/infiniop/ops/vander/metax/vander_metax.maca
@@ -0,0 +1,144 @@
+#include "vander_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
+#include <cstddef>
+#include <cmath>
+// The MACA half/bfloat16 SDK headers are assumed to be pulled in via metax_common.h.
+using nv_bfloat16 = __maca_bfloat16;
+using nv_bfloat162 = __maca_bfloat162;
+
+namespace op::vander::metax {
+template <typename T>
+__global__ void vander_kernel(
+    T * __restrict__ output,      // [rows, cols]
+    const T * __restrict__ input, // [rows]
+    size_t rows,
+    size_t cols,
+    bool increasing) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total_elements = rows * cols;
+
+    if (idx < total_elements) {
+        size_t row = idx / cols;
+        size_t col = idx % cols;
+
+        // Load the input (threads covering different cols of the same row all
+        // read the same input[row], which is L1-cache friendly)
+        float x = static_cast<float>(input[row]);
+
+        // Compute the exponent
+        // increasing=true:  x^0, x^1, ..., x^(N-1)
+        // increasing=false: x^(N-1), ..., x^1, x^0
+        float power = increasing ? static_cast<float>(col)
+                                 : static_cast<float>(cols - 1 - col);
+
+        float res = powf(x, power);
+
+        output[idx] = static_cast<T>(res);
+    }
+}
+
+// ==================================================================
+// 3. Kernel launch logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const VanderInfo &info,
+    void *stream) {
+
+    // 1. Set up pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    // 2. Set up parameters
+    size_t rows = info.rows();
+    size_t cols = info.cols();
+    bool increasing = info.increasing();
+
+    // Compute the grid size
+    size_t total_elements = rows * cols;
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // 3. Launch the kernel
+    vander_kernel<T>
+        <<<grid_size, block_size, 0, mc_stream>>>(
+            out_ptr, in_ptr, rows, cols, increasing);
+}
+
+// ==================================================================
+// 4. Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    int N,
+    int increasing) {
+
+    // 1. Create and validate the info struct
+    auto info_result = VanderInfo::create(out_desc, input_desc, N, increasing);
+    if (!info_result) return info_result.status();
+
+    // 2. Create the descriptor
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    // Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<__half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__maca_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::vander::metax
\ No newline at end of file
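All three GPU backends size the launch with the same ceiling-division idiom. A quick standalone check (plain C++, not part of the diff):

#include <cstddef>
#include <cstdio>

// Ceiling division used to compute grid_size above.
constexpr std::size_t ceil_div(std::size_t total, std::size_t block) {
    return (total + block - 1) / block;
}

int main() {
    // 1000 elements with block_size = 256 -> 4 blocks = 1024 threads;
    // the 24 surplus threads fail the idx < total_elements guard and exit.
    std::printf("%zu\n", ceil_div(1000, 256)); // prints 4
}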
diff --git a/src/infiniop/ops/vander/moore/vander_moore.h b/src/infiniop/ops/vander/moore/vander_moore.h
new file mode 100644
index 000000000..57d1e4d8e
--- /dev/null
+++ b/src/infiniop/ops/vander/moore/vander_moore.h
@@ -0,0 +1,8 @@
+#ifndef __VANDER_MOORE_H__
+#define __VANDER_MOORE_H__
+
+#include "../vander.h"
+
+DESCRIPTOR(moore)
+
+#endif // __VANDER_MOORE_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/moore/vander_moore.mu b/src/infiniop/ops/vander/moore/vander_moore.mu
new file mode 100644
index 000000000..6f9db92bb
--- /dev/null
+++ b/src/infiniop/ops/vander/moore/vander_moore.mu
@@ -0,0 +1,109 @@
+#include "vander_moore.h"
+#include "vander_moore_kernel.h"
+#include "../../../devices/moore/moore_handle.h"
+#include <musa_runtime.h>
+#include <cstddef>
+
+namespace op::vander::moore {
+
+// ==================================================================
+// Kernel launch logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const VanderInfo &info,
+    void *stream) {
+
+    // 1. Set up pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    // 2. Set up parameters
+    size_t rows = info.rows();
+    size_t cols = info.cols();
+    bool increasing = info.increasing();
+
+    // The total element count determines the grid size
+    size_t total_elements = rows * cols;
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // Launch the Moore kernel
+    op::vander::moore::vander_kernel<T>
+        <<<grid_size, block_size, 0, musa_stream>>>(
+            out_ptr, in_ptr, rows, cols, increasing);
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    int N,
+    int increasing) {
+
+    // 1. Create and validate the info struct
+    auto info_result = VanderInfo::create(out_desc, input_desc, N, increasing);
+    if (!info_result) return info_result.status();
+
+    // 2. Create the descriptor
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    // Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__mt_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::vander::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/moore/vander_moore_kernel.h b/src/infiniop/ops/vander/moore/vander_moore_kernel.h
new file mode 100644
index 000000000..82d8a246c
--- /dev/null
+++ b/src/infiniop/ops/vander/moore/vander_moore_kernel.h
@@ -0,0 +1,61 @@
+#ifndef __VANDER_MOORE_KERNEL_H__
+#define __VANDER_MOORE_KERNEL_H__
+
+#include <musa_runtime.h>
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cstddef>
+#include <cmath>
+#include <type_traits>
+
+namespace op::vander::moore {
+
+// ==================================================================
+// Core kernel
+// ==================================================================
+template <typename T>
+__global__ void vander_kernel(
+    T * __restrict__ output,      // [rows, cols]
+    const T * __restrict__ input, // [rows]
+    size_t rows,
+    size_t cols,
+    bool increasing) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t total_elements = rows * cols;
+
+    if (idx < total_elements) {
+        size_t row = idx / cols;
+        size_t col = idx % cols;
+
+        // 1. Read the input and convert it to float
+        T in_val = input[row];
+        float x;
+
+        if constexpr (std::is_same_v<T, half>) {
+            x = __half2float(in_val);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            x = __bfloat162float(in_val);
+        } else {
+            x = static_cast<float>(in_val);
+        }
+
+        // 2. Compute the power
+        float power = increasing ? static_cast<float>(col)
+                                 : static_cast<float>(cols - 1 - col);
+        float res = powf(x, power);
+
+        // 3. Write the result back
+        if constexpr (std::is_same_v<T, half>) {
+            output[idx] = __float2half(res);
+        } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            output[idx] = __float2bfloat16(res);
+        } else {
+            output[idx] = static_cast<T>(res);
+        }
+    }
+}
+
+} // namespace op::vander::moore
+
+#endif // __VANDER_MOORE_KERNEL_H__
\ No newline at end of file
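The if constexpr chain in the Moore kernel is what lets one template call type-specific intrinsics such as __half2float: the non-matching branch is discarded at compile time instead of being type-checked against the wrong T. A host-side illustration of the same pattern (plain C++, not part of the diff):

#include <string>
#include <type_traits>

// The discarded branch is never instantiated, so a call that only exists
// for one T (here .substr, standing in for __half2float) still compiles.
template <typename T>
std::string describe(T value) {
    if constexpr (std::is_same_v<T, std::string>) {
        return value.substr(0, 8); // valid only when T is std::string
    } else {
        return "not a string";
    }
}
// describe(42) compiles and returns "not a string".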
diff --git a/src/infiniop/ops/vander/nvidia/vander_nvidia.cu b/src/infiniop/ops/vander/nvidia/vander_nvidia.cu
new file mode 100644
index 000000000..00bf41872
--- /dev/null
+++ b/src/infiniop/ops/vander/nvidia/vander_nvidia.cu
@@ -0,0 +1,108 @@
+#include "vander_nvidia.cuh"
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cuda_runtime.h>
+#include <cstddef>
+
+namespace op::vander::nvidia {
+
+// ==================================================================
+// Kernel launch logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const VanderInfo &info,
+    void *stream) {
+
+    // 1. Set up pointers
+    auto in_ptr = reinterpret_cast<const T *>(input);
+    auto out_ptr = reinterpret_cast<T *>(output);
+
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    // 2. Set up parameters
+    size_t rows = info.rows();
+    size_t cols = info.cols();
+    bool increasing = info.increasing();
+
+    // The total element count determines the grid size
+    size_t total_elements = rows * cols;
+    size_t block_size = 256;
+    size_t grid_size = (total_elements + block_size - 1) / block_size;
+
+    // Launch the CUDA kernel
+    op::vander::cuda::vander_kernel<T>
+        <<<grid_size, block_size, 0, cuda_stream>>>(
+            out_ptr, in_ptr, rows, cols, increasing);
+}
+
+// ==================================================================
+// Descriptor implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) delete _opaque;
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    int N,
+    int increasing) {
+
+    // 1. Create and validate the info struct
+    auto info_result = VanderInfo::create(out_desc, input_desc, N, increasing);
+    if (!info_result) return info_result.status();
+
+    // 2. Create the descriptor
+    // Vander is an element-wise expansion, so no extra workspace is needed
+    size_t workspace_size = 0;
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        info_result.take(),
+        workspace_size,
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+
+    // Dispatch the kernel by data type
+    switch (dtype) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<nv_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::vander::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/nvidia/vander_nvidia.cuh b/src/infiniop/ops/vander/nvidia/vander_nvidia.cuh
new file mode 100644
index 000000000..b9da220ef
--- /dev/null
+++ b/src/infiniop/ops/vander/nvidia/vander_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __VANDER_NVIDIA_CUH__
+#define __VANDER_NVIDIA_CUH__
+
+#include "../vander.h"
+
+DESCRIPTOR(nvidia)
+
+#endif // __VANDER_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/operator.cc b/src/infiniop/ops/vander/operator.cc
new file mode 100644
index 000000000..715e769dd
--- /dev/null
+++ b/src/infiniop/ops/vander/operator.cc
@@ -0,0 +1,181 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/vander.h"
+
+// --- Backend implementation headers ---
+#ifdef ENABLE_CPU_API
+#include "cpu/vander_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/vander_nvidia.cuh"
+#endif
+
+#ifdef ENABLE_METAX_API
+#include "metax/vander_metax.h"
+#endif
+
+#ifdef ENABLE_MOORE_API
+#include "moore/vander_moore.h"
+#endif
+
+extern "C" {
+
+// =======================================================================
+// 1. Create the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopCreateVanderDescriptor(
+    infiniopHandle_t handle,
+    infiniopVanderDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t output,
+    infiniopTensorDescriptor_t input,
+    int N,
+    int increasing) {
+
+    // Local macro to keep the multi-backend dispatch concise
+    #define CREATE(CASE, NAMESPACE)                                               \
+        case CASE:                                                                \
+            return op::vander::NAMESPACE::Descriptor::create(                     \
+                handle,                                                           \
+                reinterpret_cast<op::vander::NAMESPACE::Descriptor **>(desc_ptr), \
+                output,                                                           \
+                input,                                                            \
+                N,                                                                \
+                increasing)
+
+    switch (handle->device) {
+    #ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CREATE
+}
+
+// =======================================================================
+// 2. Query the workspace size
+// =======================================================================
+__C infiniStatus_t infiniopGetVanderWorkspaceSize(infiniopVanderDescriptor_t desc, size_t *size) {
+
+    #define GET(CASE, NAMESPACE)                                                                      \
+        case CASE:                                                                                    \
+            *size = reinterpret_cast<op::vander::NAMESPACE::Descriptor *>(desc)->workspaceSize();     \
+            return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef GET
+}
+
+// =======================================================================
+// 3. Run the computation (calculate)
+// =======================================================================
+__C infiniStatus_t infiniopVander(
+    infiniopVanderDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) {
+
+    #define CALCULATE(CASE, NAMESPACE)                                           \
+        case CASE:                                                               \
+            return reinterpret_cast<op::vander::NAMESPACE::Descriptor *>(desc)   \
+                ->calculate(workspace, workspace_size, output, input, stream)
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef CALCULATE
+}
+
+// =======================================================================
+// 4. Destroy the operator descriptor
+// =======================================================================
+__C infiniStatus_t infiniopDestroyVanderDescriptor(infiniopVanderDescriptor_t desc) {
+
+    #define DELETE(CASE, NAMESPACE)                                              \
+        case CASE:                                                               \
+            delete reinterpret_cast<op::vander::NAMESPACE::Descriptor *>(desc);  \
+            return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+    #ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+    #endif
+    #ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+    #endif
+    #ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+    #endif
+    #ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+    #endif
+    #ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+    #endif
+    #ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+    #endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+    #undef DELETE
+}
+
+} // extern "C"
\ No newline at end of file
diff --git a/src/infiniop/ops/vander/vander.h b/src/infiniop/ops/vander/vander.h
new file mode 100644
index 000000000..0eeaaea50
--- /dev/null
+++ b/src/infiniop/ops/vander/vander.h
@@ -0,0 +1,49 @@
+#ifndef __VANDER_H__
+#define __VANDER_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the VanderInfo definition
+
+// Macro that generates the Descriptor class inside each backend namespace
+#define DESCRIPTOR(NAMESPACE)                                     \
+    namespace op::vander::NAMESPACE {                             \
+    class Descriptor final : public InfiniopDescriptor {          \
+        struct Opaque;                                            \
+        Opaque *_opaque;                                          \
+        VanderInfo _info;                                         \
+        size_t _workspace_size;                                   \
+                                                                  \
+        Descriptor(                                               \
+            Opaque *opaque,                                       \
+            VanderInfo info,                                      \
+            size_t workspace_size,                                \
+            infiniDevice_t device_type,                           \
+            int device_id)                                        \
+            : InfiniopDescriptor{device_type, device_id},         \
+              _opaque(opaque),                                    \
+              _info(info),                                        \
+              _workspace_size(workspace_size) {}                  \
+                                                                  \
+    public:                                                       \
+        ~Descriptor();                                            \
+                                                                  \
+        size_t workspaceSize() const { return _workspace_size; }  \
+                                                                  \
+        static infiniStatus_t create(                             \
+            infiniopHandle_t handle,                              \
+            Descriptor **desc_ptr,                                \
+            infiniopTensorDescriptor_t out_desc,                  \
+            infiniopTensorDescriptor_t input_desc,                \
+            int N,                                                \
+            int increasing);                                      \
+                                                                  \
+        infiniStatus_t calculate(                                 \
+            void *workspace,                                      \
+            size_t workspace_size,                                \
+            void *output,                                         \
+            const void *input,                                    \
+            void *stream) const;                                  \
+    };                                                            \
+    }
+
+#endif // __VANDER_H__
\ No newline at end of file
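For orientation, DESCRIPTOR(cpu) above stamps out roughly the following class in op::vander::cpu (abridged from the macro; the private constructor is elided):

namespace op::vander::cpu {
class Descriptor final : public InfiniopDescriptor {
    struct Opaque;
    Opaque *_opaque;
    VanderInfo _info;
    size_t _workspace_size;
    // ... private constructor ...
public:
    ~Descriptor();
    size_t workspaceSize() const { return _workspace_size; }
    static infiniStatus_t create(infiniopHandle_t handle, Descriptor **desc_ptr,
                                 infiniopTensorDescriptor_t out_desc,
                                 infiniopTensorDescriptor_t input_desc,
                                 int N, int increasing);
    infiniStatus_t calculate(void *workspace, size_t workspace_size,
                             void *output, const void *input, void *stream) const;
};
} // namespace op::vander::cpu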
diff --git a/test/infinicore/ops/logcumsumexp.py b/test/infinicore/ops/logcumsumexp.py
index 82514064b..385c592f1 100644
--- a/test/infinicore/ops/logcumsumexp.py
+++ b/test/infinicore/ops/logcumsumexp.py
@@ -77,9 +77,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.logcumsumexp(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.logcumsumexp(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.logcumsumexp(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/logical_and.py b/test/infinicore/ops/logical_and.py
index 85f5438b9..caa976f0c 100644
--- a/test/infinicore/ops/logical_and.py
+++ b/test/infinicore/ops/logical_and.py
@@ -109,9 +109,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.logical_and(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.logical_and(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.logical_and(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/logical_not.py b/test/infinicore/ops/logical_not.py
index c2e0d363d..e7857bcf7 100644
--- a/test/infinicore/ops/logical_not.py
+++ b/test/infinicore/ops/logical_not.py
@@ -93,9 +93,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.logical_not(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.logical_not(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.logical_not(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/unfold.py b/test/infinicore/ops/unfold.py
index 2d2b30603..384c63506 100644
--- a/test/infinicore/ops/unfold.py
+++ b/test/infinicore/ops/unfold.py
@@ -72,9 +72,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.unfold(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.unfold(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.unfold(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/vander.py b/test/infinicore/ops/vander.py
index 25040d474..02465d8aa 100644
--- a/test/infinicore/ops/vander.py
+++ b/test/infinicore/ops/vander.py
@@ -60,9 +60,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.vander(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.vander(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.vander(*args, **kwargs)
 
 
 def main():
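Taken together, the C API in operator.cc follows the usual create, query-workspace, run, destroy lifecycle. A hedged usage sketch (run_vander is a hypothetical wrapper; the handle, tensor descriptors, device buffers, and stream are assumed to be set up elsewhere — only the four Vander entry points come from this patch):

// Minimal sketch of driving the vander operator through the C API.
infiniStatus_t run_vander(infiniopHandle_t handle,
                          infiniopTensorDescriptor_t out_desc,
                          infiniopTensorDescriptor_t in_desc,
                          void *d_out, const void *d_in, void *stream) {
    infiniopVanderDescriptor_t desc;
    int N = 0;          // <= 0: column count defaults to the input length
    int increasing = 0; // 0 = decreasing powers (the torch.vander default)

    infiniStatus_t status = infiniopCreateVanderDescriptor(
        handle, &desc, out_desc, in_desc, N, increasing);
    if (status != INFINI_STATUS_SUCCESS) return status;

    size_t workspace_size = 0;
    infiniopGetVanderWorkspaceSize(desc, &workspace_size); // 0 for vander

    status = infiniopVander(desc, /*workspace=*/nullptr, workspace_size,
                            d_out, d_in, stream);
    infiniopDestroyVanderDescriptor(desc);
    return status;
}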