diff --git a/include/infinicore/common/hash.hpp b/include/infinicore/common/hash.hpp
index ec930a53f..7d450b3b7 100644
--- a/include/infinicore/common/hash.hpp
+++ b/include/infinicore/common/hash.hpp
@@ -15,6 +15,14 @@ hash_combine(size_t &seed, const T &value) {
 
 // Specialization for Tensor
 inline void hash_combine(size_t &seed, Tensor tensor) {
+    // For an undefined tensor (default-constructed), just mix in a sentinel
+    // value so that optional arguments like weight/pos_weight do not cause
+    // null dereferences when computing cache keys.
+    if (!tensor) {
+        hash_combine(seed, static_cast<size_t>(0));
+        return;
+    }
+
     hash_combine(seed, static_cast<size_t>(tensor->dtype()));
     for (Size shape : tensor->shape()) {
         hash_combine(seed, shape);
diff --git a/include/infinicore/ops.hpp b/include/infinicore/ops.hpp
index 0937a4821..25b4e7b8c 100644
--- a/include/infinicore/ops.hpp
+++ b/include/infinicore/ops.hpp
@@ -10,3 +10,8 @@
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
+#include "ops/atanh.hpp"
+#include "ops/addcmul.hpp"
+#include "ops/cdist.hpp"
+#include "ops/reciprocal.hpp"
+#include "ops/binary_cross_entropy_with_logits.hpp"
\ No newline at end of file
diff --git a/include/infinicore/ops/addcmul.hpp b/include/infinicore/ops/addcmul.hpp
new file mode 100644
index 000000000..9597fa0cb
--- /dev/null
+++ b/include/infinicore/ops/addcmul.hpp
@@ -0,0 +1,17 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Addcmul {
+public:
+    // schema: out, input, t1, t2, value
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, float);
+    static void execute(Tensor out, Tensor input, Tensor t1, Tensor t2, float value);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+Tensor addcmul(Tensor input, Tensor t1, Tensor t2, float value);
+void addcmul_(Tensor out, Tensor input, Tensor t1, Tensor t2, float value);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/atanh.hpp b/include/infinicore/ops/atanh.hpp
new file mode 100644
index 000000000..9f75c4d53
--- /dev/null
+++ b/include/infinicore/ops/atanh.hpp
@@ -0,0 +1,34 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Atanh {
+public:
+    // schema: void(output Tensor, input Tensor)
+    using schema = void (*)(Tensor, Tensor);
+
+    // Execution entry point.
+    static void execute(Tensor y, Tensor a);
+
+    // Operator dispatcher used to match backends (CPU/CUDA, etc.).
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+/**
+ * @brief Compute the inverse hyperbolic tangent of a tensor (out-of-place).
+ * @param a Input tensor
+ * @return A new tensor holding the result
+ */
+Tensor atanh(Tensor a);
+
+/**
+ * @brief Compute the inverse hyperbolic tangent into a given output tensor.
+ * @param y Output tensor
+ * @param a Input tensor
+ */
+void atanh_(Tensor y, Tensor a);
+
+} // namespace infinicore::op
\ No newline at end of file
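Each operator in this PR pairs an out-of-place entry point with an explicit-output variant (here `atanh` / `atanh_`). A minimal sketch of how the Python layer added later in this PR calls them — the tensor-creation call is illustrative only, since construction arguments depend on the infinicore build:

```python
import infinicore

x = infinicore.empty((4, 4))  # illustrative creation call
y = infinicore.atanh(x)       # out-of-place: allocates the result
infinicore.atanh(x, out=y)    # explicit output: reuses y's storage
```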
diff --git a/include/infinicore/ops/binary_cross_entropy_with_logits.hpp b/include/infinicore/ops/binary_cross_entropy_with_logits.hpp
new file mode 100644
index 000000000..f291c939e
--- /dev/null
+++ b/include/infinicore/ops/binary_cross_entropy_with_logits.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <string>
+
+namespace infinicore::op {
+
+class BinaryCrossEntropyWithLogits {
+public:
+    /**
+     * @brief Function signature of the BCEWithLogits operator.
+     * Argument order: out, logits, target, weight, pos_weight, reduction
+     */
+    using schema = void (*)(Tensor, Tensor, Tensor, Tensor, Tensor, std::string);
+
+    static void execute(Tensor out,
+                        Tensor logits,
+                        Tensor target,
+                        Tensor weight,
+                        Tensor pos_weight,
+                        std::string reduction);
+
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+/**
+ * @brief Out-of-place interface.
+ */
+Tensor binary_cross_entropy_with_logits(Tensor logits,
+                                        Tensor target,
+                                        Tensor weight = {},
+                                        Tensor pos_weight = {},
+                                        std::string reduction = "mean");
+
+/**
+ * @brief Interface with an explicitly provided output tensor.
+ */
+void binary_cross_entropy_with_logits_(Tensor out,
+                                       Tensor logits,
+                                       Tensor target,
+                                       Tensor weight,
+                                       Tensor pos_weight,
+                                       std::string reduction);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/cdist.hpp b/include/infinicore/ops/cdist.hpp
new file mode 100644
index 000000000..a572208e2
--- /dev/null
+++ b/include/infinicore/ops/cdist.hpp
@@ -0,0 +1,32 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Cdist {
+public:
+    /**
+     * @brief Pairwise distance operator.
+     * schema: out (M, N), x1 (M, D), x2 (N, D), p (norm degree)
+     */
+    using schema = void (*)(Tensor, Tensor, Tensor, double);
+
+    static void execute(Tensor out, Tensor x1, Tensor x2, double p);
+
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+/**
+ * @brief Out-of-place interface.
+ * @return A new tensor of shape (M, N)
+ */
+Tensor cdist(Tensor x1, Tensor x2, double p = 2.0);
+
+/**
+ * @brief Interface with an explicitly provided output tensor.
+ */
+void cdist_(Tensor out, Tensor x1, Tensor x2, double p = 2.0);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/reciprocal.hpp b/include/infinicore/ops/reciprocal.hpp
new file mode 100644
index 000000000..2d91772ad
--- /dev/null
+++ b/include/infinicore/ops/reciprocal.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Reciprocal {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor y, Tensor x);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+Tensor reciprocal(Tensor x);
+void reciprocal_(Tensor y, Tensor x);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/tensor.hpp b/include/infinicore/tensor.hpp
index 58a8f59e7..c63d3c82e 100644
--- a/include/infinicore/tensor.hpp
+++ b/include/infinicore/tensor.hpp
@@ -86,6 +86,9 @@ class Tensor {
 
     operator bool() const;
 
+    // Whether this Tensor is defined (holds a valid implementation).
+    bool is_defined() const { return static_cast<bool>(*this); }
+
 protected:
     Tensor(std::shared_ptr<TensorImpl> impl) : impl_(std::move(impl)) {}
     std::shared_ptr<TensorImpl> impl_;
diff --git a/include/infiniop.h b/include/infiniop.h
index 92e6f5963..216ade693 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -30,6 +30,11 @@
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
+#include "infiniop/ops/atanh.h"
+#include "infiniop/ops/addcmul.h"
+#include "infiniop/ops/cdist.h"
+#include "infiniop/ops/binary_cross_entropy_with_logits.h"
+#include "infiniop/ops/reciprocal.h"
 #include "infiniop/tensor_descriptor.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/addcmul.h b/include/infiniop/ops/addcmul.h
new file mode 100644
index 000000000..d2605c7e8
--- /dev/null
+++ b/include/infiniop/ops/addcmul.h
@@ -0,0 +1,57 @@
+#ifndef __INFINIOP_ADDCMUL_API_H__
+#define __INFINIOP_ADDCMUL_API_H__
+
+#include "../operator_descriptor.h"
+
+// Descriptor type for the addcmul operator.
+typedef struct InfiniopDescriptor *infiniopAddcmulDescriptor_t;
+
+/**
+ * @brief Create an Addcmul operator descriptor.
+ * @param handle Operator handle
+ * @param desc_ptr Receives the created descriptor
+ * @param out Output tensor descriptor
+ * @param input Descriptor of the tensor to be added
+ * @param tensor1 Descriptor of the first multiplicand tensor
+ * @param tensor2 Descriptor of the second multiplicand tensor
+ * @param value Scalar multiplier applied to the product
+ */
+__C __export infiniStatus_t infiniopCreateAddcmulDescriptor(infiniopHandle_t handle,
+                                                            infiniopAddcmulDescriptor_t *desc_ptr,
+                                                            infiniopTensorDescriptor_t out,
+                                                            infiniopTensorDescriptor_t input,
+                                                            infiniopTensorDescriptor_t tensor1,
+                                                            infiniopTensorDescriptor_t tensor2,
+                                                            float value);
+
+/**
+ * @brief Query the workspace size required by Addcmul.
+ */
+__C __export infiniStatus_t infiniopGetAddcmulWorkspaceSize(infiniopAddcmulDescriptor_t desc, size_t *size);
+
+/**
+ * @brief Run the Addcmul computation.
+ * @param desc Operator descriptor
+ * @param workspace Workspace pointer
+ * @param workspace_size Workspace size in bytes
+ * @param out Output data pointer
+ * @param input Data pointer of the added tensor
+ * @param tensor1 Data pointer of the first multiplicand
+ * @param tensor2 Data pointer of the second multiplicand
+ * @param stream Compute stream (e.g. a CUDA stream)
+ */
+__C __export infiniStatus_t infiniopAddcmul(infiniopAddcmulDescriptor_t desc,
+                                            void *workspace,
+                                            size_t workspace_size,
+                                            void *out,
+                                            const void *input,
+                                            const void *tensor1,
+                                            const void *tensor2,
+                                            void *stream);
+
+/**
+ * @brief Destroy the Addcmul operator descriptor.
+ */
+__C __export infiniStatus_t infiniopDestroyAddcmulDescriptor(infiniopAddcmulDescriptor_t desc);
+
+#endif
\ No newline at end of file
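The pybind11 docstring later in this PR spells out the addcmul semantics: `out = input + value * tensor1 * tensor2`, element-wise. A hedged Python-level sketch (the `empty` creation calls are illustrative; dtype/device arguments are omitted):

```python
import infinicore

# addcmul computes, element-wise: out = input + value * tensor1 * tensor2.
a = infinicore.empty((2, 3))   # illustrative creation calls
t1 = infinicore.empty((2, 3))
t2 = infinicore.empty((2, 3))

out = infinicore.addcmul(a, t1, t2, value=0.5)
infinicore.addcmul(a, t1, t2, value=0.5, out=out)  # explicit-output variant
```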
diff --git a/include/infiniop/ops/atanh.h b/include/infiniop/ops/atanh.h
new file mode 100644
index 000000000..021ab6bc1
--- /dev/null
+++ b/include/infiniop/ops/atanh.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_ATANH_API_H__
+#define __INFINIOP_ATANH_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopAtanhDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateAtanhDescriptor(infiniopHandle_t handle,
+                                                          infiniopAtanhDescriptor_t *desc_ptr,
+                                                          infiniopTensorDescriptor_t y,
+                                                          infiniopTensorDescriptor_t a);
+
+__C __export infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopAtanh(infiniopAtanhDescriptor_t desc,
+                                          void *workspace,
+                                          size_t workspace_size,
+                                          void *y,
+                                          const void *a,
+                                          void *stream);
+
+__C __export infiniStatus_t infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/binary_cross_entropy_with_logits.h b/include/infiniop/ops/binary_cross_entropy_with_logits.h
new file mode 100644
index 000000000..90ff7c37b
--- /dev/null
+++ b/include/infiniop/ops/binary_cross_entropy_with_logits.h
@@ -0,0 +1,73 @@
+#ifndef __INFINIOP_BINARY_CROSS_ENTROPY_WITH_LOGITS_API_H__
+#define __INFINIOP_BINARY_CROSS_ENTROPY_WITH_LOGITS_API_H__
+
+#include "../operator_descriptor.h"
+
+// Reduction mode enum.
+typedef enum {
+    INFINIOP_REDUCTION_NONE = 0,
+    INFINIOP_REDUCTION_MEAN = 1,
+    INFINIOP_REDUCTION_SUM = 2
+} infiniopReduction_t;
+
+// Descriptor type for the BCEWithLogits operator.
+typedef struct InfiniopDescriptor *infiniopBCEWithLogitsDescriptor_t;
+
+/**
+ * @brief Create a BCEWithLogits operator descriptor.
+ * @param handle Operator handle
+ * @param desc_ptr Receives the created descriptor
+ * @param out Output tensor descriptor (same shape as the input for 'none';
+ *            a scalar for 'mean'/'sum')
+ * @param logits Input logits tensor descriptor
+ * @param target Target label tensor descriptor
+ * @param weight Per-sample weight descriptor (optional; pass NULL if unused)
+ * @param pos_weight Positive-class weight descriptor (optional; pass NULL if unused)
+ * @param reduction Reduction mode (none, mean, sum)
+ */
+__C __export infiniStatus_t infiniopCreateBCEWithLogitsDescriptor(
+    infiniopHandle_t handle,
+    infiniopBCEWithLogitsDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t out,
+    infiniopTensorDescriptor_t logits,
+    infiniopTensorDescriptor_t target,
+    infiniopTensorDescriptor_t weight,
+    infiniopTensorDescriptor_t pos_weight,
+    infiniopReduction_t reduction);
+
+/**
+ * @brief Query the workspace size required by BCEWithLogits.
+ */
+__C __export infiniStatus_t infiniopGetBCEWithLogitsWorkspaceSize(
+    infiniopBCEWithLogitsDescriptor_t desc,
+    size_t *size);
+
+/**
+ * @brief Run the BCEWithLogits computation.
+ * @param desc Operator descriptor
+ * @param workspace Workspace pointer
+ * @param workspace_size Workspace size in bytes
+ * @param out Output data pointer
+ * @param logits Logits data pointer
+ * @param target Target data pointer
+ * @param weight Weight data pointer (optional; NULL means all-ones weights)
+ * @param pos_weight Positive-class weight data pointer (optional; NULL means all-ones weights)
+ * @param stream Compute stream
+ */
+__C __export infiniStatus_t infiniopBCEWithLogits(
+    infiniopBCEWithLogitsDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight,
+    void *stream);
+
+/**
+ * @brief Destroy the BCEWithLogits operator descriptor.
+ */
+__C __export infiniStatus_t infiniopDestroyBCEWithLogitsDescriptor(
+    infiniopBCEWithLogitsDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/cdist.h b/include/infiniop/ops/cdist.h
new file mode 100644
index 000000000..f2307d2ab
--- /dev/null
+++ b/include/infiniop/ops/cdist.h
@@ -0,0 +1,56 @@
+#ifndef __INFINIOP_CDIST_API_H__
+#define __INFINIOP_CDIST_API_H__
+
+#include "../operator_descriptor.h"
+
+// Descriptor type for the cdist operator.
+typedef struct InfiniopDescriptor *infiniopCdistDescriptor_t;
+
+/**
+ * @brief Create a Cdist operator descriptor.
+ * @param handle Operator handle
+ * @param desc_ptr Receives the created descriptor
+ * @param y_desc Output tensor descriptor (shape: M x N)
+ * @param x1_desc First input tensor descriptor (shape: M x D)
+ * @param x2_desc Second input tensor descriptor (shape: N x D)
+ * @param p Norm degree (L-p norm)
+ */
+__C __export infiniStatus_t infiniopCreateCdistDescriptor(
+    infiniopHandle_t handle,
+    infiniopCdistDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p);
+
+/**
+ * @brief Query the workspace size required by Cdist.
+ */
+__C __export infiniStatus_t infiniopGetCdistWorkspaceSize(infiniopCdistDescriptor_t desc,
+                                                          size_t *size);
+
+/**
+ * @brief Run the Cdist computation.
+ * @param desc Operator descriptor
+ * @param workspace Workspace pointer
+ * @param workspace_size Workspace size in bytes
+ * @param y Output data pointer
+ * @param x1 First input data pointer
+ * @param x2 Second input data pointer
+ * @param stream Compute stream (e.g. a CUDA stream)
+ */
+__C __export infiniStatus_t infiniopCdist(
+    infiniopCdistDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream);
+
+/**
+ * @brief Destroy the Cdist operator descriptor.
+ */
+__C __export infiniStatus_t infiniopDestroyCdistDescriptor(infiniopCdistDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/reciprocal.h b/include/infiniop/ops/reciprocal.h
new file mode 100644
index 000000000..304c3c754
--- /dev/null
+++ b/include/infiniop/ops/reciprocal.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_RECIPROCAL_API_H__
+#define __INFINIOP_RECIPROCAL_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopReciprocalDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateReciprocalDescriptor(infiniopHandle_t handle,
+                                                               infiniopReciprocalDescriptor_t *desc_ptr,
+                                                               infiniopTensorDescriptor_t y,
+                                                               infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopReciprocal(infiniopReciprocalDescriptor_t desc,
+                                               void *workspace,
+                                               size_t workspace_size,
+                                               void *y,
+                                               const void *x,
+                                               void *stream);
+
+__C __export infiniStatus_t infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc);
+
+#endif
\ No newline at end of file
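As the BCEWithLogits header documents, the output shape depends on the reduction mode: 'none' keeps the element-wise losses, while 'mean' and 'sum' reduce to a 0-dim scalar. A hedged sketch of the corresponding Python-level calls added later in this PR (tensor creation illustrative):

```python
import infinicore
from infinicore.nn import functional as F

logits = infinicore.empty((8, 3))
target = infinicore.empty((8, 3))

per_elem = F.binary_cross_entropy_with_logits(logits, target, reduction="none")  # shape (8, 3)
mean_loss = F.binary_cross_entropy_with_logits(logits, target)                   # "mean" default, 0-dim
sum_loss = F.binary_cross_entropy_with_logits(logits, target, reduction="sum")   # 0-dim
```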
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index 7ca962449..431aea92c 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -47,6 +47,11 @@
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+from infinicore.ops.atanh import atanh
+from infinicore.ops.addcmul import addcmul
+from infinicore.ops.cdist import cdist
+from infinicore.ops.binary_cross_entropy_with_logits import binary_cross_entropy_with_logits
+from infinicore.ops.reciprocal import reciprocal
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -108,6 +113,11 @@
     "narrow",
     "squeeze",
     "unsqueeze",
+    "atanh",
+    "addcmul",
+    "cdist",
+    "binary_cross_entropy_with_logits",
+    "reciprocal",
     "rearrange",
     "empty",
     "empty_like",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..68d498f69 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -6,9 +6,11 @@
 from .rope import RopeAlgo, rope
 from .silu import silu
 from .swiglu import swiglu
+from .binary_cross_entropy_with_logits import binary_cross_entropy_with_logits
 
 __all__ = [
     "causal_softmax",
+    "binary_cross_entropy_with_logits",
     "random_sample",
     "rms_norm",
     "silu",
+ """ + + # Out-of-place API + if out is None: + # Neither weight nor pos_weight + if weight is None and pos_weight is None: + return Tensor( + _infinicore.binary_cross_entropy_with_logits( + input._underlying, + target._underlying, + reduction=reduction, + ) + ) + + # weight provided only + if weight is not None and pos_weight is None: + return Tensor( + _infinicore.binary_cross_entropy_with_logits( + input._underlying, + target._underlying, + weight._underlying, + reduction=reduction, + ) + ) + + # pos_weight provided only + if weight is None and pos_weight is not None: + return Tensor( + _infinicore.binary_cross_entropy_with_logits( + input._underlying, + target._underlying, + pos_weight=pos_weight._underlying, + reduction=reduction, + ) + ) + + # both provided + return Tensor( + _infinicore.binary_cross_entropy_with_logits( + input._underlying, + target._underlying, + weight._underlying, + pos_weight._underlying, + reduction, + ) + ) + + # In-place-style API with explicit out + if weight is None and pos_weight is None: + _infinicore.binary_cross_entropy_with_logits_( + out._underlying, + input._underlying, + target._underlying, + reduction=reduction, + ) + elif weight is not None and pos_weight is None: + _infinicore.binary_cross_entropy_with_logits_( + out._underlying, + input._underlying, + target._underlying, + weight._underlying, + reduction=reduction, + ) + elif weight is None and pos_weight is not None: + _infinicore.binary_cross_entropy_with_logits_( + out._underlying, + input._underlying, + target._underlying, + pos_weight=pos_weight._underlying, + reduction=reduction, + ) + else: + _infinicore.binary_cross_entropy_with_logits_( + out._underlying, + input._underlying, + target._underlying, + weight._underlying, + pos_weight._underlying, + reduction, + ) + + return out diff --git a/python/infinicore/ops/addcmul.py b/python/infinicore/ops/addcmul.py new file mode 100644 index 000000000..11e2feffa --- /dev/null +++ b/python/infinicore/ops/addcmul.py @@ -0,0 +1,24 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def addcmul(input, tensor1, tensor2, value=1.0, *, out=None): + if out is None: + return Tensor( + _infinicore.addcmul( + input._underlying, + tensor1._underlying, + tensor2._underlying, + float(value), + ) + ) + + _infinicore.addcmul_( + out._underlying, + input._underlying, + tensor1._underlying, + tensor2._underlying, + float(value), + ) + + return out \ No newline at end of file diff --git a/python/infinicore/ops/atanh.py b/python/infinicore/ops/atanh.py new file mode 100644 index 000000000..4008e0268 --- /dev/null +++ b/python/infinicore/ops/atanh.py @@ -0,0 +1,11 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + + +def atanh(input, *, out=None): + if out is None: + return Tensor(_infinicore.atanh(input._underlying)) + + _infinicore.atanh_(out._underlying, input._underlying) + + return out diff --git a/python/infinicore/ops/binary_cross_entropy_with_logits.py b/python/infinicore/ops/binary_cross_entropy_with_logits.py new file mode 100644 index 000000000..c75996548 --- /dev/null +++ b/python/infinicore/ops/binary_cross_entropy_with_logits.py @@ -0,0 +1,47 @@ +from infinicore.lib import _infinicore +from infinicore.tensor import Tensor + +def binary_cross_entropy_with_logits( + input, + target, + weight=None, + pos_weight=None, + reduction="mean", + *, + out=None +): + """ + input: Tensor (logits) + target: Tensor (labels) + weight: Tensor (optional, sample-wise weight) + pos_weight: Tensor (optional, 
diff --git a/python/infinicore/ops/cdist.py b/python/infinicore/ops/cdist.py
new file mode 100644
index 000000000..4d3dcee3b
--- /dev/null
+++ b/python/infinicore/ops/cdist.py
@@ -0,0 +1,35 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def cdist(x1, x2, p=2.0, *, out=None):
+    """Compute the p-norm distance between every pair of vectors from two sets.
+
+    Args:
+        x1 (Tensor): Input tensor of shape (M, D).
+        x2 (Tensor): Input tensor of shape (N, D).
+        p (float): Order of the norm (default: 2.0).
+        out (Tensor, optional): Output tensor for the result.
+
+    Returns:
+        Tensor: Distance matrix of shape (M, N).
+    """
+    if out is None:
+        # Out-of-place: the C++ layer infers the output shape from x1 and x2
+        # and allocates a new tensor.
+        return Tensor(
+            _infinicore.cdist(
+                x1._underlying,
+                x2._underlying,
+                float(p),
+            )
+        )
+
+    # Explicit-output path: write the result into the user-provided tensor.
+    _infinicore.cdist_(
+        out._underlying,
+        x1._underlying,
+        x2._underlying,
+        float(p),
+    )
+
+    return out
\ No newline at end of file
diff --git a/python/infinicore/ops/reciprocal.py b/python/infinicore/ops/reciprocal.py
new file mode 100644
index 000000000..989e5dc3a
--- /dev/null
+++ b/python/infinicore/ops/reciprocal.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def reciprocal(input, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.reciprocal(input._underlying))
+
+    _infinicore.reciprocal_(out._underlying, input._underlying)
+
+    return out
\ No newline at end of file
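The cdist wrapper treats `x1` as M vectors and `x2` as N vectors of a shared dimension D, and returns the (M, N) matrix of p-norm distances. A hedged usage sketch (creation calls illustrative):

```python
import infinicore

x1 = infinicore.empty((5, 16))  # M=5 vectors of dimension D=16
x2 = infinicore.empty((7, 16))  # N=7 vectors of the same dimension

d2 = infinicore.cdist(x1, x2)         # L2 distances, shape (5, 7)
d1 = infinicore.cdist(x1, x2, p=1.0)  # L1 (Manhattan) distances
```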
+#include "infinicore/ops/addcmul.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::addcmul_impl::infiniop { + +// 定义线程局部的算子描述符缓存 +thread_local common::OpCache caches( + 100, + [](infiniopAddcmulDescriptor_t &desc) { + if (desc != nullptr) { + INFINICORE_CHECK_ERROR(infiniopDestroyAddcmulDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor out, Tensor input, Tensor t1, Tensor t2, float value) { + size_t seed = hash_combine(out, input, t1, t2, value); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + auto desc_opt = cache.get(seed); + infiniopAddcmulDescriptor_t desc = nullptr; + + if (!desc_opt) { + INFINICORE_CHECK_ERROR(infiniopCreateAddcmulDescriptor( + context::getInfiniopHandle(device), &desc, + out->desc(), input->desc(), t1->desc(), t2->desc(), value)); + cache.put(seed, desc); + } else { + desc = *desc_opt; + } + + size_t workspace_size = 0; + INFINICORE_CHECK_ERROR(infiniopGetAddcmulWorkspaceSize(desc, &workspace_size)); + std::shared_ptr workspace = context::allocateMemory(workspace_size); + + INFINICORE_CHECK_ERROR(infiniopAddcmul( + desc, workspace->data(), workspace_size, + out->data(), input->data(), t1->data(), t2->data(), context::getStream())); +} + +static bool registered = []() { + Addcmul::dispatcher().registerAll(&calculate, false); + return true; +}(); + +} // namespace infinicore::op::addcmul_impl::infiniop \ No newline at end of file diff --git a/src/infinicore/ops/atanh/atanh.cc b/src/infinicore/ops/atanh/atanh.cc new file mode 100644 index 000000000..a03686759 --- /dev/null +++ b/src/infinicore/ops/atanh/atanh.cc @@ -0,0 +1,37 @@ +#include "infinicore/ops/atanh.hpp" +#include "../../utils.hpp" + +namespace infinicore::op { + +// 获取单例分发器 +common::OpDispatcher &Atanh::dispatcher() { + static common::OpDispatcher dispatcher_; + return dispatcher_; +}; + +// 执行入口:负责设备切换和后端查找 +void Atanh::execute(Tensor y, Tensor a) { + // 确保输入和输出在同一个设备上 + INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, a); + + // 切换当前上下文到目标设备 + infinicore::context::setDevice(y->device()); + + // 根据设备类型(CPU/CUDA等)查找对应的实现并执行 + dispatcher().lookup(y->device().getType())(y, a); +} + +// Out-of-place 接口:自动创建结果 Tensor +Tensor atanh(Tensor a) { + // 创建一个与输入形状、类型、设备完全相同的空 Tensor + auto y = Tensor::empty(a->shape(), a->dtype(), a->device()); + atanh_(y, a); + return y; +} + +// In-place 或指定输出接口 +void atanh_(Tensor y, Tensor a) { + Atanh::execute(y, a); +} + +} // namespace infinicore::op \ No newline at end of file diff --git a/src/infinicore/ops/atanh/atanh_infiniop.cc b/src/infinicore/ops/atanh/atanh_infiniop.cc new file mode 100644 index 000000000..00e7f72c3 --- /dev/null +++ b/src/infinicore/ops/atanh/atanh_infiniop.cc @@ -0,0 +1,58 @@ +#include "../../utils.hpp" +#include "infinicore/common/hash.hpp" +#include "infinicore/ops/atanh.hpp" +#include "infinicore/ops/common/cache.hpp" +#include + +namespace infinicore::op::atanh_impl::infiniop { + +// 定义线程局部的算子描述符缓存,避免重复创建 Descriptor 带来的开销 +thread_local common::OpCache caches( + 100, // 缓存容量 + [](infiniopAtanhDescriptor_t &desc) { + if (desc != nullptr) { + // 缓存释放时的回调:销毁 infiniop 算子描述符 + INFINICORE_CHECK_ERROR(infiniopDestroyAtanhDescriptor(desc)); + desc = nullptr; + } + }); + +void calculate(Tensor y, Tensor a) { + // 1. 根据 Tensor 的形状、步长、类型等信息生成唯一 Hash 值 + size_t seed = hash_combine(y, a); + + auto device = context::getDevice(); + auto &cache = caches.getCache(device); + + // 2. 
diff --git a/src/infinicore/ops/atanh/atanh_infiniop.cc b/src/infinicore/ops/atanh/atanh_infiniop.cc
new file mode 100644
index 000000000..00e7f72c3
--- /dev/null
+++ b/src/infinicore/ops/atanh/atanh_infiniop.cc
@@ -0,0 +1,58 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/atanh.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::atanh_impl::infiniop {
+
+// Thread-local descriptor cache; avoids the cost of re-creating descriptors.
+thread_local common::OpCache<size_t, infiniopAtanhDescriptor_t> caches(
+    100, // cache capacity
+    [](infiniopAtanhDescriptor_t &desc) {
+        if (desc != nullptr) {
+            // Eviction callback: destroy the infiniop descriptor.
+            INFINICORE_CHECK_ERROR(infiniopDestroyAtanhDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor a) {
+    // 1. Hash the tensors' shape, stride, and dtype information into a unique key.
+    size_t seed = hash_combine(y, a);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    // 2. Try to fetch an existing descriptor from the cache.
+    auto desc_opt = cache.get(seed);
+    infiniopAtanhDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // Cache miss: create a new descriptor.
+        INFINICORE_CHECK_ERROR(infiniopCreateAtanhDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), a->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 3. Query and allocate the required workspace (if any).
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetAtanhWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // 4. Run the underlying kernel.
+    INFINICORE_CHECK_ERROR(infiniopAtanh(
+        desc, workspace->data(), workspace_size,
+        y->data(), a->data(), context::getStream()));
+}
+
+// 5. Self-registration: register this implementation with the dispatcher at startup.
+static bool registered = []() {
+    Atanh::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::atanh_impl::infiniop
\ No newline at end of file
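Every `*_infiniop.cc` backend in this PR follows the same caching pattern: hash the tensor metadata and scalar attributes into a key, reuse a cached descriptor on a hit, and create and insert one on a miss. A language-agnostic illustration of that flow, written in Python for brevity (the names here are illustrative stand-ins, not the C++ API):

```python
from functools import lru_cache


def make_key(*tensors, **attrs):
    # Mirrors hash_combine: dtype and shape per tensor, plus scalar attributes.
    parts = tuple((t["dtype"], tuple(t["shape"])) for t in tensors)
    return (parts, tuple(sorted(attrs.items())))


@lru_cache(maxsize=100)  # the C++ cache is likewise bounded (capacity 100)
def get_descriptor(key):
    print("cache miss: creating descriptor for", key)
    return object()  # stand-in for infiniopCreate*Descriptor


t = {"dtype": "f32", "shape": [4, 4]}
d1 = get_descriptor(make_key(t, t, p=2.0))
d2 = get_descriptor(make_key(t, t, p=2.0))  # cache hit: no re-creation
assert d1 is d2
```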
diff --git a/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.cc b/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.cc
new file mode 100644
index 000000000..fe3720525
--- /dev/null
+++ b/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.cc
@@ -0,0 +1,63 @@
+#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+// Static dispatcher instance.
+common::OpDispatcher<BinaryCrossEntropyWithLogits::schema> &BinaryCrossEntropyWithLogits::dispatcher() {
+    static common::OpDispatcher<BinaryCrossEntropyWithLogits::schema> dispatcher_;
+    return dispatcher_;
+}
+
+/**
+ * Core execution: device validation, context setup, and backend dispatch.
+ */
+void BinaryCrossEntropyWithLogits::execute(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
+    // 1. Check that every defined tensor lives on the same device, including
+    //    the optional weight/pos_weight.
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, logits, target);
+    if (weight.is_defined()) {
+        INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, weight);
+    }
+    if (pos_weight.is_defined()) {
+        INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, pos_weight);
+    }
+
+    // 2. Set the current device context.
+    infinicore::context::setDevice(out->device());
+
+    // 3. Look up and run the backend implementation (CPU, CUDA, ...) for
+    //    this device type.
+    dispatcher().lookup(out->device().getType())(out, logits, target, weight, pos_weight, reduction);
+}
+
+/**
+ * Out-of-place interface: allocates the output tensor according to `reduction`.
+ */
+Tensor binary_cross_entropy_with_logits(Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
+    std::vector<Size> out_shape;
+
+    // 1. Determine the output shape from the reduction mode.
+    if (reduction == "none") {
+        // No reduction: same shape as logits.
+        for (auto dim : logits->shape()) {
+            out_shape.push_back(static_cast<Size>(dim));
+        }
+    } else {
+        // mean/sum reduction: scalar output (an empty shape vector means a
+        // 0-dim tensor).
+        out_shape = {};
+    }
+
+    // 2. Allocate the output tensor.
+    auto out = Tensor::empty(out_shape, logits->dtype(), logits->device());
+
+    // 3. Run through the explicit-output interface.
+    binary_cross_entropy_with_logits_(out, logits, target, weight, pos_weight, reduction);
+
+    return out;
+}
+
+/**
+ * Explicit-output interface.
+ */
+void binary_cross_entropy_with_logits_(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction) {
+    BinaryCrossEntropyWithLogits::execute(out, logits, target, weight, pos_weight, reduction);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits_infiniop.cc b/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits_infiniop.cc
new file mode 100644
index 000000000..4403c3c28
--- /dev/null
+++ b/src/infinicore/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits_infiniop.cc
@@ -0,0 +1,99 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+#include <stdexcept>
+
+namespace infinicore::op::bce_logits_impl::infiniop {
+
+// Thread-local cache of BCEWithLogits operator descriptors.
+thread_local common::OpCache<size_t, infiniopBCEWithLogitsDescriptor_t> caches(
+    100,
+    [](infiniopBCEWithLogitsDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyBCEWithLogitsDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+/**
+ * @brief Run the BCEWithLogits computation.
+ * @param out Output tensor (a scalar or the same shape as logits, depending on reduction)
+ * @param logits Prediction tensor
+ * @param target Label tensor
+ * @param weight Per-sample weight tensor (optional)
+ * @param pos_weight Positive-class weight tensor (optional)
+ * @param reduction_str Reduction mode ("none", "mean", "sum")
+ */
+void calculate(Tensor out, Tensor logits, Tensor target, Tensor weight, Tensor pos_weight, std::string reduction_str) {
+    // 1. Map the reduction string onto the enum used by the low-level API.
+    infiniopReduction_t reduction;
+    if (reduction_str == "none") {
+        reduction = INFINIOP_REDUCTION_NONE;
+    } else if (reduction_str == "mean") {
+        reduction = INFINIOP_REDUCTION_MEAN;
+    } else if (reduction_str == "sum") {
+        reduction = INFINIOP_REDUCTION_SUM;
+    } else {
+        throw std::runtime_error("Unknown reduction mode: " + reduction_str);
+    }
+
+    // 2. Build the cache key. It covers the state of every input tensor plus
+    //    the reduction mode, so distinct configurations never collide.
+    size_t seed = hash_combine(out, logits, target, weight, pos_weight, static_cast<size_t>(reduction));
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopBCEWithLogitsDescriptor_t desc = nullptr;
+
+    // 3. On a cache miss, create a new descriptor and cache it.
+    if (!desc_opt) {
+        // Descriptors of the optional tensors; nullptr when undefined.
+        auto weight_desc = weight.is_defined() ? weight->desc() : nullptr;
+        auto pos_weight_desc = pos_weight.is_defined() ? pos_weight->desc() : nullptr;
+
+        INFINICORE_CHECK_ERROR(infiniopCreateBCEWithLogitsDescriptor(
+            context::getInfiniopHandle(device),
+            &desc,
+            out->desc(),
+            logits->desc(),
+            target->desc(),
+            weight_desc,
+            pos_weight_desc,
+            reduction));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 4. Query and allocate the workspace.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetBCEWithLogitsWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // 5. Data pointers; optional tensors map to nullptr.
+    const void *weight_ptr = weight.is_defined() ? weight->data() : nullptr;
+    const void *pos_weight_ptr = pos_weight.is_defined() ? pos_weight->data() : nullptr;
+
+    // 6. Run the underlying kernel.
+    INFINICORE_CHECK_ERROR(infiniopBCEWithLogits(
+        desc,
+        workspace->data(),
+        workspace_size,
+        out->data(),
+        logits->data(),
+        target->data(),
+        weight_ptr,
+        pos_weight_ptr,
+        context::getStream()));
+}
+
+// 7. Self-register with the dispatcher.
+static bool registered = []() {
+    BinaryCrossEntropyWithLogits::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::bce_logits_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/ops/cdist/cdist.cc b/src/infinicore/ops/cdist/cdist.cc
new file mode 100644
index 000000000..b351a25d6
--- /dev/null
+++ b/src/infinicore/ops/cdist/cdist.cc
@@ -0,0 +1,57 @@
+#include "infinicore/ops/cdist.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+// Static dispatcher instance.
+common::OpDispatcher<Cdist::schema> &Cdist::dispatcher() {
+    static common::OpDispatcher<Cdist::schema> dispatcher_;
+    return dispatcher_;
+}
+
+/**
+ * Core execution: device validation and backend dispatch.
+ */
+void Cdist::execute(Tensor out, Tensor x1, Tensor x2, double p) {
+    // Check that all three tensors live on the same device.
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, x1, x2);
+
+    // Set the current device context.
+    infinicore::context::setDevice(out->device());
+
+    // Look up and run the implementation registered for this device type
+    // (CUDA/CPU/etc.).
+    dispatcher().lookup(out->device().getType())(out, x1, x2, p);
+}
+
+/**
+ * Out-of-place interface: allocates the output tensor.
+ * x1: (M, D), x2: (N, D) -> out: (M, N)
+ */
+Tensor cdist(Tensor x1, Tensor x2, double p) {
+    auto shape1 = x1->shape(); // (M, D)
+    auto shape2 = x2->shape(); // (N, D)
+
+    std::vector<Size> out_shape = {
+        static_cast<Size>(shape1[0]),
+        static_cast<Size>(shape2[0]),
+    };
+
+    auto out = Tensor::empty(out_shape, x1->dtype(), x1->device());
+
+    cdist_(out, x1, x2, p);
+
+    return out;
+}
+
+/**
+ * Explicit-output interface.
+ */
+void cdist_(Tensor out, Tensor x1, Tensor x2, double p) {
+    Cdist::execute(out, x1, x2, p);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/cdist/cdist_infiniop.cc b/src/infinicore/ops/cdist/cdist_infiniop.cc
new file mode 100644
index 000000000..488f90815
--- /dev/null
+++ b/src/infinicore/ops/cdist/cdist_infiniop.cc
@@ -0,0 +1,66 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/cdist.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::cdist_impl::infiniop {
+
+// Thread-local cache of cdist operator descriptors. The cache key hashes the
+// tensor descriptions together with the parameter p.
+thread_local common::OpCache<size_t, infiniopCdistDescriptor_t> caches(
+    100,
+    [](infiniopCdistDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyCdistDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor out, Tensor x1, Tensor x2, double p) {
+    // 1. Build the cache key.
+    size_t seed = hash_combine(out, x1, x2, p);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopCdistDescriptor_t desc = nullptr;
+
+    // 2. On a cache miss, create a new descriptor and cache it.
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateCdistDescriptor(
+            context::getInfiniopHandle(device),
+            &desc,
+            out->desc(),
+            x1->desc(),
+            x2->desc(),
+            p));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // 3. Query and allocate the workspace.
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetCdistWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    // 4. Run the underlying kernel.
+    INFINICORE_CHECK_ERROR(infiniopCdist(
+        desc,
+        workspace->data(),
+        workspace_size,
+        out->data(),
+        x1->data(),
+        x2->data(),
+        context::getStream()));
+}
+
+// 5. Self-register with the dispatcher.
+static bool registered = []() {
+    Cdist::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::cdist_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/ops/reciprocal/reciprocal.cc b/src/infinicore/ops/reciprocal/reciprocal.cc
new file mode 100644
index 000000000..4ce526fd7
--- /dev/null
+++ b/src/infinicore/ops/reciprocal/reciprocal.cc
@@ -0,0 +1,27 @@
+#include "infinicore/ops/reciprocal.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Reciprocal::schema> &Reciprocal::dispatcher() {
+    static common::OpDispatcher<Reciprocal::schema> dispatcher_;
+    return dispatcher_;
+}
+
+void Reciprocal::execute(Tensor y, Tensor x) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+    infinicore::context::setDevice(y->device());
+    dispatcher().lookup(y->device().getType())(y, x);
+}
+
+Tensor reciprocal(Tensor x) {
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    reciprocal_(y, x);
+    return y;
+}
+
+void reciprocal_(Tensor y, Tensor x) {
+    Reciprocal::execute(y, x);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/reciprocal/reciprocal_infiniop.cc b/src/infinicore/ops/reciprocal/reciprocal_infiniop.cc
new file mode 100644
index 000000000..47481679b
--- /dev/null
+++ b/src/infinicore/ops/reciprocal/reciprocal_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/reciprocal.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::reciprocal_impl::infiniop {
+
+thread_local common::OpCache<size_t, infiniopReciprocalDescriptor_t> caches(
+    100, // capacity
+    [](infiniopReciprocalDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyReciprocalDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x) {
+    size_t seed = hash_combine(y, x);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopReciprocalDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateReciprocalDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), x->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetReciprocalWorkspaceSize(desc, &workspace_size));
+    auto workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopReciprocal(
+        desc, workspace->data(), workspace_size,
+        y->data(), x->data(), context::getStream()));
+}
+
+static bool registered = []() {
+    Reciprocal::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::reciprocal_impl::infiniop
\ No newline at end of file
diff --git a/src/infinicore/pybind11/infinicore.cc b/src/infinicore/pybind11/infinicore.cc
index 152adefd9..225a4a6f6 100644
--- a/src/infinicore/pybind11/infinicore.cc
+++ b/src/infinicore/pybind11/infinicore.cc
@@ -16,8 +16,8 @@ PYBIND11_MODULE(_infinicore, m) {
     device::bind(m);
     device_event::bind(m);
     dtype::bind(m);
-    ops::bind(m);
     tensor::bind(m);
+    ops::bind(m);
 }
 
 } // namespace infinicore
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 978defa17..b45d21ad0 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -16,6 +16,11 @@
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
+#include "ops/atanh.hpp"
+#include "ops/addcmul.hpp"
+#include "ops/cdist.hpp"
+#include "ops/binary_cross_entropy_with_logits.hpp"
+#include "ops/reciprocal.hpp"
 
 namespace py = pybind11;
 
 namespace infinicore::ops {
@@ -34,6 +39,11 @@ inline void bind(py::module &m) {
     bind_swiglu(m);
     bind_rope(m);
     bind_embedding(m);
+    bind_atanh(m);
+    bind_addcmul(m);
+    bind_cdist(m);
+    bind_binary_cross_entropy_with_logits(m);
+    bind_reciprocal(m);
 }
 
 } // namespace infinicore::ops
diff --git a/src/infinicore/pybind11/ops/addcmul.hpp b/src/infinicore/pybind11/ops/addcmul.hpp
new file mode 100644
index 000000000..e46ba81ec
--- /dev/null
+++ b/src/infinicore/pybind11/ops/addcmul.hpp
@@ -0,0 +1,51 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/addcmul.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_addcmul(py::module &m) {
+    // Bind the out-of-place interface: out = addcmul(input, t1, t2, value)
+    m.def("addcmul",
+          &op::addcmul,
+          py::arg("input"),
+          py::arg("tensor1"),
+          py::arg("tensor2"),
+          py::arg("value") = 1.0f,
+          R"doc(Performs the element-wise multiplication of tensor1 by tensor2,
+multiplies the result by value and adds it to input.
+
+Args:
+    input: Tensor to be added
+    tensor1: First tensor for multiplication
+    tensor2: Second tensor for multiplication
+    value: Scalar multiplier for tensor1 * tensor2 (default: 1.0)
+
+Returns:
+    The output tensor
+)doc");
+
+    // Bind the explicit-output interface: addcmul_(out, input, t1, t2, value)
+    m.def("addcmul_",
+          &op::addcmul_,
+          py::arg("out"),
+          py::arg("input"),
+          py::arg("tensor1"),
+          py::arg("tensor2"),
+          py::arg("value") = 1.0f,
+          R"doc(In-place version of addcmul.
+
+Args:
+    out: The destination tensor to store the result
+    input: Tensor to be added
+    tensor1: First tensor for multiplication
+    tensor2: Second tensor for multiplication
+    value: Scalar multiplier for tensor1 * tensor2 (default: 1.0)
+)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/atanh.hpp b/src/infinicore/pybind11/ops/atanh.hpp
new file mode 100644
index 000000000..9db33def7
--- /dev/null
+++ b/src/infinicore/pybind11/ops/atanh.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/atanh.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_atanh(py::module &m) {
+    m.def("atanh",
+          &op::atanh,
+          py::arg("a"),
+          R"doc(Inverse hyperbolic tangent of a tensor.)doc");
+
+    m.def("atanh_",
+          &op::atanh_,
+          py::arg("y"),
+          py::arg("a"),
+          R"doc(Compute inverse hyperbolic tangent and store in the provided output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/binary_cross_entropy_with_logits.hpp b/src/infinicore/pybind11/ops/binary_cross_entropy_with_logits.hpp
new file mode 100644
index 000000000..25ddd802d
--- /dev/null
+++ b/src/infinicore/pybind11/ops/binary_cross_entropy_with_logits.hpp
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/binary_cross_entropy_with_logits.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_binary_cross_entropy_with_logits(py::module &m) {
+    // Bind the out-of-place interface: out = binary_cross_entropy_with_logits(...)
+    m.def("binary_cross_entropy_with_logits",
+          &op::binary_cross_entropy_with_logits,
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("weight") = Tensor(),     // defaults to an undefined Tensor
+          py::arg("pos_weight") = Tensor(), // defaults to an undefined Tensor
+          py::arg("reduction") = "mean",    // default reduction: mean
+          R"doc(Measures Binary Cross Entropy between target and output logits.
+
+Args:
+    input: Tensor of arbitrary shape as unnormalized scores (logits).
+    target: Tensor of the same shape as input with values between 0 and 1.
+    weight: Optional rescaling weight for each loss component.
+    pos_weight: Optional weight for positive examples (must be broadcastable).
+    reduction: Specifies the reduction to apply: 'none' | 'mean' | 'sum'.
+
+Returns:
+    A tensor representing the loss.
+)doc");
+
+    // Bind the explicit-output interface: binary_cross_entropy_with_logits_(out, ...)
+    m.def("binary_cross_entropy_with_logits_",
+          &op::binary_cross_entropy_with_logits_,
+          py::arg("out"),
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("weight") = Tensor(),
+          py::arg("pos_weight") = Tensor(),
+          py::arg("reduction") = "mean",
+          R"doc(Explicit-output version of binary_cross_entropy_with_logits.
+
+Args:
+    out: The destination tensor to store the loss.
+    input: Logits tensor.
+    target: Target tensor.
+    weight: Optional sample weight.
+    pos_weight: Optional positive class weight.
+    reduction: Specifies the reduction to apply.
+)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/cdist.hpp b/src/infinicore/pybind11/ops/cdist.hpp
new file mode 100644
index 000000000..78ef18529
--- /dev/null
+++ b/src/infinicore/pybind11/ops/cdist.hpp
@@ -0,0 +1,46 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/cdist.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_cdist(py::module &m) {
+    // Bind the out-of-place interface: out = cdist(x1, x2, p)
+    m.def("cdist",
+          &op::cdist,
+          py::arg("x1"),
+          py::arg("x2"),
+          py::arg("p") = 2.0,
+          R"doc(Computes batched pairwise distance between vectors in x1 and x2 using the p-norm.
+
+Args:
+    x1: First set of vectors, shape (M, D)
+    x2: Second set of vectors, shape (N, D)
+    p: The p-norm to apply (default: 2.0)
+
+Returns:
+    A matrix containing pairwise distances, shape (M, N)
+)doc");
+
+    // Bind the explicit-output interface: cdist_(out, x1, x2, p)
+    m.def("cdist_",
+          &op::cdist_,
+          py::arg("out"),
+          py::arg("x1"),
+          py::arg("x2"),
+          py::arg("p") = 2.0,
+          R"doc(Explicit-output version of cdist. Stores the results in the 'out' tensor.
+
+Args:
+    out: The destination tensor, shape (M, N)
+    x1: First set of vectors, shape (M, D)
+    x2: Second set of vectors, shape (N, D)
+    p: The p-norm to apply (default: 2.0)
+)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/reciprocal.hpp b/src/infinicore/pybind11/ops/reciprocal.hpp
new file mode 100644
index 000000000..dec6472cc
--- /dev/null
+++ b/src/infinicore/pybind11/ops/reciprocal.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/reciprocal.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_reciprocal(py::module &m) {
+    m.def("reciprocal",
+          &op::reciprocal,
+          py::arg("x"),
+          R"doc(Computes the reciprocal of the input tensor.)doc");
+
+    m.def("reciprocal_",
+          &op::reciprocal_,
+          py::arg("y"),
+          py::arg("x"),
+          R"doc(Computes the reciprocal of the input tensor and stores it in the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infiniop-test/include/ops.hpp b/src/infiniop-test/include/ops.hpp
index 12469d780..2490a8291 100644
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -13,6 +13,11 @@ DECLARE_INFINIOP_TEST(rope)
 DECLARE_INFINIOP_TEST(clip)
 DECLARE_INFINIOP_TEST(swiglu)
 DECLARE_INFINIOP_TEST(add)
+DECLARE_INFINIOP_TEST(atanh)
+DECLARE_INFINIOP_TEST(addcmul)
+DECLARE_INFINIOP_TEST(cdist)
+DECLARE_INFINIOP_TEST(binary_cross_entropy_with_logits)
+DECLARE_INFINIOP_TEST(reciprocal)
 DECLARE_INFINIOP_TEST(causal_softmax)
 DECLARE_INFINIOP_TEST(rearrange)
 DECLARE_INFINIOP_TEST(silu)
diff --git a/src/infiniop-test/src/ops/addcmul.cpp b/src/infiniop-test/src/ops/addcmul.cpp
new file mode 100644
index 000000000..7b1b40ea9
--- /dev/null
+++ b/src/infiniop-test/src/ops/addcmul.cpp
@@ -0,0 +1,143 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <sstream>
+
+namespace infiniop_test::addcmul {
+
+struct Test::Attributes {
+    std::shared_ptr<Tensor> input;
+    std::shared_ptr<Tensor> t1;
+    std::shared_ptr<Tensor> t2;
+    std::shared_ptr<Tensor> out;
+    std::shared_ptr<Tensor> ans;
+    float value;
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+
+    // Check that all required tensors are present.
+    if (tensors.find("input") == tensors.end() ||
+        tensors.find("t1") == tensors.end() ||
+        tensors.find("t2") == tensors.end() ||
+        tensors.find("out") == tensors.end() ||
+        tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Addcmul Test: Missing tensors");
+    }
+
+    // Read the scalar attribute `value`.
+    test->_attributes->value = 1.0f; // default
+    if (attributes.find("value") != attributes.end()) {
+        test->_attributes->value = *reinterpret_cast<float *>(attributes["value"].data());
+    }
+
+    test->_attributes->input = tensors["input"];
+    test->_attributes->t1 = tensors["t1"];
+    test->_attributes->t2 = tensors["t2"];
+    test->_attributes->out = tensors["out"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+
+    infiniopAddcmulDescriptor_t op_desc;
+
+    // Move data to the target device.
+    auto input = _attributes->input->to(device, device_id);
+    auto t1 = _attributes->t1->to(device, device_id);
+    auto t2 = _attributes->t2->to(device, device_id);
+    auto out = _attributes->out->to(device, device_id);
+
+    // Create the operator descriptor.
+    CHECK_OR(infiniopCreateAddcmulDescriptor(handle, &op_desc,
+                                             out->desc(),
+                                             input->desc(),
+                                             t1->desc(),
+                                             t2->desc(),
+                                             _attributes->value),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create addcmul descriptor."));
+
+    // Workspace handling.
+    size_t workspace_size;
+    CHECK_OR(infiniopGetAddcmulWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+
+    // Run the computation.
+    CHECK_OR(infiniopAddcmul(op_desc, workspace, workspace_size,
+                             out->data(),
+                             input->data(),
+                             t1->data(),
+                             t2->data(),
+                             nullptr), // stream
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    // Verify the result.
+    try {
+        allClose(out, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    // Benchmark.
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopAddcmul(op_desc, workspace, workspace_size,
+                            out->data(),
+                            input->data(),
+                            t1->data(),
+                            t2->data(),
+                            nullptr);
+        },
+        warm_ups, iterations);
+
+    // Clean up.
+    infinirtFree(workspace);
+    infiniopDestroyAddcmulDescriptor(op_desc);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {"value"};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"input", "t1", "t2", "out", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"out"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- value: " << _attributes->value << std::endl;
+    oss << "- input: " << _attributes->input->info() << std::endl;
+    oss << "- t1: " << _attributes->t1->info() << std::endl;
+    oss << "- t2: " << _attributes->t2->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::addcmul
\ No newline at end of file
diff --git a/src/infiniop-test/src/ops/atanh.cpp b/src/infiniop-test/src/ops/atanh.cpp
new file mode 100644
index 000000000..35b647814
--- /dev/null
+++ b/src/infiniop-test/src/ops/atanh.cpp
@@ -0,0 +1,116 @@
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <sstream>
+
+namespace infiniop_test::atanh {
+struct Test::Attributes {
+    std::shared_ptr<Tensor> a;   // input
+    std::shared_ptr<Tensor> y;   // output
+    std::shared_ptr<Tensor> ans; // reference result
+};
+
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+
+    // atanh only needs a (input), y (output), and ans (reference).
+    if (tensors.find("a") == tensors.end()
+        || tensors.find("y") == tensors.end()
+        || tensors.find("ans") == tensors.end()) {
+        throw std::runtime_error("Invalid Atanh Test: Missing tensors.");
+    }
+
+    test->_attributes->a = tensors["a"];
+    test->_attributes->y = tensors["y"];
+    test->_attributes->ans = tensors["ans"];
+
+    return test;
+}
+
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+
+    infiniopAtanhDescriptor_t op_desc;
+    auto a = _attributes->a->to(device, device_id);
+    auto y = _attributes->y->to(device, device_id);
+
+    // Create the descriptor; the interface takes (handle, desc, y, a).
+    CHECK_OR(infiniopCreateAtanhDescriptor(handle, &op_desc,
+                                           y->desc(),
+                                           a->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create atanh descriptor."));
+
+    size_t workspace_size;
+    CHECK_OR(infiniopGetAtanhWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+
+    void *workspace;
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+
+    // Run the computation.
+    CHECK_OR(infiniopAtanh(op_desc, workspace, workspace_size,
+                           y->data(),
+                           a->data(),
+                           nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during atanh execution."));
+
+    // Verify the result.
+    try {
+        allClose(y, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    // Benchmark.
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopAtanh(
+                op_desc, workspace, workspace_size,
+                y->data(),
+                a->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+
+    // Clean up, matching the other operator tests.
+    infinirtFree(workspace);
+    infiniopDestroyAtanhDescriptor(op_desc);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"a", "y", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"y"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- a: " << _attributes->a->info() << std::endl;
+    oss << "- y: " << _attributes->y->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::atanh
\ No newline at end of file
tensors["weight"] : nullptr; + test->_attributes->pos_weight = tensors.count("pos_weight") ? tensors["pos_weight"] : nullptr; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + + infiniopBCEWithLogitsDescriptor_t op_desc; + + // 4. 数据迁移 + auto logits = _attributes->logits->to(device, device_id); + auto target = _attributes->target->to(device, device_id); + auto out = _attributes->out->to(device, device_id); + + // 处理可选张量迁移 + std::shared_ptr weight = (_attributes->weight) ? _attributes->weight->to(device, device_id) : nullptr; + std::shared_ptr pos_weight = (_attributes->pos_weight) ? _attributes->pos_weight->to(device, device_id) : nullptr; + + // 5. 创建描述符 (注意处理 NULL 描述符) + auto w_desc = weight ? weight->desc() : nullptr; + auto pw_desc = pos_weight ? pos_weight->desc() : nullptr; + + CHECK_OR(infiniopCreateBCEWithLogitsDescriptor(handle, &op_desc, + out->desc(), + logits->desc(), + target->desc(), + w_desc, + pw_desc, + static_cast(_attributes->reduction)), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create BCE descriptor.")); + + // 6. Workspace 管理 + size_t workspace_size; + CHECK_OR(infiniopGetBCEWithLogitsWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + // 7. 执行计算 + auto w_data = weight ? weight->data() : nullptr; + auto pw_data = pos_weight ? pos_weight->data() : nullptr; + + CHECK_OR(infiniopBCEWithLogits(op_desc, workspace, workspace_size, + out->data(), + logits->data(), + target->data(), + w_data, + pw_data, + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + // 8. 结果验证 + try { + allClose(out, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 9. 性能 Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopBCEWithLogits(op_desc, workspace, workspace_size, + out->data(), logits->data(), target->data(), + w_data, pw_data, nullptr); + }, + warm_ups, iterations); + + // 10. 
资源清理 + infinirtFree(workspace); + infiniopDestroyBCEWithLogitsDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"reduction"}; +} + +std::vector Test::tensor_names() { + return {"logits", "target", "weight", "pos_weight", "out", "ans"}; +} + +std::vector Test::output_names() { + return {"out"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- reduction: " << _attributes->reduction << std::endl; + oss << "- logits: " << _attributes->logits->info() << std::endl; + if (_attributes->weight) oss << "- weight: " << _attributes->weight->info() << std::endl; + oss << "- out: " << _attributes->out->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::binary_cross_entropy_with_logits \ No newline at end of file diff --git a/src/infiniop-test/src/ops/cdist.cpp b/src/infiniop-test/src/ops/cdist.cpp new file mode 100644 index 000000000..3ffb44fb5 --- /dev/null +++ b/src/infiniop-test/src/ops/cdist.cpp @@ -0,0 +1,135 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::cdist { + +struct Test::Attributes { + std::shared_ptr x1; + std::shared_ptr x2; + std::shared_ptr out; + std::shared_ptr ans; + double p; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + + // 1. 校验张量是否存在 (x1, x2, out, ans) + if (tensors.find("x1") == tensors.end() || + tensors.find("x2") == tensors.end() || + tensors.find("out") == tensors.end() || + tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Cdist Test: Missing tensors"); + } + + // 2. 获取标量属性 p (注意 cdist 通常用 double) + test->_attributes->p = 2.0; // 默认值 + if (attributes.find("p") != attributes.end()) { + test->_attributes->p = *reinterpret_cast(attributes["p"].data()); + } + + test->_attributes->x1 = tensors["x1"]; + test->_attributes->x2 = tensors["x2"]; + test->_attributes->out = tensors["out"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + + infiniopCdistDescriptor_t op_desc; + + // 3. 数据迁移至指定设备 (M x D, N x D) + auto x1 = _attributes->x1->to(device, device_id); + auto x2 = _attributes->x2->to(device, device_id); + auto out = _attributes->out->to(device, device_id); + + // 4. 创建算子描述符 + CHECK_OR(infiniopCreateCdistDescriptor(handle, &op_desc, + out->desc(), + x1->desc(), + x2->desc(), + _attributes->p), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create cdist descriptor.")); + + // 5. Workspace 动态内存分配 + size_t workspace_size; + CHECK_OR(infiniopGetCdistWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + // 6. 执行计算 (计算 M x N 距离矩阵) + CHECK_OR(infiniopCdist(op_desc, workspace, workspace_size, + out->data(), + x1->data(), + x2->data(), + nullptr), // stream + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + // 7. 
结果数值验证 + try { + allClose(out, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + // 8. 性能 Benchmark + double elapsed_time = benchmark( + [=]() { + infiniopCdist(op_desc, workspace, workspace_size, + out->data(), + x1->data(), + x2->data(), + nullptr); + }, + warm_ups, iterations); + + // 9. 资源清理 + infinirtFree(workspace); + infiniopDestroyCdistDescriptor(op_desc); + + return TEST_PASSED(elapsed_time); +} + +std::vector Test::attribute_names() { + return {"p"}; +} + +std::vector Test::tensor_names() { + return {"x1", "x2", "out", "ans"}; +} + +std::vector Test::output_names() { + return {"out"}; +} + +std::string Test::toString() const { + std::ostringstream oss; + oss << op_name() << std::endl; + oss << "- p: " << _attributes->p << std::endl; + oss << "- x1: " << _attributes->x1->info() << std::endl; + oss << "- x2: " << _attributes->x2->info() << std::endl; + oss << "- out: " << _attributes->out->info() << std::endl; + oss << std::scientific << std::setprecision(2); + oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl; + return oss.str(); +} + +Test::~Test() { + delete _attributes; +} + +} // namespace infiniop_test::cdist \ No newline at end of file diff --git a/src/infiniop-test/src/ops/reciprocal.cpp b/src/infiniop-test/src/ops/reciprocal.cpp new file mode 100644 index 000000000..a2a095362 --- /dev/null +++ b/src/infiniop-test/src/ops/reciprocal.cpp @@ -0,0 +1,104 @@ +#include "ops.hpp" +#include "utils.hpp" +#include +#include +#include + +namespace infiniop_test::reciprocal { +struct Test::Attributes { + std::shared_ptr x; + std::shared_ptr y; + std::shared_ptr ans; +}; + +std::shared_ptr Test::build( + std::unordered_map> attributes, + std::unordered_map> tensors, + double rtol, double atol) { + auto test = std::shared_ptr(new Test(rtol, atol)); + test->_attributes = new Attributes(); + if (tensors.find("x") == tensors.end() + || tensors.find("y") == tensors.end() + || tensors.find("ans") == tensors.end()) { + throw std::runtime_error("Invalid Test"); + } + + test->_attributes->x = tensors["x"]; + test->_attributes->y = tensors["y"]; + test->_attributes->ans = tensors["ans"]; + + return test; +} + +std::shared_ptr Test::run( + infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) { + infiniopReciprocalDescriptor_t op_desc; + auto x = _attributes->x->to(device, device_id); + auto y = _attributes->y->to(device, device_id); + CHECK_OR(infiniopCreateReciprocalDescriptor(handle, &op_desc, + y->desc(), + x->desc()), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor.")); + + size_t workspace_size; + CHECK_OR(infiniopGetReciprocalWorkspaceSize(op_desc, &workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size.")); + + void *workspace; + CHECK_OR(infinirtMalloc(&workspace, workspace_size), + return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace.")); + + CHECK_OR(infiniopReciprocal(op_desc, workspace, workspace_size, + y->data(), + x->data(), + nullptr), + return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution.")); + + try { + allClose(y, _attributes->ans, _rtol, _atol); + } catch (const std::exception &e) { + return TEST_FAILED(RESULT_INCORRECT, e.what()); + } + + double elapsed_time = 0.; + + elapsed_time = benchmark( + [=]() { + infiniopReciprocal( + op_desc, workspace, workspace_size, + y->data(), + x->data(), + nullptr); + }, + warm_ups, iterations); + + return 
+}
+
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+
+std::vector<std::string> Test::tensor_names() {
+    return {"x", "y", "ans"};
+}
+
+std::vector<std::string> Test::output_names() {
+    return {"y"};
+}
+
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << std::endl;
+    oss << "- x: " << _attributes->x->info() << std::endl;
+    oss << "- y: " << _attributes->y->info() << std::endl;
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
+    return oss.str();
+}
+
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::reciprocal
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc b/src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc
new file mode 100644
index 000000000..d79b53136
--- /dev/null
+++ b/src/infiniop/ops/addcmul/cpu/addcmul_cpu.cc
@@ -0,0 +1,61 @@
+#include "addcmul_cpu.h"
+
+namespace op::addcmul::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec,
+    float value) { // additionally receives the scalar value
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // 1. Type check
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    // 2. Shape check (modeled on atanh; every input must match the output shape)
+    const auto &y_shape = out_desc->shape();
+    for (const auto &in_desc : input_desc_vec) {
+        CHECK_SAME_SHAPE(y_shape, in_desc->shape());
+    }
+
+    // 3. Create the descriptor via the generic elementwise macro, which
+    //    instantiates a Descriptor and assigns it to *desc_ptr
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    // 4. Store the scalar attribute value inside the descriptor
+    (*desc_ptr)->_value = value;
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Dispatch on dtype with a switch, as atanh does; the template parameter
+    // is the AddcmulOp functor defined in addcmul_cpu.h
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<AddcmulOp, fp16_t>(_info, output, inputs, stream, _value);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<AddcmulOp, float>(_info, output, inputs, stream, _value);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<AddcmulOp, double>(_info, output, inputs, stream, _value);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<AddcmulOp, bf16_t>(_info, output, inputs, stream, _value);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::addcmul::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/cpu/addcmul_cpu.h b/src/infiniop/ops/addcmul/cpu/addcmul_cpu.h
new file mode 100644
index 000000000..ef55fad7e
--- /dev/null
+++ b/src/infiniop/ops/addcmul/cpu/addcmul_cpu.h
@@ -0,0 +1,78 @@
+#ifndef __ADDCMUL_CPU_H__
+#define __ADDCMUL_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+#include <type_traits>
+
+namespace op::addcmul::cpu {
+
+struct AddcmulOp {
+public:
+    // addcmul is a ternary operator: out = input + value * t1 * t2
+    static constexpr size_t num_inputs = 3;
+
+    template <typename T, typename Scalar>
+    T operator()(const T &input, const T &t1, const T &t2, Scalar value) const {
+        // Native floating-point types (float, double) compute directly
+        if constexpr (std::is_floating_point_v<T>) {
+            return input + static_cast<T>(value) * t1 * t2;
+        } else {
+            // fp16/bf16 and similar types are promoted to float to preserve
+            // precision and to handle the scalar multiplication
+            float f_input = static_cast<float>(input);
+            float f_t1 = static_cast<float>(t1);
+            float f_t2 = static_cast<float>(t2);
+            float v = static_cast<float>(value);
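+            // Worked example (illustrative values): input=1.0, t1=2.0, t2=3.0,
+            // value=0.5 gives 1.0f + 0.5f * 2.0f * 3.0f = 4.0f before the cast back to T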
+            return static_cast<T>(f_input + v * f_t1 * f_t2);
+        }
+    }
+};
+
+// Custom CPU-side descriptor for addcmul that carries the extra scalar value
+class Descriptor final : public InfiniopDescriptor {
+    infiniDtype_t _dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::cpu::DeviceImpl> _device_info;
+    size_t _workspace_size;
+    float _value; // scalar coefficient
+
+    Descriptor(
+        infiniDtype_t dtype,
+        op::elementwise::ElementwiseInfo info,
+        op::elementwise::cpu::DeviceImpl *device_info,
+        size_t workspace_size,
+        infiniDevice_t device_type,
+        int device_id)
+        : InfiniopDescriptor{device_type, device_id},
+          _dtype(dtype),
+          _info(std::move(info)),
+          _device_info(device_info),
+          _workspace_size(workspace_size),
+          _value(0.0f) {}
+
+public:
+    ~Descriptor();
+
+    size_t workspaceSize() const { return _workspace_size; }
+
+    // additionally receives the scalar value
+    static infiniStatus_t create(
+        infiniopHandle_t handle,
+        Descriptor **desc_ptr,
+        infiniopTensorDescriptor_t output_desc,
+        std::vector<infiniopTensorDescriptor_t> input_descs,
+        float value);
+
+    infiniStatus_t calculate(
+        void *workspace,
+        size_t workspace_size,
+        void *output,
+        std::vector<const void *> inputs,
+        void *stream) const;
+
+    float getValue() const { return _value; }
+};
+
+} // namespace op::addcmul::cpu
+
+#endif // __ADDCMUL_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/cuda/kernel.cuh b/src/infiniop/ops/addcmul/cuda/kernel.cuh
new file mode 100644
index 000000000..af5266a21
--- /dev/null
+++ b/src/infiniop/ops/addcmul/cuda/kernel.cuh
@@ -0,0 +1,46 @@
+#ifndef __ADDCMUL_CUDA_CUH__
+#define __ADDCMUL_CUDA_CUH__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <type_traits>
+
+namespace op::addcmul::cuda {
+
+struct AddcmulOp {
+public:
+    // addcmul is a ternary operator: out = input + value * t1 * t2
+    static constexpr size_t num_inputs = 3;
+
+    template <typename T>
+    __device__ __host__ __forceinline__ T operator()(const T &input, const T &t1, const T &t2, float value) const {
+        float v = value;
+        if constexpr (std::is_same_v<T, half>) {
+            // Promote to float to preserve precision and simplify the scalar multiply
+            float f_input = __half2float(input);
+            float f_t1 = __half2float(t1);
+            float f_t2 = __half2float(t2);
+            return __float2half(f_input + v * f_t1 * f_t2);
+
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            float f_input = __bfloat162float(input);
+            float f_t1 = __bfloat162float(t1);
+            float f_t2 = __bfloat162float(t2);
+            return __float2bfloat16(f_input + v * f_t1 * f_t2);
+
+        } else if constexpr (std::is_same_v<T, float>) {
+            return input + v * t1 * t2;
+
+        } else if constexpr (std::is_same_v<T, double>) {
+            return input + static_cast<double>(v) * t1 * t2;
+
+        } else {
+            // Fallback for any remaining types
+            return static_cast<T>(static_cast<float>(input) + v * static_cast<float>(t1) * static_cast<float>(t2));
+        }
+    }
+};
+
+} // namespace op::addcmul::cuda
+
+#endif // __ADDCMUL_CUDA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/metax/addcmul_metax.h b/src/infiniop/ops/addcmul/metax/addcmul_metax.h
new file mode 100644
index 000000000..b74014c54
--- /dev/null
+++ b/src/infiniop/ops/addcmul/metax/addcmul_metax.h
@@ -0,0 +1,72 @@
+#ifndef __ADDCMUL_METAX_H__
+#define __ADDCMUL_METAX_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+namespace op::addcmul::metax {
+
+// Custom METAX-side descriptor for addcmul that carries the extra scalar value
+class Descriptor final : public InfiniopDescriptor {
+    // Kept for compatibility with the generic elementwise framework
+    infiniDtype_t _dtype;
+    op::elementwise::ElementwiseInfo _info;
+    std::unique_ptr<op::elementwise::metax::DeviceImpl> _device_info;
+    size_t _workspace_size;
+    float _value; // scalar coefficient
+
+public:
+    // Tensor metadata recorded for the custom kernel
+    static constexpr int MAX_NDIM = 8;
+
+    struct TensorMeta {
+        int ndim;
+        size_t shape[MAX_NDIM];
+        ptrdiff_t strides[MAX_NDIM];
+    };
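+
+    // Illustrative example of the strided addressing these metas enable: with
+    // shape {2, 3} and contiguous strides {3, 1}, linear index 4 maps to
+    // coords (1, 1) and offset 1*3 + 1*1 = 4; a transposed view with strides
+    // {1, 2} maps the same coords to offset 1*1 + 1*2 = 3.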
+ TensorMeta _out_meta{}; + TensorMeta _input_meta{}; + TensorMeta _t1_meta{}; + TensorMeta _t2_meta{}; + size_t _output_size{0}; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::metax::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _value(0.0f) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + // 额外接收 value 参数 + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs, + float value); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float getValue() const { return _value; } +}; + +} // namespace op::addcmul::metax + +#endif // __ADDCMUL_METAX_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/metax/addcmul_metax.maca b/src/infiniop/ops/addcmul/metax/addcmul_metax.maca new file mode 100644 index 000000000..9c8c00b76 --- /dev/null +++ b/src/infiniop/ops/addcmul/metax/addcmul_metax.maca @@ -0,0 +1,179 @@ +#include "../../../elementwise/metax/elementwise_metax.h" + +#include "addcmul_metax.h" +#include "addcmul_metax_kernel.h" + +namespace op::addcmul::metax { + +Descriptor::~Descriptor() = default; + +// 将 TensorDescriptor 中的 shape/strides 填充到 TensorMeta 结构中 +static inline infiniStatus_t fill_tensor_meta( + infiniopTensorDescriptor_t desc, + Descriptor::TensorMeta &meta) { + + auto ndim = desc->ndim(); + if (ndim > Descriptor::MAX_NDIM) { + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + meta.ndim = static_cast(ndim); + const auto &shape = desc->shape(); + const auto &strides = desc->strides(); + for (int i = 0; i < meta.ndim; ++i) { + meta.shape[i] = shape[i]; + meta.strides[i] = strides[i]; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float value) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 1. 类型检查 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 2. 形状检查:要求输出与三个输入形状一致(若不支持广播) + const auto &out_shape = out_desc->shape(); + const auto &input_desc = input_desc_vec.at(0); + const auto &t1_desc = input_desc_vec.at(1); + const auto &t2_desc = input_desc_vec.at(2); + CHECK_SAME_SHAPE(out_shape, input_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t1_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t2_desc->shape()); + + // 3. 创建底层的 Elementwise METAX 描述符 + CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec); + + // 4. 记录张量元信息和输出元素个数,供自定义 METAX kernel 使用 + auto *desc = *desc_ptr; + desc->_output_size = out_desc->numel(); + + CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta)); + CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta)); + CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta)); + CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta)); + + // 5. 
Store the scalar attribute value inside the descriptor
+    desc->_value = value;
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Custom addcmul METAX kernel: uses the TensorMeta in the Descriptor for generic strided access
+template <typename T>
+INFINIOP_METAX_KERNEL addcmul_kernel(
+    size_t output_size,
+    Descriptor::TensorMeta out_meta,
+    Descriptor::TensorMeta in_meta,
+    Descriptor::TensorMeta t1_meta,
+    Descriptor::TensorMeta t2_meta,
+    T *out,
+    const T *input,
+    const T *t1,
+    const T *t2,
+    float value) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= output_size) {
+        return;
+    }
+
+    // Derive each tensor's offset from the output shape/strides
+    ptrdiff_t out_offset = 0;
+    ptrdiff_t in_offset = 0;
+    ptrdiff_t t1_offset = 0;
+    ptrdiff_t t2_offset = 0;
+
+    size_t linear = idx;
+    for (int dim = out_meta.ndim - 1; dim >= 0; --dim) {
+        size_t dim_size = out_meta.shape[dim];
+        size_t coord = linear % dim_size;
+        linear /= dim_size;
+
+        out_offset += static_cast<ptrdiff_t>(coord) * out_meta.strides[dim];
+        in_offset += static_cast<ptrdiff_t>(coord) * in_meta.strides[dim];
+        t1_offset += static_cast<ptrdiff_t>(coord) * t1_meta.strides[dim];
+        t2_offset += static_cast<ptrdiff_t>(coord) * t2_meta.strides[dim];
+    }
+
+    T in_val = input[in_offset];
+    T t1_val = t1[t1_offset];
+    T t2_val = t2[t2_offset];
+
+    out[out_offset] = op::addcmul::metax::AddcmulOp{}(in_val, t1_val, t2_val, value);
+}
+
+template <typename T>
+static inline infiniStatus_t launch_addcmul_kernel(
+    const Descriptor *desc,
+    void *output,
+    const std::vector<const void *> &inputs,
+    void *stream) {
+
+    size_t output_size = desc->_output_size;
+    if (output_size == 0) {
+        return INFINI_STATUS_SUCCESS;
+    }
+
+    auto *out_ptr = reinterpret_cast<T *>(output);
+    auto *in_ptr = reinterpret_cast<const T *>(inputs.at(0));
+    auto *t1_ptr = reinterpret_cast<const T *>(inputs.at(1));
+    auto *t2_ptr = reinterpret_cast<const T *>(inputs.at(2));
+
+    mcStream_t metax_stream = reinterpret_cast<mcStream_t>(stream);
+
+    constexpr uint32_t BLOCK_SIZE = 256;
+    uint32_t grid = static_cast<uint32_t>((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE);
+
+    addcmul_kernel<<<grid, BLOCK_SIZE, 0, metax_stream>>>(
+        output_size,
+        desc->_out_meta,
+        desc->_input_meta,
+        desc->_t1_meta,
+        desc->_t2_meta,
+        out_ptr,
+        in_ptr,
+        t1_ptr,
+        t2_ptr,
+        desc->getValue());
+
+    CHECK_METAX(mcGetLastError());
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // The workspace contents are currently unused; only the size is checked so
+    // the interface semantics stay consistent with the other operators
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // Invoke the custom METAX kernel directly, bypassing the generic elementwise framework
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return launch_addcmul_kernel<half>(this, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return launch_addcmul_kernel<cuda_bfloat16>(this, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return launch_addcmul_kernel<float>(this, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return launch_addcmul_kernel<double>(this, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::addcmul::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h b/src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h
new file mode 100644
index 000000000..e8992196a
--- /dev/null
+++ b/src/infiniop/ops/addcmul/metax/addcmul_metax_kernel.h
@@ -0,0 +1,48 @@
+#ifndef __ADDCMUL_METAX_KERNEL_H__
+#define __ADDCMUL_METAX_KERNEL_H__
+
+/*
+ * This file contains the Addcmul operation implementation for the METAX backend.
+ * Formula: out = input + value * tensor1 * tensor2 + */ + +namespace op::addcmul::metax { + +typedef struct AddcmulOp { +public: + // 三元算子,输入为 input, tensor1, tensor2 + static constexpr size_t num_inputs = 3; + + template + __device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const { + if constexpr (std::is_same_v) { + // F32 直接使用乘加指令 + return in + value * t1 * t2; + } + else if constexpr (std::is_same_v) { + // F16 提升到 float 计算以防止中间乘法溢出 + float f_in = __half2float(in); + float f_t1 = __half2float(t1); + float f_t2 = __half2float(t2); + return __float2half(f_in + value * f_t1 * f_t2); + } + else if constexpr (std::is_same_v) { + // BF16 同样提升到 float 计算 + float f_in = __bfloat162float(in); + float f_t1 = __bfloat162float(t1); + float f_t2 = __bfloat162float(t2); + return __float2bfloat16_rn(f_in + value * f_t1 * f_t2); + } + else if constexpr (std::is_same_v) { + return in + (double)value * t1 * t2; + } + else { + // 整数类型或其他类型 + return in + static_cast(value) * t1 * t2; + } + } +} AddcmulOp; + +} // namespace op::addcmul::metax + +#endif // __ADDCMUL_METAX_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/moore/addcmul_moore.h b/src/infiniop/ops/addcmul/moore/addcmul_moore.h new file mode 100644 index 000000000..1d74d72d6 --- /dev/null +++ b/src/infiniop/ops/addcmul/moore/addcmul_moore.h @@ -0,0 +1,76 @@ +#ifndef __ADDCMUL_MOORE_H__ +#define __ADDCMUL_MOORE_H__ + +// 1. 切换到 Moore 平台的 Elementwise API +#include "../../../elementwise/moore/elementwise_moore_api.h" + +namespace op::addcmul::moore { + +/** + * 为 addcmul 在 Moore 端自定义 Descriptor + * 保持与 NVIDIA 版本一致的结构,以便于跨平台对齐 + */ +class Descriptor final : public InfiniopDescriptor { + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + // 2. 切换到 Moore 设备的实现指针 + std::unique_ptr _device_info; + size_t _workspace_size; + float _value; // 标量系数 value + +public: + // 摩尔线程 MUSA 同样支持 stride 访问,记录张量元信息 + static constexpr int MAX_NDIM = 8; + + struct TensorMeta { + int ndim; + size_t shape[MAX_NDIM]; + ptrdiff_t strides[MAX_NDIM]; + }; + + TensorMeta _out_meta{}; + TensorMeta _input_meta{}; + TensorMeta _t1_meta{}; + TensorMeta _t2_meta{}; + size_t _output_size{0}; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::moore::DeviceImpl *device_info, // 3. 修改构造函数参数类型 + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _value(0.0f) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + // 4. 
保持相同的接口,接收 value 参数 + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs, + float value); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float getValue() const { return _value; } +}; + +} // namespace op::addcmul::moore + +#endif // __ADDCMUL_MOORE_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/moore/addcmul_moore.mu b/src/infiniop/ops/addcmul/moore/addcmul_moore.mu new file mode 100644 index 000000000..37b840c2a --- /dev/null +++ b/src/infiniop/ops/addcmul/moore/addcmul_moore.mu @@ -0,0 +1,159 @@ +#include "../../../elementwise/moore/elementwise_moore.h" +#include "addcmul_moore_kernel.h" +#include "addcmul_moore.h" +#include + +namespace op::addcmul::moore { + +Descriptor::~Descriptor() = default; + +// 1. 填充 TensorMeta,逻辑与 NVIDIA 一致,用于 MUSA Kernel 中的 Strided 寻址 +static inline infiniStatus_t fill_tensor_meta( + infiniopTensorDescriptor_t desc, + Descriptor::TensorMeta &meta) { + + auto ndim = desc->ndim(); + if (ndim > Descriptor::MAX_NDIM) { + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + meta.ndim = static_cast(ndim); + const auto &shape = desc->shape(); + const auto &strides = desc->strides(); + for (int i = 0; i < meta.ndim; ++i) { + meta.shape[i] = shape[i]; + meta.strides[i] = strides[i]; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float value) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 类型检查 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 形状检查 (A, T1, T2 需一致) + const auto &out_shape = out_desc->shape(); + const auto &input_desc = input_desc_vec.at(0); + const auto &t1_desc = input_desc_vec.at(1); + const auto &t2_desc = input_desc_vec.at(2); + CHECK_SAME_SHAPE(out_shape, input_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t1_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t2_desc->shape()); + + // 2. 调用 Moore 平台的描述符创建宏 + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + auto *desc = *desc_ptr; + desc->_output_size = out_desc->numel(); + + // 填充元数据 + CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta)); + CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta)); + CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta)); + CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta)); + + desc->_value = value; + + return INFINI_STATUS_SUCCESS; +} + +// 3. 
MUSA Kernel 实现:逻辑保持一致 +template +__global__ void addcmul_kernel( + size_t output_size, + Descriptor::TensorMeta out_meta, + Descriptor::TensorMeta in_meta, + Descriptor::TensorMeta t1_meta, + Descriptor::TensorMeta t2_meta, + T *out, + const T *input, + const T *t1, + const T *t2, + float value) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= output_size) return; + + ptrdiff_t out_offset = 0, in_offset = 0, t1_offset = 0, t2_offset = 0; + size_t linear = idx; + + // 通用多维索引转偏移逻辑 + for (int dim = out_meta.ndim - 1; dim >= 0; --dim) { + size_t dim_size = out_meta.shape[dim]; + size_t coord = linear % dim_size; + linear /= dim_size; + + out_offset += static_cast(coord) * out_meta.strides[dim]; + in_offset += static_cast(coord) * in_meta.strides[dim]; + t1_offset += static_cast(coord) * t1_meta.strides[dim]; + t2_offset += static_cast(coord) * t2_meta.strides[dim]; + } + + // 调用 Moore 平台定义的 AddcmulOp + out[out_offset] = op::addcmul::moore::AddcmulOp{}(input[in_offset], t1[t1_offset], t2[t2_offset], value); +} + +// 4. 内核启动封装 +template +static inline infiniStatus_t launch_addcmul_kernel( + const Descriptor *desc, + void *output, + const std::vector &inputs, + void *stream) { + + size_t output_size = desc->_output_size; + if (output_size == 0) return INFINI_STATUS_SUCCESS; + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs.at(0)); + auto *t1_ptr = reinterpret_cast(inputs.at(1)); + auto *t2_ptr = reinterpret_cast(inputs.at(2)); + + musaStream_t musa_stream = reinterpret_cast(stream); + + constexpr uint32_t BLOCK_SIZE = 256; + uint32_t grid = static_cast((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); + + addcmul_kernel<<>>( + output_size, desc->_out_meta, desc->_input_meta, desc->_t1_meta, desc->_t2_meta, + out_ptr, in_ptr, t1_ptr, t2_ptr, desc->getValue()); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_BF16: + // 使用 Moore 平台对应的 bf16 类型 + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_addcmul_kernel(this, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} +} // namespace op::addcmul::moore \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h b/src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h new file mode 100644 index 000000000..287192947 --- /dev/null +++ b/src/infiniop/ops/addcmul/moore/addcmul_moore_kernel.h @@ -0,0 +1,48 @@ +#ifndef __ADDCMUL_MOORE_KERNEL_H__ +#define __ADDCMUL_MOORE_KERNEL_H__ + +/* + * This file contains the Addcmul operation implementation for the MUSA backend. 
+ * Formula: out = input + value * tensor1 * tensor2 + */ + +namespace op::addcmul::moore { + +typedef struct AddcmulOp { +public: + // 三元算子,输入为 input, tensor1, tensor2 + static constexpr size_t num_inputs = 3; + + template + __device__ __forceinline__ T operator()(const T &in, const T &t1, const T &t2, float value) const { + if constexpr (std::is_same_v) { + // F32 直接使用乘加指令 + return in + value * t1 * t2; + } + else if constexpr (std::is_same_v) { + // F16 提升到 float 计算以防止中间乘法溢出 + float f_in = __half2float(in); + float f_t1 = __half2float(t1); + float f_t2 = __half2float(t2); + return __float2half(f_in + value * f_t1 * f_t2); + } + else if constexpr (std::is_same_v) { + // BF16 同样提升到 float 计算 + float f_in = __bfloat162float(in); + float f_t1 = __bfloat162float(t1); + float f_t2 = __bfloat162float(t2); + return __float2bfloat16_rn(f_in + value * f_t1 * f_t2); + } + else if constexpr (std::is_same_v) { + return in + (double)value * t1 * t2; + } + else { + // 整数类型或其他类型 + return in + static_cast(value) * t1 * t2; + } + } +} AddcmulOp; + +} // namespace op::addcmul::moore + +#endif // __ADDCMUL_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu b/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu new file mode 100644 index 000000000..83922e8f5 --- /dev/null +++ b/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cu @@ -0,0 +1,178 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "addcmul_nvidia.cuh" + +namespace op::addcmul::nvidia { + +Descriptor::~Descriptor() = default; + +// 将 TensorDescriptor 中的 shape/strides 填充到 TensorMeta 结构中 +static inline infiniStatus_t fill_tensor_meta( + infiniopTensorDescriptor_t desc, + Descriptor::TensorMeta &meta) { + + auto ndim = desc->ndim(); + if (ndim > Descriptor::MAX_NDIM) { + return INFINI_STATUS_NOT_IMPLEMENTED; + } + + meta.ndim = static_cast(ndim); + const auto &shape = desc->shape(); + const auto &strides = desc->strides(); + for (int i = 0; i < meta.ndim; ++i) { + meta.shape[i] = shape[i]; + meta.strides[i] = strides[i]; + } + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec, + float value) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + // 1. 类型检查 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 2. 形状检查:要求输出与三个输入形状一致(若不支持广播) + const auto &out_shape = out_desc->shape(); + const auto &input_desc = input_desc_vec.at(0); + const auto &t1_desc = input_desc_vec.at(1); + const auto &t2_desc = input_desc_vec.at(2); + CHECK_SAME_SHAPE(out_shape, input_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t1_desc->shape()); + CHECK_SAME_SHAPE(out_shape, t2_desc->shape()); + + // 3. 创建底层的 Elementwise CUDA 描述符 + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + // 4. 记录张量元信息和输出元素个数,供自定义 CUDA kernel 使用 + auto *desc = *desc_ptr; + desc->_output_size = out_desc->numel(); + + CHECK_STATUS(fill_tensor_meta(out_desc, desc->_out_meta)); + CHECK_STATUS(fill_tensor_meta(input_desc, desc->_input_meta)); + CHECK_STATUS(fill_tensor_meta(t1_desc, desc->_t1_meta)); + CHECK_STATUS(fill_tensor_meta(t2_desc, desc->_t2_meta)); + + // 5. 
将标量属性 value 存入 Descriptor 内部 + desc->_value = value; + + return INFINI_STATUS_SUCCESS; +} + +// 自定义 addcmul CUDA kernel:使用 Descriptor 中的 TensorMeta 做通用 strided 访问 +template +INFINIOP_CUDA_KERNEL addcmul_kernel( + size_t output_size, + Descriptor::TensorMeta out_meta, + Descriptor::TensorMeta in_meta, + Descriptor::TensorMeta t1_meta, + Descriptor::TensorMeta t2_meta, + T *out, + const T *input, + const T *t1, + const T *t2, + float value) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= output_size) { + return; + } + + // 根据输出 shape/stride 计算各个张量的偏移 + ptrdiff_t out_offset = 0; + ptrdiff_t in_offset = 0; + ptrdiff_t t1_offset = 0; + ptrdiff_t t2_offset = 0; + + size_t linear = idx; + for (int dim = out_meta.ndim - 1; dim >= 0; --dim) { + size_t dim_size = out_meta.shape[dim]; + size_t coord = linear % dim_size; + linear /= dim_size; + + out_offset += static_cast(coord) * out_meta.strides[dim]; + in_offset += static_cast(coord) * in_meta.strides[dim]; + t1_offset += static_cast(coord) * t1_meta.strides[dim]; + t2_offset += static_cast(coord) * t2_meta.strides[dim]; + } + + T in_val = input[in_offset]; + T t1_val = t1[t1_offset]; + T t2_val = t2[t2_offset]; + + out[out_offset] = op::addcmul::cuda::AddcmulOp{}(in_val, t1_val, t2_val, value); +} + +template +static inline infiniStatus_t launch_addcmul_kernel( + const Descriptor *desc, + void *output, + const std::vector &inputs, + void *stream) { + + size_t output_size = desc->_output_size; + if (output_size == 0) { + return INFINI_STATUS_SUCCESS; + } + + auto *out_ptr = reinterpret_cast(output); + auto *in_ptr = reinterpret_cast(inputs.at(0)); + auto *t1_ptr = reinterpret_cast(inputs.at(1)); + auto *t2_ptr = reinterpret_cast(inputs.at(2)); + + cudaStream_t cuda_stream = reinterpret_cast(stream); + + constexpr uint32_t BLOCK_SIZE = 256; + uint32_t grid = static_cast((output_size + BLOCK_SIZE - 1) / BLOCK_SIZE); + + addcmul_kernel<<>>( + output_size, + desc->_out_meta, + desc->_input_meta, + desc->_t1_meta, + desc->_t2_meta, + out_ptr, + in_ptr, + t1_ptr, + t2_ptr, + desc->getValue()); + + CHECK_CUDA(cudaGetLastError()); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + // 目前不依赖 workspace 内容,只检查大小是否足够以保持与其他算子一致的接口语义 + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // 直接调用自定义 CUDA kernel,避免通过通用 elementwise 框架 + switch (_dtype) { + case INFINI_DTYPE_F16: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_BF16: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_F32: + return launch_addcmul_kernel(this, output, inputs, stream); + case INFINI_DTYPE_F64: + return launch_addcmul_kernel(this, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} +} // namespace op::addcmul::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh b/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh new file mode 100644 index 000000000..89a001cc0 --- /dev/null +++ b/src/infiniop/ops/addcmul/nvidia/addcmul_nvidia.cuh @@ -0,0 +1,72 @@ +#ifndef __ADDCMUL_NVIDIA_H__ +#define __ADDCMUL_NVIDIA_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +namespace op::addcmul::nvidia { + +// 为 addcmul 在 NVIDIA 端自定义 Descriptor,支持额外的标量参数 value +class Descriptor final : public InfiniopDescriptor { + // 为保持与通用 Elementwise 
框架的兼容,仍然保留这些成员 + infiniDtype_t _dtype; + op::elementwise::ElementwiseInfo _info; + std::unique_ptr _device_info; + size_t _workspace_size; + float _value; // 标量系数 value + +public: + // 为自定义 CUDA kernel 记录张量元信息 + static constexpr int MAX_NDIM = 8; + + struct TensorMeta { + int ndim; + size_t shape[MAX_NDIM]; + ptrdiff_t strides[MAX_NDIM]; + }; + + TensorMeta _out_meta{}; + TensorMeta _input_meta{}; + TensorMeta _t1_meta{}; + TensorMeta _t2_meta{}; + size_t _output_size{0}; + + Descriptor( + infiniDtype_t dtype, + op::elementwise::ElementwiseInfo info, + op::elementwise::nvidia::DeviceImpl *device_info, + size_t workspace_size, + infiniDevice_t device_type, + int device_id) + : InfiniopDescriptor{device_type, device_id}, + _dtype(dtype), + _info(std::move(info)), + _device_info(device_info), + _workspace_size(workspace_size), + _value(0.0f) {} + +public: + ~Descriptor(); + + size_t workspaceSize() const { return _workspace_size; } + + // 额外接收 value 参数 + static infiniStatus_t create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t output_desc, + std::vector input_descs, + float value); + + infiniStatus_t calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const; + + float getValue() const { return _value; } +}; + +} // namespace op::addcmul::nvidia + +#endif // __ADDCMUL_NVIDIA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/addcmul/operator.cc b/src/infiniop/ops/addcmul/operator.cc new file mode 100644 index 000000000..50a1ecc23 --- /dev/null +++ b/src/infiniop/ops/addcmul/operator.cc @@ -0,0 +1,193 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/addcmul.h" + +#ifdef ENABLE_CPU_API +#include "cpu/addcmul_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/addcmul_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/addcmul_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/addcmul_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/addcmul_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/addcmul_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAddcmulDescriptor( + infiniopHandle_t handle, + infiniopAddcmulDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t t1_desc, + infiniopTensorDescriptor_t t2_desc, + float value) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::addcmul::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, \ + {input_desc, t1_desc, t2_desc}, \ + value) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +__C infiniStatus_t infiniopGetAddcmulWorkspaceSize(infiniopAddcmulDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = 
reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopAddcmul( + infiniopAddcmulDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *out, + const void *input, + const void *t1, + const void *t2, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, out, {input, t1, t2}, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyAddcmulDescriptor(infiniopAddcmulDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.cc b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc new file mode 100644 index 000000000..c33d0a460 --- /dev/null +++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.cc @@ -0,0 +1,52 @@ +#include "atanh_cpu.h" + +namespace op::atanh::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16); + + 
CHECK_SAME_SHAPE(y_shape, a_shape);
+
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Dispatch on dtype; the template parameter is the AtanhOp defined in atanh_cpu.h
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<AtanhOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<AtanhOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<AtanhOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<AtanhOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::atanh::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/atanh/cpu/atanh_cpu.h b/src/infiniop/ops/atanh/cpu/atanh_cpu.h
new file mode 100644
index 000000000..8a55e3afb
--- /dev/null
+++ b/src/infiniop/ops/atanh/cpu/atanh_cpu.h
@@ -0,0 +1,31 @@
+#ifndef __ATANH_CPU_H__
+#define __ATANH_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+#include <type_traits>
+
+// Register the CPU-backend descriptor for the atanh operator
+ELEMENTWISE_DESCRIPTOR(atanh, cpu)
+
+namespace op::atanh::cpu {
+typedef struct AtanhOp {
+public:
+    // atanh is a unary operator
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    T operator()(const T &a) const {
+        // Natively supported types (float, double) call std::atanh directly
+        if constexpr (std::is_floating_point_v<T>) {
+            return std::atanh(a);
+        } else {
+            // Custom types such as half/bfloat16 are promoted to float and
+            // converted back; they are assumed to support static_cast to float.
+            // std::atanh's float overload is used here (std::atanhf is not portable).
+            return static_cast<T>(std::atanh(static_cast<float>(a)));
+        }
+    }
+} AtanhOp;
+} // namespace op::atanh::cpu
+
+#endif // __ATANH_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/atanh/cuda/kernel.cuh b/src/infiniop/ops/atanh/cuda/kernel.cuh
new file mode 100644
index 000000000..9a98aef63
--- /dev/null
+++ b/src/infiniop/ops/atanh/cuda/kernel.cuh
@@ -0,0 +1,40 @@
+#ifndef __ATANH_CUDA_H__
+#define __ATANH_CUDA_H__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+
+namespace op::atanh::cuda {
+typedef struct AtanhOp {
+public:
+    // atanh is unary: a single input
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // Apply atanh to both lanes of a half2
+            float2 f = __half22float2(a);
+            f.x = atanhf(f.x);
+            f.y = atanhf(f.y);
+            return __float22half2_rn(f);
+        } else if constexpr (std::is_same_v<T, half>) {
+            // half: promote to float, compute, convert back
+            return __float2half(atanhf(__half2float(a)));
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            // bfloat16 is handled the same way
+            return __float2bfloat16(atanhf(__bfloat162float(a)));
+        } else if constexpr (std::is_same_v<T, float>) {
+            // float: call the math library directly
+            return atanhf(a);
+        } else if constexpr (std::is_same_v<T, double>) {
+            return ::atanh(a);
+        } else {
+            // Other types should not reach here in practice; simple fallback
+            return static_cast<T>(atanhf(static_cast<float>(a)));
+        }
+    }
+} AtanhOp;
+} // namespace op::atanh::cuda
+
+#endif // __ATANH_CUDA_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/atanh/metax/atanh_metax.h b/src/infiniop/ops/atanh/metax/atanh_metax.h
new file mode 100644
index 000000000..7c758d7f1
--- /dev/null
+++ b/src/infiniop/ops/atanh/metax/atanh_metax.h
@@ -0,0 +1,8 @@
+#ifndef __ATANH_METAX_API_H__
+#define __ATANH_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(atanh, metax)
+
+#endif // __ATANH_METAX_API_H__
\ No newline at end of file
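For reference, the end-to-end call sequence for the atanh C API added by this patch is sketched below (a minimal sketch: error handling is elided, and the handle, tensor descriptors, and device buffers y_data/a_data are assumed to have been set up elsewhere):

    infiniopAtanhDescriptor_t desc;
    infiniopCreateAtanhDescriptor(handle, &desc, y_desc, a_desc);

    size_t workspace_size;
    infiniopGetAtanhWorkspaceSize(desc, &workspace_size);

    void *workspace;
    infinirtMalloc(&workspace, workspace_size);

    // y = atanh(a), launched on the default (null) stream
    infiniopAtanh(desc, workspace, workspace_size, y_data, a_data, nullptr);

    infinirtFree(workspace);
    infiniopDestroyAtanhDescriptor(desc);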
diff --git a/src/infiniop/ops/atanh/metax/atanh_metax.maca b/src/infiniop/ops/atanh/metax/atanh_metax.maca
new file mode 100644
index 000000000..043463b39
--- /dev/null
+++ b/src/infiniop/ops/atanh/metax/atanh_metax.maca
@@ -0,0 +1,58 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "atanh_metax.h"
+#include "atanh_metax_kernel.h"
+
+namespace op::atanh::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(y_shape, a_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, metax::AtanhOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, metax::AtanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, metax::AtanhOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, metax::AtanhOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::atanh::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/atanh/metax/atanh_metax_kernel.h b/src/infiniop/ops/atanh/metax/atanh_metax_kernel.h
new file mode 100644
index 000000000..d659ef193
--- /dev/null
+++ b/src/infiniop/ops/atanh/metax/atanh_metax_kernel.h
@@ -0,0 +1,47 @@
+#ifndef __ATANH_METAX_KERNEL_H__
+#define __ATANH_METAX_KERNEL_H__
+
+/*
+ * This file contains the Atanh operation implementation for the METAX backend.
+ *
+ * It follows the consistent code structure used across the hardware platforms
+ * supported by InfiniOp, so the backends stay aligned.
+ */ +namespace op::atanh::metax { + +typedef struct AtanhOp { +public: + // 一元算子,输入数量为 1 + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &a) const { + if constexpr (std::is_same_v) { + // 针对 half2 进行并行计算 + float2 f2 = __half22float2(a); + f2.x = atanhf(f2.x); + f2.y = atanhf(f2.y); + return __float22half2_rn(f2); + } else if constexpr (std::is_same_v) { + // 转为 float 计算以保证精度并匹配 MUSA 数学库 + return __float2half(atanhf(__half2float(a))); + } else if constexpr (std::is_same_v) { + // BF16 同样提升到 float 计算,避免转换歧义 + float a_f = __bfloat162float(a); + return __float2bfloat16_rn(atanhf(a_f)); + } else if constexpr (std::is_same_v) { + // 调用 MUSA 内置的单精度反双曲正切函数 + return atanhf(a); + } else if constexpr (std::is_same_v) { + // 调用双精度版本 + return ::atanh(a); + } else { + // 兜底实现(如果是整数类型,通常会隐式转为 float) + return static_cast(atanhf(static_cast(a))); + } + } +} AtanhOp; + +} // namespace op::atanh::metax + +#endif // __ATANH_METAX_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/atanh/moore/atanh_moore.h b/src/infiniop/ops/atanh/moore/atanh_moore.h new file mode 100644 index 000000000..9bfc1e978 --- /dev/null +++ b/src/infiniop/ops/atanh/moore/atanh_moore.h @@ -0,0 +1,11 @@ +#ifndef __ATANH_MOORE_API_H__ +#define __ATANH_MOORE_API_H__ + +// 1. 修改包含路径,指向 moore 平台的 elementwise API 定义 +#include "../../../elementwise/moore/elementwise_moore_api.h" + +// 2. 使用 ELEMENTWISE_DESCRIPTOR 宏,平台参数改为 moore +// 这将自动生成 op::atanh::moore::Descriptor 类的声明 +ELEMENTWISE_DESCRIPTOR(atanh, moore) + +#endif // __ATANH_MOORE_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/atanh/moore/atanh_moore.mu b/src/infiniop/ops/atanh/moore/atanh_moore.mu new file mode 100644 index 000000000..01f71219a --- /dev/null +++ b/src/infiniop/ops/atanh/moore/atanh_moore.mu @@ -0,0 +1,64 @@ +#include "../../../elementwise/moore/elementwise_moore.h" + +#include "atanh_moore.h" +#include "atanh_moore_kernel.h" + +namespace op::atanh::moore { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + // 1. 转换 Handle 为 Moore 平台类型 + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + + // 2. 检查数据类型支持情况 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + // 3. 校验 Shape 一致性 + CHECK_SAME_SHAPE(y_shape, a_shape); + + // 4. 创建 Moore 平台的 Elementwise 描述符 + CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void *stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + // 5. 
根据数据类型分发到具体的 MUSA Kernel 逻辑 + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, moore::AtanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + // 注意:这里将 nv_bfloat16 替换为 Moore 环境下的 bfloat16 类型名 + return _device_info->calculate<256, moore::AtanhOp, cuda_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, moore::AtanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, moore::AtanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::moore \ No newline at end of file diff --git a/src/infiniop/ops/atanh/moore/atanh_moore_kernel.h b/src/infiniop/ops/atanh/moore/atanh_moore_kernel.h new file mode 100644 index 000000000..81cf13d96 --- /dev/null +++ b/src/infiniop/ops/atanh/moore/atanh_moore_kernel.h @@ -0,0 +1,47 @@ +#ifndef __ATANH_MOORE_KERNEL_H__ +#define __ATANH_MOORE_KERNEL_H__ + +/* + * This file contains the Atanh operation implementation for the MUSA backend. + * + * It follows the consistent code structure to ensure alignment across different + * hardware platforms within the Moore Threads (MUSA) ecosystem. + */ +namespace op::atanh::moore { + +typedef struct AtanhOp { +public: + // 一元算子,输入数量为 1 + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &a) const { + if constexpr (std::is_same_v) { + // 针对 half2 进行并行计算 + float2 f2 = __half22float2(a); + f2.x = atanhf(f2.x); + f2.y = atanhf(f2.y); + return __float22half2_rn(f2); + } else if constexpr (std::is_same_v) { + // 转为 float 计算以保证精度并匹配 MUSA 数学库 + return __float2half(atanhf(__half2float(a))); + } else if constexpr (std::is_same_v) { + // BF16 同样提升到 float 计算,避免转换歧义 + float a_f = __bfloat162float(a); + return __float2bfloat16_rn(atanhf(a_f)); + } else if constexpr (std::is_same_v) { + // 调用 MUSA 内置的单精度反双曲正切函数 + return atanhf(a); + } else if constexpr (std::is_same_v) { + // 调用双精度版本 + return ::atanh(a); + } else { + // 兜底实现(如果是整数类型,通常会隐式转为 float) + return static_cast(atanhf(static_cast(a))); + } + } +} AtanhOp; + +} // namespace op::atanh::moore + +#endif // __ATANH_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu new file mode 100644 index 000000000..4959b7abf --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cu @@ -0,0 +1,58 @@ +#include "../../../elementwise/nvidia/elementwise_nvidia.cuh" + +#include "../cuda/kernel.cuh" +#include "atanh_nvidia.cuh" + +namespace op::atanh::nvidia { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + std::vector input_desc_vec) { + + auto handle = reinterpret_cast(handle_); + auto dtype = out_desc->dtype(); + + const auto &a_desc = input_desc_vec.at(0); + const auto &y_shape = out_desc->shape(); + const auto &a_shape = a_desc->shape(); + + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64); + + CHECK_SAME_SHAPE(y_shape, a_shape); + + CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec) + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + std::vector inputs, + void 
*stream) const { + + if (workspace_size < _workspace_size) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (_dtype) { + case INFINI_DTYPE_F16: + return _device_info->calculate<256, cuda::AtanhOp, half>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_BF16: + return _device_info->calculate<256, cuda::AtanhOp, nv_bfloat16>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F32: + return _device_info->calculate<256, cuda::AtanhOp, float>(_info, workspace, output, inputs, stream); + case INFINI_DTYPE_F64: + return _device_info->calculate<256, cuda::AtanhOp, double>(_info, workspace, output, inputs, stream); + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} +} // namespace op::atanh::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh new file mode 100644 index 000000000..473c3d47c --- /dev/null +++ b/src/infiniop/ops/atanh/nvidia/atanh_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __ATANH_CUDA_API_H__ +#define __ATANH_CUDA_API_H__ + +#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh" + +ELEMENTWISE_DESCRIPTOR(atanh, nvidia) + +#endif // __ATANH_CUDA_API_H__ \ No newline at end of file diff --git a/src/infiniop/ops/atanh/operator.cc b/src/infiniop/ops/atanh/operator.cc new file mode 100644 index 000000000..8aa59a36e --- /dev/null +++ b/src/infiniop/ops/atanh/operator.cc @@ -0,0 +1,197 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/atanh.h" + +#ifdef ENABLE_CPU_API +#include "cpu/atanh_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/atanh_nvidia.cuh" +#endif +#ifdef ENABLE_METAX_API +#include "metax/atanh_metax.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/atanh_kunlun.h" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/atanh_bang.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/atanh_moore.h" +#endif + +__C infiniStatus_t infiniopCreateAtanhDescriptor( + infiniopHandle_t handle, + infiniopAtanhDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t a_desc) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::atanh::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {a_desc}) // 一元算子只传入 a_desc + + switch (handle->device) { + +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CREATE +} + +__C infiniStatus_t infiniopGetAtanhWorkspaceSize(infiniopAtanhDescriptor_t desc, size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + 
GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +__C infiniStatus_t infiniopAtanh( + infiniopAtanhDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *a, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {a}, stream) // 一元算子只传入 a + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef CALCULATE +} + +__C infiniStatus_t +infiniopDestroyAtanhDescriptor(infiniopAtanhDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif + + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.h b/src/infiniop/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.h new file mode 100644 index 000000000..5e3d8291e --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/binary_cross_entropy_with_logits.h @@ -0,0 +1,68 @@ +#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_H__ +#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_H__ + +#include "../../operator.h" +#include "info.h" + +/** + * # 关于 `BCEWithLogits` 算子描述符的说明 + * * 采用 PImpl 设计模式,将不同硬件后端(如 CUDA 原生算子、CPU 循环、或是芯片厂商的专用库调用) + * 封装在 `Opaque` 结构中。 + * * 描述符在创建时会完成形状校验、步长分析,并确定最优的计算 Workspace 大小。 + */ + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::bce_with_logits::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + BCEWithLogitsInfo _info; /* 包含各输入输出张量的维度与步长 */ \ + size_t _workspace_size; \ + infiniopReduction_t _reduction; \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + BCEWithLogitsInfo info, \ + infiniopReduction_t 
reduction, \
+            size_t workspace_size_, \
+            Opaque *opaque, \
+            infiniDevice_t device_type, \
+            int device_id) \
+            : InfiniopDescriptor{device_type, device_id}, \
+              _opaque(opaque), \
+              _dtype(dtype), \
+              _info(info), \
+              _workspace_size(workspace_size_), \
+              _reduction(reduction) {} \
+                                                                       \
+    public: \
+        ~Descriptor(); \
+                                                                       \
+        size_t workspaceSize() const { return _workspace_size; } \
+                                                                       \
+        static infiniStatus_t create( \
+            infiniopHandle_t handle, \
+            Descriptor **desc_ptr, \
+            infiniopTensorDescriptor_t out_desc, \
+            infiniopTensorDescriptor_t logits_desc, \
+            infiniopTensorDescriptor_t target_desc, \
+            infiniopTensorDescriptor_t weight_desc, \
+            infiniopTensorDescriptor_t pos_weight_desc, \
+            infiniopReduction_t reduction \
+        ); \
+                                                                       \
+        infiniStatus_t calculate( \
+            void *workspace, \
+            size_t workspace_size, \
+            void *out, \
+            const void *logits, \
+            const void *target, \
+            const void *weight,     /* optional, may be nullptr */ \
+            const void *pos_weight, /* optional, may be nullptr */ \
+            void *stream) const; \
+    }; \
+    }
+
+#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.cc b/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.cc
new file mode 100644
index 000000000..06ab0d1e8
--- /dev/null
+++ b/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.cc
@@ -0,0 +1,165 @@
+#include "binary_cross_entropy_with_logits_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <algorithm>
+#include <cmath>
+
+namespace op::bce_with_logits::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t logits_desc,
+    infiniopTensorDescriptor_t target_desc,
+    infiniopTensorDescriptor_t weight_desc,
+    infiniopTensorDescriptor_t pos_weight_desc,
+    infiniopReduction_t reduction) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = logits_desc->dtype();
+
+    // 1. dtype check
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16);
+
+    // 2. Parse the dimension info (via BCEWithLogitsInfo defined in info.h)
+    auto result = BCEWithLogitsInfo::create(out_desc, logits_desc, target_desc,
+                                            weight_desc, pos_weight_desc, reduction);
+    CHECK_RESULT(result);
+
+    // 3. Instantiate the descriptor
+    *desc_ptr = new Descriptor(
+        dtype, result.take(), reduction, 0,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+/**
+ * Numerically stable core: L = -w * [pw * y * log(sigmoid(x)) + (1-y) * log(1-sigmoid(x))]
+ * rewritten as:            L = w * [(1-y) * x + (1 + (pw-1) * y) * (log(1 + exp(-|x|)) + max(-x, 0))]
+ * For pw = 1 this reduces to: L = w * [max(x, 0) - x * y + log(1 + exp(-|x|))]
+ */
+template <typename Tdata>
+void calculate_bce(
+    const BCEWithLogitsInfo &info,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight) {
+
+    size_t n = info.num_elements;
+    float total_loss = 0.0f;
+
+    // Raw pointers for each tensor
+    const Tdata *l_ptr = reinterpret_cast<const Tdata *>(logits);
+    const Tdata *t_ptr = reinterpret_cast<const Tdata *>(target);
+    const Tdata *w_ptr = reinterpret_cast<const Tdata *>(weight);
+    const Tdata *pw_ptr = reinterpret_cast<const Tdata *>(pos_weight);
+    Tdata *o_ptr = reinterpret_cast<Tdata *>(out);
+
+    auto &logits_info = info.logits;
+    auto &target_info = info.target;
+    auto &weight_info = info.weight;
+    auto &out_info = info.out;
+
+#pragma omp parallel for reduction(+ : total_loss)
+    for (ptrdiff_t i = 0; i < (ptrdiff_t)n; ++i) {
+        size_t idx = static_cast<size_t>(i);
+
+        // Compute real memory offsets from strides, so any layout is supported
+        size_t logits_offset = op::common_cpu::indexToOffset(
+            idx,
+            logits_info.ndim,
+            logits_info.dims.data(),
+            logits_info.stride.data());
+        size_t target_offset = op::common_cpu::indexToOffset(
+            idx,
+            target_info.ndim,
+            target_info.dims.data(),
+            target_info.stride.data());
+
+        float x = utils::cast<float>(l_ptr[logits_offset]);
+        float y = utils::cast<float>(t_ptr[target_offset]);
+
+        // pos_weight broadcast (assumes logits shape [..., C] and a contiguous [C] pos_weight)
+        float pw = 1.0f;
+        if (pw_ptr && info.pos_weight.total_elements > 0) {
+            size_t c = idx % info.pos_weight.total_elements;
+            pw = utils::cast<float>(pw_ptr[c]);
+        }
+
+        // weight handling:
+        // - if it has exactly the logits shape, it is indexed precisely via strides;
+        // - if it is a vector [C], indexToOffset broadcasts it over the last dimension.
+        float w = 1.0f;
+        if (w_ptr && weight_info.ndim > 0) {
+            size_t weight_offset = op::common_cpu::indexToOffset(
+                idx,
+                weight_info.ndim,
+                weight_info.dims.data(),
+                weight_info.stride.data());
+            w = utils::cast<float>(w_ptr[weight_offset]);
+        }
+
+        // Numerically stable BCEWithLogits (aligned with the PyTorch implementation):
+        //   max_val    = max(-x, 0)
+        //   log_weight = 1 + (pos_weight - 1) * y
+        //   loss       = (1 - y) * x + log_weight * (log(1 + exp(-|x|)) + max_val)
+        float max_val = std::max(-x, 0.0f);
+        float log_weight = 1.0f + (pw - 1.0f) * y;
+        float loss = (1.0f - y) * x
+                   + log_weight * (std::log1p(std::exp(-std::abs(x))) + max_val);
+
+        loss *= w;
+
+        if (info.reduction == INFINIOP_REDUCTION_NONE) {
+            // Per-element write-back also honors the out strides
+            size_t out_offset = op::common_cpu::indexToOffset(
+                idx,
+                out_info.ndim,
+                out_info.dims.data(),
+                out_info.stride.data());
+            o_ptr[out_offset] = utils::cast<Tdata>(loss);
+        } else {
+            total_loss += loss;
+        }
+    }
+
+    // Reduction output
+    if (info.reduction == INFINIOP_REDUCTION_MEAN) {
+        o_ptr[0] = utils::cast<Tdata>(total_loss / n);
+    } else if (info.reduction == INFINIOP_REDUCTION_SUM) {
+        o_ptr[0] = utils::cast<Tdata>(total_loss);
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        cpu::calculate_bce<fp16_t>(_info, out, logits, target, weight, pos_weight);
+        return INFINI_STATUS_SUCCESS;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_bce<bf16_t>(_info, out, logits, target, weight, pos_weight);
+        return INFINI_STATUS_SUCCESS;
+    case INFINI_DTYPE_F32:
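+        // Sketch of why the stable rewrite in calculate_bce cannot overflow
+        // (using the identities below, which hold for all real x):
+        //     log(sigmoid(x))     = -(log(1 + exp(-|x|)) + max(-x, 0))
+        //     log(1 - sigmoid(x)) = -x - (log(1 + exp(-|x|)) + max(-x, 0))
+        // Substituting into -[pw*y*log(sigmoid(x)) + (1-y)*log(1-sigmoid(x))] gives
+        //     (1 - y)*x + (1 + (pw - 1)*y) * (log(1 + exp(-|x|)) + max(-x, 0)),
+        // so exp() is only ever evaluated on non-positive arguments.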
cpu::calculate_bce<float>(_info, out, logits, target, weight, pos_weight);
+        return INFINI_STATUS_SUCCESS;
+
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+
+} // namespace op::bce_with_logits::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.h b/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.h
new file mode 100644
index 000000000..2eecc839c
--- /dev/null
+++ b/src/infiniop/ops/binary_cross_entropy_with_logits/cpu/binary_cross_entropy_with_logits_cpu.h
@@ -0,0 +1,16 @@
+#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_CPU_H__
+#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_CPU_H__
+
+#include "../binary_cross_entropy_with_logits.h"
+
+/**
+ * Uses the DESCRIPTOR macro defined in binary_cross_entropy_with_logits.h.
+ *
+ * This generates the Descriptor class in namespace op::bce_with_logits::cpu.
+ * The class derives from InfiniopDescriptor and contains:
+ *   - BCEWithLogitsInfo _info (validated dims and strides)
+ *   - a static create() method (instantiates the CPU descriptor)
+ *   - a calculate() method (runs the numerically stable CPU path)
+ */
+DESCRIPTOR(cpu)
+
+#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/info.h b/src/infiniop/ops/binary_cross_entropy_with_logits/info.h
new file mode 100644
index 000000000..3d5b30f37
--- /dev/null
+++ b/src/infiniop/ops/binary_cross_entropy_with_logits/info.h
@@ -0,0 +1,130 @@
+#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_INFO_H__
+#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include "infiniop/ops/binary_cross_entropy_with_logits.h"
+#include <cstddef>
+#include <vector>
+
+namespace op::bce_with_logits {
+
+/**
+ * Describes the memory layout of each tensor used by the BCE operator.
+ * dims and stride are allocated dynamically, so tensors of any rank are supported.
+ */
+struct BCETensorInfo {
+    size_t total_elements = 0;
+    size_t ndim = 0;
+    std::vector<size_t> dims;      // dimensions
+    std::vector<ptrdiff_t> stride; // strides
+
+    BCETensorInfo() = default;
+
+    static utils::Result<BCETensorInfo> create(infiniopTensorDescriptor_t desc) {
+        // A missing optional tensor (weight / pos_weight) yields an empty,
+        // zero-rank info record rather than a bare status value.
+        if (desc == nullptr) {
+            return utils::Result<BCETensorInfo>(BCETensorInfo{});
+        }
+
+        BCETensorInfo info;
+        info.ndim = desc->ndim();
+        info.total_elements = 1;
+
+        // Reserve capacity up front
+        info.dims.reserve(info.ndim);
+        info.stride.reserve(info.ndim);
+
+        for (size_t i = 0; i < info.ndim; ++i) {
+            size_t d = desc->dim(i);
+            info.dims.push_back(d);
+            info.stride.push_back(desc->stride(i));
+            info.total_elements *= d;
+        }
+        return utils::Result<BCETensorInfo>(std::move(info));
+    }
+
+    // Helper: size of the last dimension (used for pos_weight validation)
+    size_t last_dim() const {
+        return ndim > 0 ? 
dims.back() : 0; + } +}; + +class BCEWithLogitsInfo { +public: + BCETensorInfo logits; + BCETensorInfo target; + BCETensorInfo weight; + BCETensorInfo pos_weight; + BCETensorInfo out; + + size_t num_elements; + infiniopReduction_t reduction; + + // 由于 BCETensorInfo 内部使用了 vector,BCEWithLogitsInfo 现在是可移动且安全的 + BCEWithLogitsInfo() = default; + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + + auto logits_res = BCETensorInfo::create(logits_desc); + CHECK_RESULT(logits_res); + auto target_res = BCETensorInfo::create(target_desc); + CHECK_RESULT(target_res); + auto out_res = BCETensorInfo::create(out_desc); + CHECK_RESULT(out_res); + + BCEWithLogitsInfo info; + info.logits = logits_res.take(); + info.target = target_res.take(); + info.out = out_res.take(); + info.reduction = reduction; + info.num_elements = info.logits.total_elements; + + // 1. 基本形状一致性校验 + if (info.logits.ndim != info.target.ndim) return INFINI_STATUS_BAD_TENSOR_SHAPE; + for (size_t i = 0; i < info.logits.ndim; ++i) { + if (info.logits.dims[i] != info.target.dims[i]) return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 2. 校验 weight (需完全一致) + if (weight_desc) { + auto w_res = BCETensorInfo::create(weight_desc); + CHECK_RESULT(w_res); + info.weight = w_res.take(); + + // 允许两种情况: + // 1. 完全一致 + // 2. weight 是一个向量,且长度等于 logits 的最后一维 (常见广播场景) + bool is_full_match = (info.weight.total_elements == info.logits.total_elements); + bool is_last_dim_match = (info.weight.total_elements == info.logits.last_dim()); + + if (!is_full_match && !is_last_dim_match) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + // 3. 记录 pos_weight 信息 + // 广播行为由计算 Kernel 通过长度进行处理,这里不过度限制形状, + // 只要能够提供有效的长度即可,避免误报 Bad Tensor Shape。 + if (pos_weight_desc) { + auto pw_res = BCETensorInfo::create(pos_weight_desc); + CHECK_RESULT(pw_res); + info.pos_weight = pw_res.take(); + } + + // 4. 
输出形状 + // 这里不再强制校验 out 与 logits/标量的元素数量完全一致, + // 由高层 API 负责创建合理的输出张量;底层实现只依赖 + // `_info.out` 的 stride 在 reduction==NONE 且逐元素写回时使用。 + + return utils::Result(std::move(info)); + } +}; + +} // namespace op::bce_with_logits + +#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.h b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.h new file mode 100644 index 000000000..a5086f882 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.h @@ -0,0 +1,17 @@ +#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_METAX_CUH__ +#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_METAX_CUH__ + +#include "../binary_cross_entropy_with_logits.h" + +/** + * 使用 bce_with_logits.h 中定义的 DESCRIPTOR 宏。 + * 这将在命名空间 op::bce_with_logits::metax 中生成针对 METAX 设备的 Descriptor 类。 + * * * 在 METAX 端的实现(.cu 文件)中,Opaque 结构体通常包含: + * - cudnnHandle_t: 如果使用 cuDNN 的算子实现。 + * - cudnnTensorDescriptor_t: 用于描述各输入输出张量的 cuDNN 格式。 + * - KernelConfig: 用于自定义 METAX Kernel 的网格(Grid)和线程块(Block)配置。 + * - dataType: 对应的 METAX 数据类型(如 METAX_R_32F)。 + */ +DESCRIPTOR(metax) + +#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_METAX_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca new file mode 100644 index 000000000..f98bb53ad --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/metax/binary_cross_entropy_with_logits_metax.maca @@ -0,0 +1,335 @@ +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_kernel_common.h" +#include "../../../devices/metax/metax_handle.h" +#include "binary_cross_entropy_with_logits_metax.h" +#include +#include + +namespace op::bce_with_logits::metax { + +using device::metax::indexToOffset; + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() = default; + +// 在 GPU 侧使用的简化张量信息(固定上限维度,支持 stride) +constexpr int BCE_MAX_DIMS = 8; + +struct BCETensorInfoDevice { + size_t ndim; + size_t shape[BCE_MAX_DIMS]; + ptrdiff_t strides[BCE_MAX_DIMS]; +}; + +static inline BCETensorInfoDevice make_device_info(const BCETensorInfo &info) { + BCETensorInfoDevice dev{}; + dev.ndim = info.ndim; + for (size_t i = 0; i < info.ndim && i < static_cast(BCE_MAX_DIMS); ++i) { + dev.shape[i] = info.dims[i]; + dev.strides[i] = info.stride[i]; + } + return dev; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + // METAX 实现支持 F16 / F32 / BF16 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = BCEWithLogitsInfo::create(out_desc, logits_desc, target_desc, + weight_desc, pos_weight_desc, reduction); + CHECK_RESULT(result); + + auto info = result.take(); + + // F16/BF16 在做归约时需要一个 float 标量 workspace 来累加 + size_t workspace_size = 0; + if (reduction != INFINIOP_REDUCTION_NONE && + (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16)) { + workspace_size = sizeof(float); + } + 
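+    // Workspace design note: for F32 the kernel accumulates the reduction directly
+    // into the f32 output scalar via atomicAdd, so no extra buffer is needed;
+    // F16/BF16 sums live in this single float slot and a small finalize kernel
+    // converts the result back to the output dtype, avoiding 16-bit atomics.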
+    *desc_ptr = new Descriptor(
+        dtype, std::move(info), reduction, workspace_size,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// Promote any scalar type to float
+template <typename T>
+__device__ __forceinline__ float to_float(T x) {
+    if constexpr (std::is_same_v<T, float>) {
+        return x;
+    } else if constexpr (std::is_same_v<T, half>) {
+        return __half2float(x);
+    } else if constexpr (std::is_same_v<T, cuda_bfloat16>) { // bfloat16 name from the MACA CUDA-compat headers
+        return __bfloat162float(x);
+    } else {
+        return static_cast<float>(x);
+    }
+}
+
+// Convert from float back to the target scalar type
+template <typename T>
+__device__ __forceinline__ T from_float(float x) {
+    if constexpr (std::is_same_v<T, float>) {
+        return x;
+    } else if constexpr (std::is_same_v<T, half>) {
+        return __float2half(x);
+    } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+        return __float2bfloat16(x);
+    } else {
+        return static_cast<T>(x);
+    }
+}
+
+// --- METAX kernel: numerically stable BCE with full stride support ---
+template <typename Tdata>
+__global__ void bce_logits_kernel(
+    void *out_raw,
+    const Tdata *logits,
+    const Tdata *target,
+    const Tdata *weight,
+    const Tdata *pos_weight,
+    BCETensorInfoDevice logits_info,
+    BCETensorInfoDevice target_info,
+    BCETensorInfoDevice weight_info,
+    BCETensorInfoDevice out_info,
+    size_t n,
+    size_t pos_weight_len,
+    infiniopReduction_t reduction) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    // Map the logical index to each tensor's offset (arbitrary strides supported)
+    size_t logits_offset = indexToOffset(idx, logits_info.ndim,
+                                         logits_info.shape, logits_info.strides);
+    size_t target_offset = indexToOffset(idx, target_info.ndim,
+                                         target_info.shape, target_info.strides);
+
+    float x = to_float(logits[logits_offset]);
+    float y = to_float(target[target_offset]);
+
+    float pw = 1.0f;
+    if (pos_weight && pos_weight_len > 0) {
+        // Broadcast over the last dimension: pos_weight is assumed contiguous and 1-D
+        size_t c = idx % pos_weight_len;
+        pw = to_float(pos_weight[c]);
+    }
+
+    float w = 1.0f;
+    if (weight && weight_info.ndim > 0) {
+        size_t weight_offset = indexToOffset(idx, weight_info.ndim,
+                                             weight_info.shape, weight_info.strides);
+        w = to_float(weight[weight_offset]);
+    }
+
+    // Numerically stable BCEWithLogits (aligned with the PyTorch formulation,
+    // matching the CPU and Moore backends):
+    //   max_val    = max(-x, 0)
+    //   log_weight = 1 + (pos_weight - 1) * y
+    //   loss       = (1 - y) * x + log_weight * (log(1 + exp(-|x|)) + max_val)
+    float max_val = fmaxf(-x, 0.0f);
+    float log_weight = 1.0f + (pw - 1.0f) * y;
+    float loss = (1.0f - y) * x
+               + log_weight * (logf(1.0f + expf(-fabsf(x))) + max_val);
+
+    loss *= w;
+
+    if (reduction == INFINIOP_REDUCTION_NONE) {
+        // Write back the per-element loss (honoring out strides)
+        size_t out_offset = indexToOffset(idx, out_info.ndim,
+                                          out_info.shape, out_info.strides);
+        auto *out_ptr = static_cast<Tdata *>(out_raw);
+        out_ptr[out_offset] = from_float<Tdata>(loss);
+    } else {
+        // For mean or sum, accumulate in float into the scalar slot
+        auto *out_accum = static_cast<float *>(out_raw);
+        atomicAdd(out_accum, static_cast<float>(loss));
+    }
+}
+
+// F32 mean reduction: divide the output scalar in place
+__global__ void bce_logits_mean_scale_kernel_f32(float *val, size_t count) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        *val /= static_cast<float>(count);
+    }
+}
+
+// F16/BF16 reduction: convert the float workspace value back to the target dtype
+template <typename Tdata>
+__global__ void bce_logits_reduce_finalize_kernel(
+    Tdata *out,
+    float *workspace,
+    size_t count,
+    int is_mean) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        float v = *workspace;
+        if (is_mean) {
+            v /= static_cast<float>(count);
+        }
+        *out = from_float<Tdata>(v);
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight,
+    void *stream) const {
+
+    mcStream_t custream = (mcStream_t)stream;
+    size_t n = _info.num_elements;
+
+    // F16/BF16 with a reduction needs the float workspace
+    if (_reduction != INFINIOP_REDUCTION_NONE &&
+        (_dtype == 
INFINI_DTYPE_BF16)) { + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + } + + int block = 256; + int grid = static_cast((n + block - 1) / block); + + // 构造 GPU 侧的张量信息(含 stride) + BCETensorInfoDevice logits_info = make_device_info(_info.logits); + BCETensorInfoDevice target_info = make_device_info(_info.target); + BCETensorInfoDevice out_info = make_device_info(_info.out); + BCETensorInfoDevice weight_info = {}; + if (_info.weight.total_elements != 0) { + weight_info = make_device_info(_info.weight); + } + + size_t pos_weight_len = _info.pos_weight.total_elements; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + // 如果是规约操作,计算前需将输出位置清零 + if (_reduction != INFINIOP_REDUCTION_NONE) { + mcMemsetAsync(out, 0, sizeof(float), custream); + } + + bce_logits_kernel<<>>( + out, + static_cast(logits), + static_cast(target), + static_cast(weight), + static_cast(pos_weight), + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction == INFINIOP_REDUCTION_MEAN) { + bce_logits_mean_scale_kernel_f32<<<1, 1, 0, custream>>>( + static_cast(out), n); + } + break; + } + case INFINI_DTYPE_F16: { + auto *logits_h = static_cast(logits); + auto *target_h = static_cast(target); + auto *weight_h = static_cast(weight); + auto *pos_weight_h = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + mcMemsetAsync(workspace_f, 0, sizeof(float), custream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_h, + target_h, + weight_h, + pos_weight_h, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, custream>>>( + static_cast(out), workspace_f, n, is_mean); + } + + break; + } + case INFINI_DTYPE_BF16: { + auto *logits_b = static_cast(logits); + auto *target_b = static_cast(target); + auto *weight_b = static_cast(weight); + auto *pos_weight_b = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + mcMemsetAsync(workspace_f, 0, sizeof(float), custream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_b, + target_b, + weight_b, + pos_weight_b, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 
1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, custream>>>( + static_cast(out), workspace_f, n, is_mean); + } + + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + mcError_t err = mcGetLastError(); + if (err != mcSuccess) { + return INFINI_STATUS_INTERNAL_ERROR; + } + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::bce_with_logits::metax \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.h b/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.h new file mode 100644 index 000000000..44997d088 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.h @@ -0,0 +1,16 @@ +#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_MOORE_H__ +#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_MOORE_H__ + +#include "../binary_cross_entropy_with_logits.h" + +/** + * 使用 bce_with_logits.h 中定义的 DESCRIPTOR 宏。 + * 这将在命名空间 op::bce_with_logits::moore 中生成针对 Moore 设备的 Descriptor 类。 + * * 在 Moore 端的实现(.mu 文件)中,Opaque 结构体通常包含: + * - musaHandle_t: 如果使用 MUSA 库的算子实现。 + * - KernelConfig: 用于 MUSA Kernel 的网格(Grid)和线程块(Block)配置。 + * - dataType: 对应的 MUSA 数据类型(如 MUSA_R_32F)。 + */ +DESCRIPTOR(moore) + +#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_MOORE_H__ \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.mu b/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.mu new file mode 100644 index 000000000..3654fdb49 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/moore/binary_cross_entropy_with_logits_moore.mu @@ -0,0 +1,321 @@ +#include "../../../devices/moore/moore_handle.h" +#include "../../../devices/moore/moore_kernel_common.h" +#include "binary_cross_entropy_with_logits_moore.h" +#include + +namespace op::bce_with_logits::moore { + +using device::moore::indexToOffset; + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() = default; + +// 摩尔线程平台通常与 CUDA 保持一致的维度上限 +constexpr int BCE_MAX_DIMS = 8; + +struct BCETensorInfoDevice { + size_t ndim; + size_t shape[BCE_MAX_DIMS]; + ptrdiff_t strides[BCE_MAX_DIMS]; +}; + +static inline BCETensorInfoDevice make_device_info(const BCETensorInfo &info) { + BCETensorInfoDevice dev{}; + dev.ndim = info.ndim; + for (size_t i = 0; i < info.ndim && i < static_cast(BCE_MAX_DIMS); ++i) { + dev.shape[i] = info.dims[i]; + dev.strides[i] = info.stride[i]; + } + return dev; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + // Moore 实现支持 F16 / F32 / BF16 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = BCEWithLogitsInfo::create(out_desc, logits_desc, target_desc, + weight_desc, pos_weight_desc, reduction); + CHECK_RESULT(result); + + auto info = result.take(); + + size_t workspace_size = 0; + if (reduction != INFINIOP_REDUCTION_NONE && + (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16)) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + dtype, std::move(info), reduction, workspace_size, + nullptr, + 
handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +// 针对 Moore 平台的类型提升逻辑 +template +__device__ __forceinline__ float to_float(T x) { + if constexpr (std::is_same_v) { + return x; + } else if constexpr (std::is_same_v) { + return __half2float(x); + } else if constexpr (std::is_same_v) { // MUSA 兼容 cuda_bfloat16 名称或使用内部 bf16 + return __bfloat162float(x); + } else { + return static_cast(x); + } +} + +template +__device__ __forceinline__ T from_float(float x) { + if constexpr (std::is_same_v) { + return x; + } else if constexpr (std::is_same_v) { + return __float2half(x); + } else if constexpr (std::is_same_v) { + return __float2bfloat16_rn(x); // Moore 平台推荐显式使用 _rn + } else { + return static_cast(x); + } +} + +// --- MUSA Kernel --- +template +__global__ void bce_logits_kernel( + void *out_raw, + const Tdata *logits, + const Tdata *target, + const Tdata *weight, + const Tdata *pos_weight, + BCETensorInfoDevice logits_info, + BCETensorInfoDevice target_info, + BCETensorInfoDevice weight_info, + BCETensorInfoDevice out_info, + size_t n, + size_t pos_weight_len, + infiniopReduction_t reduction) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + size_t logits_offset = indexToOffset(idx, logits_info.ndim, + logits_info.shape, logits_info.strides); + size_t target_offset = indexToOffset(idx, target_info.ndim, + target_info.shape, target_info.strides); + + float x = to_float(logits[logits_offset]); + float y = to_float(target[target_offset]); + + float pw = 1.0f; + if (pos_weight && pos_weight_len > 0) { + size_t c = idx % pos_weight_len; + pw = to_float(pos_weight[c]); + } + + float w = 1.0f; + if (weight && weight_info.ndim > 0) { + size_t weight_offset = indexToOffset(idx, weight_info.ndim, + weight_info.shape, weight_info.strides); + w = to_float(weight[weight_offset]); + } + + // 数值稳定的 BCEWithLogits 计算(对齐 PyTorch 实现): + // max_val = max(-x, 0) + // log_weight = 1 + (pos_weight - 1) * y + // loss = (1 - y) * x + log_weight * (log(1 + exp(-|x|)) + max_val) + float max_val = fmaxf(-x, 0.0f); + float log_weight = 1.0f + (pw - 1.0f) * y; + float loss = (1.0f - y) * x + + log_weight * (logf(1.0f + expf(-fabsf(x))) + max_val); + + loss *= w; + + if (reduction == INFINIOP_REDUCTION_NONE) { + size_t out_offset = indexToOffset(idx, out_info.ndim, + out_info.shape, out_info.strides); + auto *out_ptr = static_cast(out_raw); + out_ptr[out_offset] = from_float(loss); + } else { + auto *out_accum = static_cast(out_raw); + atomicAdd(out_accum, static_cast(loss)); + } +} + +__global__ void bce_logits_mean_scale_kernel_f32(float *val, size_t count) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + *val /= static_cast(count); + } +} + +template +__global__ void bce_logits_reduce_finalize_kernel( + Tdata *out, + float *workspace, + size_t count, + int is_mean) { + if (threadIdx.x == 0 && blockIdx.x == 0) { + float v = *workspace; + if (is_mean) { + v /= static_cast(count); + } + *out = from_float(v); + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *out, + const void *logits, + const void *target, + const void *weight, + const void *pos_weight, + void *stream) const { + + musaStream_t mustream = (musaStream_t)stream; + size_t n = _info.num_elements; + + if (_reduction != INFINIOP_REDUCTION_NONE && + (_dtype == INFINI_DTYPE_F16 || _dtype == INFINI_DTYPE_BF16)) { + if (workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + } + + int block = 256; + int grid = static_cast((n + 
block - 1) / block); + + BCETensorInfoDevice logits_info = make_device_info(_info.logits); + BCETensorInfoDevice target_info = make_device_info(_info.target); + BCETensorInfoDevice out_info = make_device_info(_info.out); + BCETensorInfoDevice weight_info = {}; + if (_info.weight.total_elements != 0) { + weight_info = make_device_info(_info.weight); + } + + size_t pos_weight_len = _info.pos_weight.total_elements; + + switch (_dtype) { + case INFINI_DTYPE_F32: { + if (_reduction != INFINIOP_REDUCTION_NONE) { + musaMemsetAsync(out, 0, sizeof(float), mustream); + } + + bce_logits_kernel<<>>( + out, + static_cast(logits), + static_cast(target), + static_cast(weight), + static_cast(pos_weight), + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction == INFINIOP_REDUCTION_MEAN) { + bce_logits_mean_scale_kernel_f32<<<1, 1, 0, mustream>>>( + static_cast(out), n); + } + break; + } + case INFINI_DTYPE_F16: { + auto *logits_h = static_cast(logits); + auto *target_h = static_cast(target); + auto *weight_h = static_cast(weight); + auto *pos_weight_h = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + musaMemsetAsync(workspace_f, 0, sizeof(float), mustream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_h, + target_h, + weight_h, + pos_weight_h, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, mustream>>>( + static_cast(out), workspace_f, n, is_mean); + } + break; + } + case INFINI_DTYPE_BF16: { + auto *logits_b = static_cast(logits); + auto *target_b = static_cast(target); + auto *weight_b = static_cast(weight); + auto *pos_weight_b = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + musaMemsetAsync(workspace_f, 0, sizeof(float), mustream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_b, + target_b, + weight_b, + pos_weight_b, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 
1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, mustream>>>( + static_cast(out), workspace_f, n, is_mean); + } + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::bce_with_logits::moore \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cu b/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cu new file mode 100644 index 000000000..276bab553 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cu @@ -0,0 +1,332 @@ +#include "../../../devices/nvidia/nvidia_handle.cuh" +#include "../../../devices/nvidia/nvidia_kernel_common.cuh" +#include "binary_cross_entropy_with_logits_nvidia.cuh" +#include +#include +#include + +namespace op::bce_with_logits::nvidia { + +using device::nvidia::indexToOffset; + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() = default; + +// 在 GPU 侧使用的简化张量信息(固定上限维度,支持 stride) +constexpr int BCE_MAX_DIMS = 8; + +struct BCETensorInfoDevice { + size_t ndim; + size_t shape[BCE_MAX_DIMS]; + ptrdiff_t strides[BCE_MAX_DIMS]; +}; + +static inline BCETensorInfoDevice make_device_info(const BCETensorInfo &info) { + BCETensorInfoDevice dev{}; + dev.ndim = info.ndim; + for (size_t i = 0; i < info.ndim && i < static_cast(BCE_MAX_DIMS); ++i) { + dev.shape[i] = info.dims[i]; + dev.strides[i] = info.stride[i]; + } + return dev; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + + auto handle = reinterpret_cast(handle_); + auto dtype = logits_desc->dtype(); + + // NVIDIA 实现支持 F16 / F32 / BF16 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + auto result = BCEWithLogitsInfo::create(out_desc, logits_desc, target_desc, + weight_desc, pos_weight_desc, reduction); + CHECK_RESULT(result); + + auto info = result.take(); + + // F16/BF16 在做归约时需要一个 float 标量 workspace 来累加 + size_t workspace_size = 0; + if (reduction != INFINIOP_REDUCTION_NONE && + (dtype == INFINI_DTYPE_F16 || dtype == INFINI_DTYPE_BF16)) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + dtype, std::move(info), reduction, workspace_size, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +// 将任意标量类型提升为 float +template +__device__ __forceinline__ float to_float(T x) { + if constexpr (std::is_same_v) { + return x; + } else if constexpr (std::is_same_v) { + return __half2float(x); + } else if constexpr (std::is_same_v) { + return __bfloat162float(x); + } else { + return static_cast(x); + } +} + +// 从 float 转回目标标量类型 +template +__device__ __forceinline__ T from_float(float x) { + if constexpr (std::is_same_v) { + return x; + } else if constexpr (std::is_same_v) { + return __float2half(x); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(x); + } else { + return static_cast(x); + } +} + +// --- CUDA Kernel: 支持 stride 的数值稳定 BCE 计算 --- +template +__global__ void bce_logits_kernel( + void *out_raw, + const Tdata *logits, + const Tdata *target, + const Tdata *weight, + const Tdata *pos_weight, + BCETensorInfoDevice logits_info, + BCETensorInfoDevice target_info, + 
BCETensorInfoDevice weight_info,
+    BCETensorInfoDevice out_info,
+    size_t n,
+    size_t pos_weight_len,
+    infiniopReduction_t reduction) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= n) return;
+
+    // Map the logical index to each tensor's offset (arbitrary strides supported)
+    size_t logits_offset = indexToOffset(idx, logits_info.ndim,
+                                         logits_info.shape, logits_info.strides);
+    size_t target_offset = indexToOffset(idx, target_info.ndim,
+                                         target_info.shape, target_info.strides);
+
+    float x = to_float(logits[logits_offset]);
+    float y = to_float(target[target_offset]);
+
+    float pw = 1.0f;
+    if (pos_weight && pos_weight_len > 0) {
+        // Broadcast over the last dimension: pos_weight is assumed contiguous and 1-D
+        size_t c = idx % pos_weight_len;
+        pw = to_float(pos_weight[c]);
+    }
+
+    float w = 1.0f;
+    if (weight && weight_info.ndim > 0) {
+        size_t weight_offset = indexToOffset(idx, weight_info.ndim,
+                                             weight_info.shape, weight_info.strides);
+        w = to_float(weight[weight_offset]);
+    }
+
+    // Numerically stable BCEWithLogits (aligned with the PyTorch formulation,
+    // matching the CPU and Moore backends):
+    //   max_val    = max(-x, 0)
+    //   log_weight = 1 + (pos_weight - 1) * y
+    //   loss       = (1 - y) * x + log_weight * (log(1 + exp(-|x|)) + max_val)
+    float max_val = fmaxf(-x, 0.0f);
+    float log_weight = 1.0f + (pw - 1.0f) * y;
+    float loss = (1.0f - y) * x
+               + log_weight * (logf(1.0f + expf(-fabsf(x))) + max_val);
+
+    loss *= w;
+
+    if (reduction == INFINIOP_REDUCTION_NONE) {
+        // Write back the per-element loss (honoring out strides)
+        size_t out_offset = indexToOffset(idx, out_info.ndim,
+                                          out_info.shape, out_info.strides);
+        auto *out_ptr = static_cast<Tdata *>(out_raw);
+        out_ptr[out_offset] = from_float<Tdata>(loss);
+    } else {
+        // For mean or sum, accumulate in float into the scalar slot
+        auto *out_accum = static_cast<float *>(out_raw);
+        atomicAdd(out_accum, static_cast<float>(loss));
+    }
+}
+
+// F32 mean reduction: divide the output scalar in place
+__global__ void bce_logits_mean_scale_kernel_f32(float *val, size_t count) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        *val /= static_cast<float>(count);
+    }
+}
+
+// F16/BF16 reduction: convert the float workspace value back to the target dtype
+template <typename Tdata>
+__global__ void bce_logits_reduce_finalize_kernel(
+    Tdata *out,
+    float *workspace,
+    size_t count,
+    int is_mean) {
+    if (threadIdx.x == 0 && blockIdx.x == 0) {
+        float v = *workspace;
+        if (is_mean) {
+            v /= static_cast<float>(count);
+        }
+        *out = from_float<Tdata>(v);
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *out,
+    const void *logits,
+    const void *target,
+    const void *weight,
+    const void *pos_weight,
+    void *stream) const {
+
+    cudaStream_t custream = (cudaStream_t)stream;
+    size_t n = _info.num_elements;
+
+    // F16/BF16 with a reduction needs the float workspace
+    if (_reduction != INFINIOP_REDUCTION_NONE &&
+        (_dtype == INFINI_DTYPE_F16 || _dtype == INFINI_DTYPE_BF16)) {
+        if (workspace_size < sizeof(float)) {
+            return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+        }
+    }
+
+    int block = 256;
+    int grid = static_cast<int>((n + block - 1) / block);
+
+    // Build the device-side tensor info (including strides)
+    BCETensorInfoDevice logits_info = make_device_info(_info.logits);
+    BCETensorInfoDevice target_info = make_device_info(_info.target);
+    BCETensorInfoDevice out_info = make_device_info(_info.out);
+    BCETensorInfoDevice weight_info = {};
+    if (_info.weight.total_elements != 0) {
+        weight_info = make_device_info(_info.weight);
+    }
+
+    size_t pos_weight_len = _info.pos_weight.total_elements;
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F32: {
+        // For reductions, zero the output scalar before accumulation
+        if (_reduction != INFINIOP_REDUCTION_NONE) {
+            cudaMemsetAsync(out, 0, sizeof(float), custream);
+        }
+
+        bce_logits_kernel<<<grid, block, 0, custream>>>(
+            out,
+            static_cast<const float *>(logits),
+            static_cast<const float *>(target),
+            static_cast<const float *>(weight),
+            static_cast<const float *>(pos_weight),
+            logits_info,
+            target_info,
+            weight_info,
+            out_info,
+            n,
+            pos_weight_len,
+            _reduction);
+
+        if (_reduction == INFINIOP_REDUCTION_MEAN) {
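+            // The finalize step runs on a single thread (<<<1, 1>>>); only one
+            // scalar is touched, so a wider launch would buy nothing here.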
bce_logits_mean_scale_kernel_f32<<<1, 1, 0, custream>>>( + static_cast(out), n); + } + break; + } + case INFINI_DTYPE_F16: { + auto *logits_h = static_cast(logits); + auto *target_h = static_cast(target); + auto *weight_h = static_cast(weight); + auto *pos_weight_h = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + cudaMemsetAsync(workspace_f, 0, sizeof(float), custream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_h, + target_h, + weight_h, + pos_weight_h, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, custream>>>( + static_cast(out), workspace_f, n, is_mean); + } + + break; + } + case INFINI_DTYPE_BF16: { + auto *logits_b = static_cast(logits); + auto *target_b = static_cast(target); + auto *weight_b = static_cast(weight); + auto *pos_weight_b = static_cast(pos_weight); + + void *out_raw = nullptr; + float *workspace_f = nullptr; + + if (_reduction == INFINIOP_REDUCTION_NONE) { + out_raw = out; + } else { + workspace_f = static_cast(workspace); + cudaMemsetAsync(workspace_f, 0, sizeof(float), custream); + out_raw = workspace_f; + } + + bce_logits_kernel<<>>( + out_raw, + logits_b, + target_b, + weight_b, + pos_weight_b, + logits_info, + target_info, + weight_info, + out_info, + n, + pos_weight_len, + _reduction); + + if (_reduction != INFINIOP_REDUCTION_NONE) { + int is_mean = (_reduction == INFINIOP_REDUCTION_MEAN) ? 1 : 0; + bce_logits_reduce_finalize_kernel<<<1, 1, 0, custream>>>( + static_cast(out), workspace_f, n, is_mean); + } + + break; + } + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + auto err = cudaGetLastError(); + return (err == cudaSuccess) ? 
INFINI_STATUS_SUCCESS : INFINI_STATUS_INTERNAL_ERROR; +} + +} // namespace op::bce_with_logits::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cuh b/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cuh new file mode 100644 index 000000000..9493937c8 --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/nvidia/binary_cross_entropy_with_logits_nvidia.cuh @@ -0,0 +1,17 @@ +#ifndef __BINARY_CROSS_ENTROPY_WITH_LOGITS_NVIDIA_CUH__ +#define __BINARY_CROSS_ENTROPY_WITH_LOGITS_NVIDIA_CUH__ + +#include "../binary_cross_entropy_with_logits.h" + +/** + * 使用 bce_with_logits.h 中定义的 DESCRIPTOR 宏。 + * 这将在命名空间 op::bce_with_logits::nvidia 中生成针对 NVIDIA 设备的 Descriptor 类。 + * * * 在 NVIDIA 端的实现(.cu 文件)中,Opaque 结构体通常包含: + * - cudnnHandle_t: 如果使用 cuDNN 的算子实现。 + * - cudnnTensorDescriptor_t: 用于描述各输入输出张量的 cuDNN 格式。 + * - KernelConfig: 用于自定义 CUDA Kernel 的网格(Grid)和线程块(Block)配置。 + * - dataType: 对应的 CUDA 数据类型(如 CUDA_R_32F)。 + */ +DESCRIPTOR(nvidia) + +#endif // __BINARY_CROSS_ENTROPY_WITH_LOGITS_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/binary_cross_entropy_with_logits/operator.cc b/src/infiniop/ops/binary_cross_entropy_with_logits/operator.cc new file mode 100644 index 000000000..77c97d98b --- /dev/null +++ b/src/infiniop/ops/binary_cross_entropy_with_logits/operator.cc @@ -0,0 +1,233 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/binary_cross_entropy_with_logits.h" + +// 引入各硬件后端的 Descriptor 定义 +#ifdef ENABLE_CPU_API +#include "cpu/binary_cross_entropy_with_logits_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API) +#include "nvidia/binary_cross_entropy_with_logits_nvidia.cuh" +#endif +#ifdef ENABLE_CAMBRICON_API +#include "bang/binary_cross_entropy_with_logits_bang.h" +#endif +#ifdef ENABLE_ASCEND_API +#include "ascend/binary_cross_entropy_with_logits_ascend.h" +#endif +#ifdef ENABLE_METAX_API +#include "metax/binary_cross_entropy_with_logits_metax.h" +#endif +#ifdef ENABLE_MOORE_API +#include "moore/binary_cross_entropy_with_logits_moore.h" +#endif +#ifdef ENABLE_KUNLUN_API +#include "kunlun/binary_cross_entropy_with_logits_kunlun.h" +#endif + +// ----------------------------------------------------------------------------- +// 1. 
创建描述符 +// ----------------------------------------------------------------------------- +__C infiniStatus_t infiniopCreateBCEWithLogitsDescriptor( + infiniopHandle_t handle, + infiniopBCEWithLogitsDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t logits_desc, + infiniopTensorDescriptor_t target_desc, + infiniopTensorDescriptor_t weight_desc, + infiniopTensorDescriptor_t pos_weight_desc, + infiniopReduction_t reduction) { + +#define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::bce_with_logits::NAMESPACE::Descriptor::create(handle, \ + reinterpret_cast(desc_ptr), \ + out_desc, logits_desc, target_desc, weight_desc, pos_weight_desc, reduction) + + switch (handle->device) { +#ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CREATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_CAMBRICON_API + CREATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CREATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_KUNLUN_API + CREATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CREATE +} + +// ----------------------------------------------------------------------------- +// 2. 获取 Workspace 大小 +// ----------------------------------------------------------------------------- +__C infiniStatus_t infiniopGetBCEWithLogitsWorkspaceSize( + infiniopBCEWithLogitsDescriptor_t desc, + size_t *size) { + +#define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + GET(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_CAMBRICON_API + GET(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + GET(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_KUNLUN_API + GET(INFINI_DEVICE_KUNLUN, kunlun); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef GET +} + +// ----------------------------------------------------------------------------- +// 3. 
执行计算 +// ----------------------------------------------------------------------------- +__C infiniStatus_t infiniopBCEWithLogits( + infiniopBCEWithLogitsDescriptor_t desc, + void *workspace, size_t workspace_size, + void *out, + const void *logits, + const void *target, + const void *weight, + const void *pos_weight, + void *stream) { + +#define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, out, logits, target, weight, pos_weight, stream) + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + CALCULATE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_CAMBRICON_API + CALCULATE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + CALCULATE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_KUNLUN_API + CALCULATE(INFINI_DEVICE_KUNLUN, kunlun); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef CALCULATE +} + +// ----------------------------------------------------------------------------- +// 4. 销毁描述符 +// ----------------------------------------------------------------------------- +__C infiniStatus_t infiniopDestroyBCEWithLogitsDescriptor(infiniopBCEWithLogitsDescriptor_t desc) { + +#define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS; + + switch (desc->device_type) { +#ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); +#endif +#ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); +#endif +#ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); +#endif +#ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); +#endif +#ifdef ENABLE_HYGON_API + DELETE(INFINI_DEVICE_HYGON, nvidia); +#endif +#ifdef ENABLE_CAMBRICON_API + DELETE(INFINI_DEVICE_CAMBRICON, bang); +#endif +#ifdef ENABLE_ASCEND_API + DELETE(INFINI_DEVICE_ASCEND, ascend); +#endif +#ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); +#endif +#ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); +#endif +#ifdef ENABLE_KUNLUN_API + DELETE(INFINI_DEVICE_KUNLUN, kunlun); +#endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } +#undef DELETE +} \ No newline at end of file diff --git a/src/infiniop/ops/cdist/cdist.h b/src/infiniop/ops/cdist/cdist.h new file mode 100644 index 000000000..3d5e532a5 --- /dev/null +++ b/src/infiniop/ops/cdist/cdist.h @@ -0,0 +1,63 @@ +#ifndef __CDIST_H__ +#define __CDIST_H__ + +#include "../../operator.h" +#include "info.h" + +/** + * # 关于 `cdist` 算子描述符的说明 + * * 仿照 GEMM 的 PImpl (Opaque) 设计模式,将硬件相关的执行上下文(如 CUDA Handle、计算流等) + * 隐藏在 `Opaque` 结构体中,确保头文件在不同后端(CPU/NVIDIA/Ascend)间的一致性。 + */ + +#define DESCRIPTOR(NAMESPACE) \ + \ + namespace op::cdist::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + infiniDtype_t _dtype; \ + CdistInfo _info; /* 包含 M, N, D 维度信息 */ \ + size_t _workspace_size; \ + double _p; /* 范数阶数,创建时固定 */ \ + \ + Descriptor( \ + infiniDtype_t dtype, \ + CdistInfo info, \ + double p, \ + size_t workspace_size_, \ + Opaque *opaque, \ + infiniDevice_t 
device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _dtype(dtype), \ + _info(info), \ + _workspace_size(workspace_size_), \ + _p(p) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t y_desc, /* 输出 (M, N) */ \ + infiniopTensorDescriptor_t x1_desc, /* 输入 (M, D) */ \ + infiniopTensorDescriptor_t x2_desc, /* 输入 (N, D) */ \ + double p \ + ); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *y, /* 结果矩阵 */ \ + const void *x1, \ + const void *x2, \ + void *stream) const; \ + }; \ + } + +#endif // __CDIST_H__ \ No newline at end of file diff --git a/src/infiniop/ops/cdist/cpu/cdist_cpu.cc b/src/infiniop/ops/cdist/cpu/cdist_cpu.cc new file mode 100644 index 000000000..9df32a04d --- /dev/null +++ b/src/infiniop/ops/cdist/cpu/cdist_cpu.cc @@ -0,0 +1,126 @@ +#include "cdist_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include + +namespace op::cdist::cpu { + +Descriptor::~Descriptor() = default; + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t x2_desc, + double p) { + + auto handle = reinterpret_cast(handle_); + auto dtype = y_desc->dtype(); + + // 1. 类型检查 + CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16); + + // 2. 解析维度信息 (CdistInfo 逻辑已在之前定义) + auto result = CdistInfo::create(y_desc, x1_desc, x2_desc); + CHECK_RESULT(result); + + // 3. 实例化描述符,CPU 版通常不需要 workspace + *desc_ptr = new Descriptor( + dtype, result.take(), p, 0, + nullptr, + handle->device, handle->device_id); + + return INFINI_STATUS_SUCCESS; +} + +/** + * 核心计算模板:处理不同数据类型 + */ +template +void calculate_dist( + const CdistInfo &info, + void *y, + const void *x1, + const void *x2, + double p) { + + // [Image of the coordinate-wise subtraction and p-norm distance formula for cdist] + +#pragma omp parallel for collapse(2) + for (ptrdiff_t b = 0; b < ptrdiff_t(info.batch); ++b) { + for (ptrdiff_t i = 0; i < ptrdiff_t(info.m); ++i) { + for (ptrdiff_t j = 0; j < ptrdiff_t(info.n); ++j) { + + // 定位输出位置 y[b, i, j] + auto y_ptr = reinterpret_cast(y) + + b * info.y_matrix.stride + + i * info.y_matrix.row_stride + + j * info.y_matrix.col_stride; + + // 定位向量位置 x1[b, i, :] 和 x2[b, j, :] + auto x1_vec = reinterpret_cast(x1) + + b * info.x1_matrix.stride + + i * info.x1_matrix.row_stride; + auto x2_vec = reinterpret_cast(x2) + + b * info.x2_matrix.stride + + j * info.x2_matrix.row_stride; + + double dist = 0.0; + + // 遍历特征维度 D + for (size_t k = 0; k < info.d; ++k) { + float v1 = utils::cast(*(x1_vec + k * info.x1_matrix.col_stride)); + float v2 = utils::cast(*(x2_vec + k * info.x2_matrix.col_stride)); + float diff = std::abs(v1 - v2); + + if (p == 1.0) { + dist += diff; + } else if (p == 2.0) { + dist += diff * diff; + } else if (std::isinf(p)) { + dist = std::max((double)dist, (double)diff); + } else { + dist += std::pow((double)diff, p); + } + } + + // 最终距离处理 + if (p == 2.0) { + dist = std::sqrt(dist); + } else if (!std::isinf(p) && p != 1.0) { + dist = std::pow(dist, 1.0 / p); + } + + *y_ptr = utils::cast(static_cast(dist)); + } + } + } +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *y, + const void *x1, + const void *x2, + void *stream) const { + + switch 
(_dtype) { + case INFINI_DTYPE_F16: + cpu::calculate_dist(_info, y, x1, x2, _p); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_BF16: + cpu::calculate_dist(_info, y, x1, x2, _p); + return INFINI_STATUS_SUCCESS; + + case INFINI_DTYPE_F32: + cpu::calculate_dist(_info, y, x1, x2, _p); + return INFINI_STATUS_SUCCESS; + + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } +} + +} // namespace op::cdist::cpu \ No newline at end of file diff --git a/src/infiniop/ops/cdist/cpu/cdist_cpu.h b/src/infiniop/ops/cdist/cpu/cdist_cpu.h new file mode 100644 index 000000000..07d182678 --- /dev/null +++ b/src/infiniop/ops/cdist/cpu/cdist_cpu.h @@ -0,0 +1,11 @@ +#ifndef __CDIST_CPU_H__ +#define __CDIST_CPU_H__ + +#include "../cdist.h" + +// 使用 cdist.h 中定义的 DESCRIPTOR 宏 +// 这将在命名空间 op::cdist::cpu 中生成 Descriptor 类 +// 该类包含对 CdistInfo 的引用以及 create/calculate 等接口 +DESCRIPTOR(cpu) + +#endif // __CDIST_CPU_H__ \ No newline at end of file diff --git a/src/infiniop/ops/cdist/info.h b/src/infiniop/ops/cdist/info.h new file mode 100644 index 000000000..4425b1cb0 --- /dev/null +++ b/src/infiniop/ops/cdist/info.h @@ -0,0 +1,114 @@ +#ifndef __CDIST_INFO_H__ +#define __CDIST_INFO_H__ + +#include "../../../utils.h" +#include "../../operator.h" +#include "../../tensor.h" +#include + +namespace op::cdist { + +/** + * 借用 BlasMatrix 的概念来描述 cdist 的输入输出矩阵 + * x1: (Batch, M, D) + * x2: (Batch, N, D) + * y: (Batch, M, N) + */ +struct CdistMatrix { + size_t ndim; + size_t batch; + ptrdiff_t stride; // Batch 之间的步长 + size_t rows; // M 或 N + size_t cols; // D (特征维度) 或结果中的 N + ptrdiff_t row_stride; + ptrdiff_t col_stride; + + static utils::Result create(infiniopTensorDescriptor_t layout) { + CdistMatrix ans; + auto ndim = layout->ndim(); + + if (ndim == 2) { + ans.ndim = 2; + ans.batch = 1; + ans.stride = 0; + ans.rows = layout->dim(0); + ans.cols = layout->dim(1); + ans.row_stride = layout->stride(0); + ans.col_stride = layout->stride(1); + } else if (ndim == 3) { + ans.ndim = 3; + ans.batch = layout->dim(0); + ans.stride = ans.batch == 1 ? 0 : layout->stride(0); + ans.rows = layout->dim(1); + ans.cols = layout->dim(2); + ans.row_stride = layout->stride(1); + ans.col_stride = layout->stride(2); + } else { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + return utils::Result(ans); + } + + bool match_batch(size_t _batch) const { + return batch == _batch || batch == 1; + } +}; + +class CdistInfo { + CdistInfo() = default; + +public: + CdistMatrix x1_matrix; + CdistMatrix x2_matrix; + CdistMatrix y_matrix; + + size_t m, n, d, batch; + + static utils::Result create( + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x1_desc, + infiniopTensorDescriptor_t x2_desc) { + + auto x1_res = CdistMatrix::create(x1_desc); + CHECK_RESULT(x1_res); + + auto x2_res = CdistMatrix::create(x2_desc); + CHECK_RESULT(x2_res); + + auto y_res = CdistMatrix::create(y_desc); + CHECK_RESULT(y_res); + + auto x1 = x1_res.take(); + auto x2 = x2_res.take(); + auto y = y_res.take(); + + // 1. 维度校验 + // x1(M, D), x2(N, D) -> y(M, N) + if (x1.cols != x2.cols) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; // 特征维度 D 必须一致 + } + if (y.rows != x1.rows || y.cols != x2.rows) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; // 输出形状必须为 M x N + } + + // 2. 
diff --git a/src/infiniop/ops/cdist/cpu/cdist_cpu.h b/src/infiniop/ops/cdist/cpu/cdist_cpu.h
new file mode 100644
index 000000000..07d182678
--- /dev/null
+++ b/src/infiniop/ops/cdist/cpu/cdist_cpu.h
@@ -0,0 +1,11 @@
+#ifndef __CDIST_CPU_H__
+#define __CDIST_CPU_H__
+
+#include "../cdist.h"
+
+// Uses the DESCRIPTOR macro defined in cdist.h.
+// It generates the Descriptor class in the namespace op::cdist::cpu;
+// the class holds the CdistInfo plus the create/calculate interface.
+DESCRIPTOR(cpu)
+
+#endif // __CDIST_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/cdist/info.h b/src/infiniop/ops/cdist/info.h
new file mode 100644
index 000000000..4425b1cb0
--- /dev/null
+++ b/src/infiniop/ops/cdist/info.h
@@ -0,0 +1,114 @@
+#ifndef __CDIST_INFO_H__
+#define __CDIST_INFO_H__
+
+#include "../../../utils.h"
+#include "../../operator.h"
+#include "../../tensor.h"
+#include <cstddef>
+
+namespace op::cdist {
+
+/**
+ * Borrows the BlasMatrix idea to describe the cdist inputs and output:
+ * x1: (Batch, M, D)
+ * x2: (Batch, N, D)
+ * y:  (Batch, M, N)
+ */
+struct CdistMatrix {
+    size_t ndim;
+    size_t batch;
+    ptrdiff_t stride;     // stride between batches
+    size_t rows;          // M or N
+    size_t cols;          // D (feature dim), or N in the result
+    ptrdiff_t row_stride;
+    ptrdiff_t col_stride;
+
+    static utils::Result<CdistMatrix> create(infiniopTensorDescriptor_t layout) {
+        CdistMatrix ans;
+        auto ndim = layout->ndim();
+
+        if (ndim == 2) {
+            ans.ndim = 2;
+            ans.batch = 1;
+            ans.stride = 0;
+            ans.rows = layout->dim(0);
+            ans.cols = layout->dim(1);
+            ans.row_stride = layout->stride(0);
+            ans.col_stride = layout->stride(1);
+        } else if (ndim == 3) {
+            ans.ndim = 3;
+            ans.batch = layout->dim(0);
+            ans.stride = ans.batch == 1 ? 0 : layout->stride(0);
+            ans.rows = layout->dim(1);
+            ans.cols = layout->dim(2);
+            ans.row_stride = layout->stride(1);
+            ans.col_stride = layout->stride(2);
+        } else {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        return utils::Result<CdistMatrix>(ans);
+    }
+
+    bool match_batch(size_t _batch) const {
+        return batch == _batch || batch == 1;
+    }
+};
+
+class CdistInfo {
+public:
+    CdistMatrix x1_matrix;
+    CdistMatrix x2_matrix;
+    CdistMatrix y_matrix;
+
+    size_t m, n, d, batch;
+
+    static utils::Result<CdistInfo> create(
+        infiniopTensorDescriptor_t y_desc,
+        infiniopTensorDescriptor_t x1_desc,
+        infiniopTensorDescriptor_t x2_desc) {
+
+        auto x1_res = CdistMatrix::create(x1_desc);
+        CHECK_RESULT(x1_res);
+
+        auto x2_res = CdistMatrix::create(x2_desc);
+        CHECK_RESULT(x2_res);
+
+        auto y_res = CdistMatrix::create(y_desc);
+        CHECK_RESULT(y_res);
+
+        auto x1 = x1_res.take();
+        auto x2 = x2_res.take();
+        auto y = y_res.take();
+
+        // 1. Shape checks
+        // x1 (M, D), x2 (N, D) -> y (M, N)
+        if (x1.cols != x2.cols) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE; // feature dims D must match
+        }
+        if (y.rows != x1.rows || y.cols != x2.rows) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE; // output must be M x N
+        }
+
+        // 2. Batch check
+        size_t batch_size = y.batch;
+        if (!x1.match_batch(batch_size) || !x2.match_batch(batch_size)) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+
+        size_t m = x1.rows;
+        size_t n = x2.rows;
+        size_t d = x1.cols;
+
+        return utils::Result<CdistInfo>(CdistInfo{
+            x1, x2, y,
+            m, n, d, batch_size});
+    }
+};
+
+} // namespace op::cdist
+
+#endif // __CDIST_INFO_H__
\ No newline at end of file
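CdistMatrix::create accepts 2-D and 3-D layouts, and match_batch lets a singleton batch broadcast against the output's batch (its stride is forced to 0 in that case). Concretely, the validation admits combinations such as:

    x1 (32, 128)       x2 (64, 128)       y (32, 64)       // plain 2-D
    x1 (4, 32, 128)    x2 (4, 64, 128)    y (4, 32, 64)    // batched
    x1 (4, 32, 128)    x2 (1, 64, 128)    y (4, 32, 64)    // x2 broadcast over batch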
diff --git a/src/infiniop/ops/cdist/metax/cdist_metax.h b/src/infiniop/ops/cdist/metax/cdist_metax.h
new file mode 100644
index 000000000..59d427b8f
--- /dev/null
+++ b/src/infiniop/ops/cdist/metax/cdist_metax.h
@@ -0,0 +1,16 @@
+#ifndef __CDIST_METAX_CUH__
+#define __CDIST_METAX_CUH__
+
+#include "../cdist.h"
+
+/**
+ * Uses the DESCRIPTOR macro defined in cdist.h.
+ * It generates the Descriptor class for METAX devices in the
+ * namespace op::cdist::metax.
+ *
+ * In a full METAX implementation the Opaque struct would typically hold:
+ * - a BLAS handle (the METAX analogue of cuBLAS) for the p = 2.0 GEMM fast path,
+ * - the stream the task runs on,
+ * - launch-configuration parameters for custom kernels.
+ */
+DESCRIPTOR(metax)
+
+#endif // __CDIST_METAX_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/cdist/metax/cdist_metax.maca b/src/infiniop/ops/cdist/metax/cdist_metax.maca
new file mode 100644
index 000000000..104a06a01
--- /dev/null
+++ b/src/infiniop/ops/cdist/metax/cdist_metax.maca
@@ -0,0 +1,168 @@
+#include <cmath>
+#include "../../../devices/metax/metax_handle.h"
+#include "cdist_metax.h"
+namespace op::cdist::metax {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+
+    // The METAX backend currently supports F32 only (the tests use F32 as well)
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+
+    auto result = CdistInfo::create(y_desc, x1_desc, x2_desc);
+    CHECK_RESULT(result);
+
+    // The current implementation uses no workspace
+    *desc_ptr = new Descriptor(
+        dtype, result.take(), p, 0,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// --- Kernel 1: L2 epilogue ---
+// Kept as a placeholder; the GEMM fast path is not wired up yet
+template <typename T>
+__global__ void cdist_l2_epilogue_kernel(T *y, const T *x1_norm, const T *x2_norm,
+                                         int M, int N, int batch_stride_y) {
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i < M && j < N) {
+        int idx = b * batch_stride_y + i * N + j;
+        // GEMM has already written -2 * x1 * x2^T into y
+        float val = (float)x1_norm[b * M + i] + (float)x2_norm[b * N + j] + (float)y[idx];
+        y[idx] = (T)sqrtf(fmaxf(val, 0.0f));
+    }
+}
+
+// --- Kernel 2: generic p-norm (F32, arbitrary strides) ---
+__global__ void cdist_generic_kernel_f32(
+    float *y,
+    const float *x1,
+    const float *x2,
+    size_t m,
+    size_t n,
+    size_t d,
+    ptrdiff_t x1_stride,
+    ptrdiff_t x1_row_stride,
+    ptrdiff_t x1_col_stride,
+    ptrdiff_t x2_stride,
+    ptrdiff_t x2_row_stride,
+    ptrdiff_t x2_col_stride,
+    ptrdiff_t y_stride,
+    ptrdiff_t y_row_stride,
+    ptrdiff_t y_col_stride,
+    double p) {
+
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i >= (int)m || j >= (int)n) {
+        return;
+    }
+
+    // Locate the output element y[b, i, j]
+    float *y_ptr = y
+                 + b * y_stride
+                 + i * y_row_stride
+                 + j * y_col_stride;
+
+    // Locate the vectors x1[b, i, :] and x2[b, j, :]
+    const float *x1_vec = x1
+                        + b * x1_stride
+                        + i * x1_row_stride;
+    const float *x2_vec = x2
+                        + b * x2_stride
+                        + j * x2_row_stride;
+
+    double dist = 0.0;
+
+    for (size_t k = 0; k < d; ++k) {
+        float v1 = *(x1_vec + k * x1_col_stride);
+        float v2 = *(x2_vec + k * x2_col_stride);
+        float diff = fabsf(v1 - v2);
+
+        if (p == 1.0) {
+            dist += diff;
+        } else if (p == 2.0) {
+            dist += diff * diff;
+        } else if (isinf(p)) {
+            dist = fmaxf((float)dist, diff);
+        } else {
+            dist += powf((float)diff, (float)p);
+        }
+    }
+
+    if (p == 2.0) {
+        dist = sqrtf((float)dist);
+    } else if (!isinf(p) && p != 1.0) {
+        dist = powf((float)dist, 1.0f / (float)p);
+    }
+
+    *y_ptr = (float)dist;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    if (_dtype != INFINI_DTYPE_F32) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    mcStream_t custream = (mcStream_t)stream;
+    dim3 block(16, 16);
+    dim3 grid(
+        static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
+        static_cast<unsigned int>((_info.m + block.y - 1) / block.y),
+        static_cast<unsigned int>(_info.batch));
+
+    cdist_generic_kernel_f32<<<grid, block, 0, custream>>>(
+        static_cast<float *>(y),
+        static_cast<const float *>(x1),
+        static_cast<const float *>(x2),
+        _info.m,
+        _info.n,
+        _info.d,
+        _info.x1_matrix.stride,
+        _info.x1_matrix.row_stride,
+        _info.x1_matrix.col_stride,
+        _info.x2_matrix.stride,
+        _info.x2_matrix.row_stride,
+        _info.x2_matrix.col_stride,
+        _info.y_matrix.stride,
+        _info.y_matrix.row_stride,
+        _info.y_matrix.col_stride,
+        _p);
+
+    auto err = mcGetLastError();
+    if (err != mcSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cdist::metax
\ No newline at end of file
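The L2 epilogue kernel kept above (unused for now) encodes the decomposition that makes the p = 2 case GEMM-friendly:

$$ \|u-v\|_2^2 = \|u\|_2^2 + \|v\|_2^2 - 2\,u\cdot v $$

i.e. a batched GEMM writes $-2\,x_1 x_2^\top$ into y, and the epilogue adds the row-wise squared norms and takes a clamped square root; the fmaxf(val, 0.0f) guards against tiny negative values produced by floating-point cancellation.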
diff --git a/src/infiniop/ops/cdist/moore/cdist_moore.h b/src/infiniop/ops/cdist/moore/cdist_moore.h
new file mode 100644
index 000000000..d84e2ac60
--- /dev/null
+++ b/src/infiniop/ops/cdist/moore/cdist_moore.h
@@ -0,0 +1,16 @@
+#ifndef __CDIST_MOORE_H__
+#define __CDIST_MOORE_H__
+
+#include "../cdist.h"
+
+/**
+ * Uses the DESCRIPTOR macro defined in cdist.h.
+ * It generates the Descriptor class for Moore devices in the
+ * namespace op::cdist::moore.
+ *
+ * In a full Moore implementation the Opaque struct would typically hold:
+ * - mublasHandle_t: for the p = 2.0 GEMM fast path (Moore's cuBLAS counterpart),
+ * - musaStream_t: the stream the task runs on,
+ * - launch-configuration parameters for custom kernels.
+ */
+DESCRIPTOR(moore)
+
+#endif // __CDIST_MOORE_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/cdist/moore/cdist_moore.mu b/src/infiniop/ops/cdist/moore/cdist_moore.mu
new file mode 100644
index 000000000..9149cfb46
--- /dev/null
+++ b/src/infiniop/ops/cdist/moore/cdist_moore.mu
@@ -0,0 +1,145 @@
+#include <musa_runtime.h>
+#include "../../../devices/moore/moore_handle.h"
+#include "cdist_moore.h"
+#include <cmath>
+
+namespace op::cdist::moore {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p) {
+
+    // 1. Convert to the Moore handle
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+
+    // Matches the NVIDIA version: only F32 for now
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+
+    auto result = CdistInfo::create(y_desc, x1_desc, x2_desc);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        dtype, result.take(), p, 0,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// --- Kernel: generic p-norm (MUSA F32 implementation) ---
+__global__ void cdist_generic_kernel_f32(
+    float *y,
+    const float *x1,
+    const float *x2,
+    size_t m,
+    size_t n,
+    size_t d,
+    ptrdiff_t x1_stride,
+    ptrdiff_t x1_row_stride,
+    ptrdiff_t x1_col_stride,
+    ptrdiff_t x2_stride,
+    ptrdiff_t x2_row_stride,
+    ptrdiff_t x2_col_stride,
+    ptrdiff_t y_stride,
+    ptrdiff_t y_row_stride,
+    ptrdiff_t y_col_stride,
+    double p) {
+
+    // 2. MUSA supports 3-D thread indexing just like CUDA
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i >= (int)m || j >= (int)n) {
+        return;
+    }
+
+    // Locate the output element y[b, i, j]
+    float *y_ptr = y + b * y_stride + i * y_row_stride + j * y_col_stride;
+
+    // Locate the input vectors
+    const float *x1_vec = x1 + b * x1_stride + i * x1_row_stride;
+    const float *x2_vec = x2 + b * x2_stride + j * x2_row_stride;
+
+    double dist = 0.0;
+
+    for (size_t k = 0; k < d; ++k) {
+        float v1 = *(x1_vec + k * x1_col_stride);
+        float v2 = *(x2_vec + k * x2_col_stride);
+        float diff = fabsf(v1 - v2);
+
+        if (p == 1.0) {
+            dist += (double)diff;
+        } else if (p == 2.0) {
+            dist += (double)diff * diff;
+        } else if (isinf(p)) {
+            dist = fmaxf((float)dist, diff);
+        } else {
+            dist += powf(diff, (float)p);
+        }
+    }
+
+    if (p == 2.0) {
+        dist = sqrtf((float)dist);
+    } else if (!isinf(p) && p != 1.0) {
+        dist = powf((float)dist, 1.0f / (float)p);
+    }
+
+    *y_ptr = (float)dist;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    if (_dtype != INFINI_DTYPE_F32) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    // 3. Switch to musaStream_t
+    musaStream_t mustream = reinterpret_cast<musaStream_t>(stream);
+
+    // Keep the 16x16 block size; it is a good generic choice on MUSA too
+    dim3 block(16, 16);
+    dim3 grid(
+        static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
+        static_cast<unsigned int>((_info.m + block.y - 1) / block.y),
+        static_cast<unsigned int>(_info.batch));
+
+    cdist_generic_kernel_f32<<<grid, block, 0, mustream>>>(
+        static_cast<float *>(y),
+        static_cast<const float *>(x1),
+        static_cast<const float *>(x2),
+        _info.m,
+        _info.n,
+        _info.d,
+        _info.x1_matrix.stride,
+        _info.x1_matrix.row_stride,
+        _info.x1_matrix.col_stride,
+        _info.x2_matrix.stride,
+        _info.x2_matrix.row_stride,
+        _info.x2_matrix.col_stride,
+        _info.y_matrix.stride,
+        _info.y_matrix.row_stride,
+        _info.y_matrix.col_stride,
+        _p);
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cdist::moore
\ No newline at end of file
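Unlike the METAX and NVIDIA versions, this file returns success without polling the runtime for a launch error. If parity is wanted, the MUSA runtime mirrors the CUDA call; a sketch, assuming the CUDA-compatible MUSA API:

    auto err = musaGetLastError();
    if (err != musaSuccess) {
        return INFINI_STATUS_INTERNAL_ERROR;
    }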
diff --git a/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu b/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu
new file mode 100644
index 000000000..4d8e5726e
--- /dev/null
+++ b/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cu
@@ -0,0 +1,168 @@
+#include <cmath>
+#include "../../../devices/nvidia/nvidia_handle.cuh"
+#include "cdist_nvidia.cuh"
+namespace op::cdist::nvidia {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = y_desc->dtype();
+
+    // The NVIDIA backend currently supports F32 only (the tests use F32 as well)
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F32);
+
+    auto result = CdistInfo::create(y_desc, x1_desc, x2_desc);
+    CHECK_RESULT(result);
+
+    // The current implementation uses no workspace
+    *desc_ptr = new Descriptor(
+        dtype, result.take(), p, 0,
+        nullptr,
+        handle->device, handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// --- Kernel 1: L2 epilogue ---
+// Kept as a placeholder; the GEMM fast path is not wired up yet
+template <typename T>
+__global__ void cdist_l2_epilogue_kernel(T *y, const T *x1_norm, const T *x2_norm,
+                                         int M, int N, int batch_stride_y) {
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i < M && j < N) {
+        int idx = b * batch_stride_y + i * N + j;
+        // GEMM has already written -2 * x1 * x2^T into y
+        float val = (float)x1_norm[b * M + i] + (float)x2_norm[b * N + j] + (float)y[idx];
+        y[idx] = (T)sqrtf(fmaxf(val, 0.0f));
+    }
+}
+
+// --- Kernel 2: generic p-norm (F32, arbitrary strides) ---
+__global__ void cdist_generic_kernel_f32(
+    float *y,
+    const float *x1,
+    const float *x2,
+    size_t m,
+    size_t n,
+    size_t d,
+    ptrdiff_t x1_stride,
+    ptrdiff_t x1_row_stride,
+    ptrdiff_t x1_col_stride,
+    ptrdiff_t x2_stride,
+    ptrdiff_t x2_row_stride,
+    ptrdiff_t x2_col_stride,
+    ptrdiff_t y_stride,
+    ptrdiff_t y_row_stride,
+    ptrdiff_t y_col_stride,
+    double p) {
+
+    int j = blockIdx.x * blockDim.x + threadIdx.x;
+    int i = blockIdx.y * blockDim.y + threadIdx.y;
+    int b = blockIdx.z;
+
+    if (i >= (int)m || j >= (int)n) {
+        return;
+    }
+
+    // Locate the output element y[b, i, j]
+    float *y_ptr = y
+                 + b * y_stride
+                 + i * y_row_stride
+                 + j * y_col_stride;
+
+    // Locate the vectors x1[b, i, :] and x2[b, j, :]
+    const float *x1_vec = x1
+                        + b * x1_stride
+                        + i * x1_row_stride;
+    const float *x2_vec = x2
+                        + b * x2_stride
+                        + j * x2_row_stride;
+
+    double dist = 0.0;
+
+    for (size_t k = 0; k < d; ++k) {
+        float v1 = *(x1_vec + k * x1_col_stride);
+        float v2 = *(x2_vec + k * x2_col_stride);
+        float diff = fabsf(v1 - v2);
+
+        if (p == 1.0) {
+            dist += diff;
+        } else if (p == 2.0) {
+            dist += diff * diff;
+        } else if (isinf(p)) {
+            dist = fmaxf((float)dist, diff);
+        } else {
+            dist += powf((float)diff, (float)p);
+        }
+    }
+
+    if (p == 2.0) {
+        dist = sqrtf((float)dist);
+    } else if (!isinf(p) && p != 1.0) {
+        dist = powf((float)dist, 1.0f / (float)p);
+    }
+
+    *y_ptr = (float)dist;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream) const {
+
+    (void)workspace;
+    (void)workspace_size;
+
+    if (_dtype != INFINI_DTYPE_F32) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    cudaStream_t custream = (cudaStream_t)stream;
+    dim3 block(16, 16);
+    dim3 grid(
+        static_cast<unsigned int>((_info.n + block.x - 1) / block.x),
+        static_cast<unsigned int>((_info.m + block.y - 1) / block.y),
+        static_cast<unsigned int>(_info.batch));
+
+    cdist_generic_kernel_f32<<<grid, block, 0, custream>>>(
+        static_cast<float *>(y),
+        static_cast<const float *>(x1),
+        static_cast<const float *>(x2),
+        _info.m,
+        _info.n,
+        _info.d,
+        _info.x1_matrix.stride,
+        _info.x1_matrix.row_stride,
+        _info.x1_matrix.col_stride,
+        _info.x2_matrix.stride,
+        _info.x2_matrix.row_stride,
+        _info.x2_matrix.col_stride,
+        _info.y_matrix.stride,
+        _info.y_matrix.row_stride,
+        _info.y_matrix.col_stride,
+        _p);
+
+    auto err = cudaGetLastError();
+    if (err != cudaSuccess) {
+        return INFINI_STATUS_INTERNAL_ERROR;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::cdist::nvidia
\ No newline at end of file
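All three GPU backends use the same launch geometry: one thread per output element, 16x16 tiles over (N, M), and the batch in grid.z:

$$ \text{grid} = \big(\lceil N/16\rceil,\ \lceil M/16\rceil,\ B\big) $$

so, for example, M = 32, N = 64, B = 4 launches a (4, 2, 4) grid of 256-thread blocks, and the in-kernel bounds check discards the ragged edge.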
diff --git a/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh b/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh
new file mode 100644
index 000000000..02dfc00e2
--- /dev/null
+++ b/src/infiniop/ops/cdist/nvidia/cdist_nvidia.cuh
@@ -0,0 +1,16 @@
+#ifndef __CDIST_NVIDIA_CUH__
+#define __CDIST_NVIDIA_CUH__
+
+#include "../cdist.h"
+
+/**
+ * Uses the DESCRIPTOR macro defined in cdist.h.
+ * It generates the Descriptor class for NVIDIA devices in the
+ * namespace op::cdist::nvidia.
+ *
+ * In a full NVIDIA implementation the Opaque struct would typically hold:
+ * - cublasHandle_t: for the p = 2.0 GEMM fast path,
+ * - cudaStream_t: the stream the task runs on,
+ * - launch-configuration parameters for custom kernels.
+ */
+DESCRIPTOR(nvidia)
+
+#endif // __CDIST_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/cdist/operator.cc b/src/infiniop/ops/cdist/operator.cc
new file mode 100644
index 000000000..798426085
--- /dev/null
+++ b/src/infiniop/ops/cdist/operator.cc
@@ -0,0 +1,229 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/cdist.h"
+
+// Pull in the Descriptor definition of each enabled hardware backend
+#ifdef ENABLE_CPU_API
+#include "cpu/cdist_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) || defined(ENABLE_HYGON_API)
+#include "nvidia/cdist_nvidia.cuh"
+#endif
+#ifdef ENABLE_CAMBRICON_API
+#include "bang/cdist_bang.h"
+#endif
+#ifdef ENABLE_ASCEND_API
+#include "ascend/cdist_ascend.h"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/cdist_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/cdist_moore.h"
+#endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/cdist_kunlun.h"
+#endif
+
+// -----------------------------------------------------------------------------
+// 1. Create the descriptor
+// -----------------------------------------------------------------------------
+__C infiniStatus_t infiniopCreateCdistDescriptor(
+    infiniopHandle_t handle,
+    infiniopCdistDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x1_desc,
+    infiniopTensorDescriptor_t x2_desc,
+    double p) {
+
+#define CREATE(CASE, NAMESPACE)                                                  \
+    case CASE:                                                                   \
+        return op::cdist::NAMESPACE::Descriptor::create(                         \
+            handle,                                                              \
+            reinterpret_cast<op::cdist::NAMESPACE::Descriptor **>(desc_ptr),     \
+            y_desc, x1_desc, x2_desc, p)
+
+    switch (handle->device) {
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        CREATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CREATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CREATE
+}
+
+// -----------------------------------------------------------------------------
+// 2. Query the workspace size
+// -----------------------------------------------------------------------------
+__C infiniStatus_t infiniopGetCdistWorkspaceSize(
+    infiniopCdistDescriptor_t desc,
+    size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                      \
+    case CASE:                                                                    \
+        *size = reinterpret_cast<op::cdist::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        GET(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        GET(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+}
+
+// -----------------------------------------------------------------------------
+// 3. Run the computation (pairwise distances)
+// -----------------------------------------------------------------------------
+__C infiniStatus_t infiniopCdist(
+    infiniopCdistDescriptor_t desc,
+    void *workspace, size_t workspace_size,
+    void *y,
+    const void *x1,
+    const void *x2,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                    \
+        return reinterpret_cast<op::cdist::NAMESPACE::Descriptor *>(desc)         \
+            ->calculate(workspace, workspace_size, y, x1, x2, stream)
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        CALCULATE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        CALCULATE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef CALCULATE
+}
+
+// -----------------------------------------------------------------------------
+// 4. Destroy the descriptor
+// -----------------------------------------------------------------------------
+__C infiniStatus_t infiniopDestroyCdistDescriptor(infiniopCdistDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        delete reinterpret_cast<op::cdist::NAMESPACE::Descriptor *>(desc);        \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_HYGON_API
+        DELETE(INFINI_DEVICE_HYGON, nvidia);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_ASCEND_API
+        DELETE(INFINI_DEVICE_ASCEND, ascend);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef DELETE
+}
\ No newline at end of file
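Taken together, the four entry points give the usual descriptor lifecycle. A minimal host-side sketch (the handle, tensor descriptors, device buffers, and the CHECK status macro are assumed to exist; none of those helpers are part of this diff):

    infiniopCdistDescriptor_t desc = NULL;
    CHECK(infiniopCreateCdistDescriptor(handle, &desc, y_desc, x1_desc, x2_desc, /*p=*/2.0));

    size_t workspace_size = 0;
    CHECK(infiniopGetCdistWorkspaceSize(desc, &workspace_size));
    void *workspace = alloc_device(workspace_size); // hypothetical allocator

    CHECK(infiniopCdist(desc, workspace, workspace_size, y_data, x1_data, x2_data, /*stream=*/NULL));
    CHECK(infiniopDestroyCdistDescriptor(desc));

This is the same sequence the Python tests below drive through ctypes.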
diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
new file mode 100644
index 000000000..28750c10f
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.cc
@@ -0,0 +1,54 @@
+#include "reciprocal_cpu.h"
+
+namespace op::reciprocal::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &y_desc = out_desc;
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = y_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    // Reciprocal typically only supports floating point types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<ReciprocalOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<ReciprocalOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<ReciprocalOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<ReciprocalOp, bf16_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::reciprocal::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
new file mode 100644
index 000000000..94b6dd25d
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cpu/reciprocal_cpu.h
@@ -0,0 +1,19 @@
+#ifndef __RECIPROCAL_CPU_H__
+#define __RECIPROCAL_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(reciprocal, cpu)
+
+namespace op::reciprocal::cpu {
+typedef struct ReciprocalOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    T operator()(const T &x) const {
+        return static_cast<T>(1) / x;
+    }
+} ReciprocalOp;
+} // namespace op::reciprocal::cpu
+
+#endif // __RECIPROCAL_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/cuda/kernel.cuh b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
new file mode 100644
index 000000000..030dcb17e
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/cuda/kernel.cuh
@@ -0,0 +1,27 @@
+#ifndef __RECIPROCAL_CUDA_H__
+#define __RECIPROCAL_CUDA_H__
+
+namespace op::reciprocal::cuda {
+typedef struct ReciprocalOp {
+public:
+    static constexpr size_t num_inputs = 1;
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            return h2rcp(x);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return hrcp(x);
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // bfloat16 does not have a direct hrcp intrinsic in some versions,
+            // often handled by converting to float or using specific bf16 intrinsics
+            return __float2bfloat16(1.0f / __bfloat162float(x));
+        } else if constexpr (std::is_same_v<T, float>) {
+            return __frcp_rd(x);
+        } else {
+            return static_cast<T>(1) / x;
+        }
+    }
+} ReciprocalOp;
+} // namespace op::reciprocal::cuda
+
+#endif // __RECIPROCAL_CUDA_H__
\ No newline at end of file
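One detail worth flagging in the functor above: __frcp_rd is the single-precision reciprocal intrinsic with round-toward-minus-infinity, whereas the generic static_cast<T>(1) / x fallback rounds to nearest. If bit-exact agreement with 1.0f / x is wanted, the round-to-nearest intrinsic is the closer match; a minimal alternative for the float branch:

    } else if constexpr (std::is_same_v<T, float>) {
        // __frcp_rn: reciprocal rounded to nearest-even, matching 1.0f / x
        return __frcp_rn(x);
    }

Which rounding mode is appropriate depends on the tolerances the tests enforce.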
diff --git a/src/infiniop/ops/reciprocal/metax/reciprocal_metax.h b/src/infiniop/ops/reciprocal/metax/reciprocal_metax.h
new file mode 100644
index 000000000..39a0c541a
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/metax/reciprocal_metax.h
@@ -0,0 +1,8 @@
+#ifndef __RECIPROCAL_METAX_API_H__
+#define __RECIPROCAL_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(reciprocal, metax)
+
+#endif // __RECIPROCAL_METAX_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca b/src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca
new file mode 100644
index 000000000..5ae1de260
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/metax/reciprocal_metax.maca
@@ -0,0 +1,61 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+
+#include "reciprocal_metax.h"
+#include "reciprocal_metax_kernel.h"
+
+namespace op::reciprocal::metax {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &y_desc = out_desc;
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = y_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    // Reciprocal typically only supports floating point types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create METAX elementwise descriptor
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, metax::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, metax::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, metax::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, metax::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::reciprocal::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h b/src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h
new file mode 100644
index 000000000..9c5cb706c
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/metax/reciprocal_metax_kernel.h
@@ -0,0 +1,47 @@
+#ifndef __RECIPROCAL_METAX_KERNEL_H__
+#define __RECIPROCAL_METAX_KERNEL_H__
+
+/*
+ * This file contains the Reciprocal functor for the METAX (MACA) backend.
+ *
+ * It mirrors the CUDA and MUSA versions so the operator behaves identically
+ * across the supported hardware platforms.
+ */
+namespace op::reciprocal::metax {
+
+typedef struct ReciprocalOp {
+public:
+    // Unary operator: one input
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // Use a native half2 reciprocal where the hardware provides one,
+            // otherwise convert to float2 and divide
+            float2 f2 = __half22float2(a);
+            f2.x = 1.0f / f2.x;
+            f2.y = 1.0f / f2.y;
+            return __float22half2_rn(f2);
+        } else if constexpr (std::is_same_v<T, half>) {
+            // Promote to float for numerical stability
+            return __float2half(1.0f / __half2float(a));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // BF16 is best handled through float on this backend
+            float a_f = __bfloat162float(a);
+            return __float2bfloat16_rn(1.0f / a_f);
+        } else if constexpr (std::is_same_v<T, float>) {
+            // The compiler typically lowers 1.0f / a to the hardware rcp
+            // instruction (round to nearest)
+            return 1.0f / a;
+        } else if constexpr (std::is_same_v<T, double>) {
+            return 1.0 / a;
+        } else {
+            // Integer reciprocal truncates to 0 for |a| > 1; standard C++ behavior
+            return static_cast<T>(1) / a;
+        }
+    }
+} ReciprocalOp;
+
+} // namespace op::reciprocal::metax
+
+#endif // __RECIPROCAL_METAX_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/moore/reciprocal_moore.h b/src/infiniop/ops/reciprocal/moore/reciprocal_moore.h
new file mode 100644
index 000000000..f0cfe6f42
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/moore/reciprocal_moore.h
@@ -0,0 +1,11 @@
+#ifndef __RECIPROCAL_MOORE_API_H__
+#define __RECIPROCAL_MOORE_API_H__
+
+// 1. Switch to the Moore platform's elementwise API definitions
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+// 2. Generate op::reciprocal::moore::Descriptor via the macro;
+//    the expansion declares the standard create/calculate interface
+ELEMENTWISE_DESCRIPTOR(reciprocal, moore)
+
+#endif // __RECIPROCAL_MOORE_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu b/src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu
new file mode 100644
index 000000000..063ec2740
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/moore/reciprocal_moore.mu
@@ -0,0 +1,66 @@
+#include "../../../elementwise/moore/elementwise_moore.h"
+
+#include "reciprocal_moore_kernel.h"
+#include "reciprocal_moore.h"
+
+namespace op::reciprocal::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    // 1. Resolve the Moore (MUSA) handle
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &y_desc = out_desc;
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = y_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    // 2. Dtype check: reciprocal runs on floating-point types on Moore as well
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // 3. Use the Moore elementwise descriptor-creation macro; it takes care of
+    //    initializing the operator metadata for the MUSA backend
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    // 4. Dispatch to the Moore specialization
+    //    Note: cuda::ReciprocalOp is replaced by moore::ReciprocalOp
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, moore::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        // Make sure the Moore bfloat16 type definition is used
+        return _device_info->calculate<256, moore::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, moore::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, moore::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::reciprocal::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h b/src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h
new file mode 100644
index 000000000..74d162671
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/moore/reciprocal_moore_kernel.h
@@ -0,0 +1,47 @@
+#ifndef __RECIPROCAL_MOORE_KERNEL_H__
+#define __RECIPROCAL_MOORE_KERNEL_H__
+
+/*
+ * This file contains the Reciprocal operation implementation for the MUSA backend.
+ *
+ * It follows the consistent code structure to ensure alignment across different
+ * hardware platforms within the Moore Threads (MUSA) ecosystem.
+ */
+namespace op::reciprocal::moore {
+
+typedef struct ReciprocalOp {
+public:
+    // Unary operator: one input
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            // Use MUSA's half2 reciprocal where the hardware provides one,
+            // otherwise convert to float2 and divide
+            float2 f2 = __half22float2(a);
+            f2.x = 1.0f / f2.x;
+            f2.y = 1.0f / f2.y;
+            return __float22half2_rn(f2);
+        } else if constexpr (std::is_same_v<T, half>) {
+            // Promote to float for numerical stability
+            return __float2half(1.0f / __half2float(a));
+        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+            // BF16 is best handled through float on MUSA
+            float a_f = __bfloat162float(a);
+            return __float2bfloat16_rn(1.0f / a_f);
+        } else if constexpr (std::is_same_v<T, float>) {
+            // The compiler typically lowers 1.0f / a to the hardware rcp
+            // instruction (round to nearest)
+            return 1.0f / a;
+        } else if constexpr (std::is_same_v<T, double>) {
+            return 1.0 / a;
+        } else {
+            // Integer reciprocal truncates to 0 for |a| > 1; standard C++ behavior
+            return static_cast<T>(1) / a;
+        }
+    }
+} ReciprocalOp;
+
+} // namespace op::reciprocal::moore
+
+#endif // __RECIPROCAL_MOORE_KERNEL_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
new file mode 100644
index 000000000..523259bad
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cu
@@ -0,0 +1,61 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "reciprocal_nvidia.cuh"
+
+namespace op::reciprocal::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &y_desc = out_desc;
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = y_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    // Reciprocal typically only supports floating point types
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::ReciprocalOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::ReciprocalOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::ReciprocalOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::ReciprocalOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+}
+} // namespace op::reciprocal::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh
new file mode 100644
index 000000000..68f640767
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/nvidia/reciprocal_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __RECIPROCAL_CUDA_API_H__
+#define __RECIPROCAL_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(reciprocal, nvidia)
+
+#endif // __RECIPROCAL_CUDA_API_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/reciprocal/operator.cc b/src/infiniop/ops/reciprocal/operator.cc
new file mode 100644
index 000000000..37c38bc43
--- /dev/null
+++ b/src/infiniop/ops/reciprocal/operator.cc
@@ -0,0 +1,199 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/reciprocal.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/reciprocal_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/reciprocal_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/reciprocal_metax.h"
+#endif
+#ifdef ENABLE_KUNLUN_API
+#include "kunlun/reciprocal_kunlun.h"
+#endif
+#ifdef ENABLE_CAMBRICON_API
+#include "bang/reciprocal_bang.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/reciprocal_moore.h"
+#endif
+
+__C infiniStatus_t infiniopCreateReciprocalDescriptor(
+    infiniopHandle_t handle,
+    infiniopReciprocalDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+#define CREATE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        return op::reciprocal::NAMESPACE::Descriptor::create(                     \
+            handle,                                                               \
+            reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor **>(desc_ptr), \
+            y_desc,                                                               \
+            {x_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CREATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+__C infiniStatus_t infiniopGetReciprocalWorkspaceSize(infiniopReciprocalDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                      \
+    case CASE:                                                                    \
+        *size = reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        GET(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+}
+
+__C infiniStatus_t infiniopReciprocal(
+    infiniopReciprocalDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+#define CALCULATE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                    \
+        return reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc)    \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        CALCULATE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+__C infiniStatus_t
+infiniopDestroyReciprocalDescriptor(infiniopReciprocalDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                                   \
+    case CASE:                                                                    \
+        delete reinterpret_cast<op::reciprocal::NAMESPACE::Descriptor *>(desc);   \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_KUNLUN_API
+        DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+#ifdef ENABLE_CAMBRICON_API
+        DELETE(INFINI_DEVICE_CAMBRICON, bang);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
diff --git a/test/infinicore/ops/addcmul.py b/test/infinicore/ops/addcmul.py
index 143154f84..5dfb6a834 100644
--- a/test/infinicore/ops/addcmul.py
+++ b/test/infinicore/ops/addcmul.py
@@ -119,9 +119,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.addcmul(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.addcmul(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore addcmul implementation."""
+        return infinicore.addcmul(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/atanh.py b/test/infinicore/ops/atanh.py
index 4f8cab954..fec2027ce 100644
--- a/test/infinicore/ops/atanh.py
+++ b/test/infinicore/ops/atanh.py
@@ -97,9 +97,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.atanh(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.atanh(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore atanh implementation."""
+        return infinicore.atanh(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/binary_cross_entropy_with_logits.py b/test/infinicore/ops/binary_cross_entropy_with_logits.py
index 986605a38..2e8273e47 100644
--- a/test/infinicore/ops/binary_cross_entropy_with_logits.py
+++ b/test/infinicore/ops/binary_cross_entropy_with_logits.py
@@ -80,9 +80,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore binary_cross_entropy_with_logits implementation."""
+        return infinicore.nn.functional.binary_cross_entropy_with_logits(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/cdist.py b/test/infinicore/ops/cdist.py
index 2d41668cc..f3827fd7a 100644
--- a/test/infinicore/ops/cdist.py
+++ b/test/infinicore/ops/cdist.py
@@ -63,9 +63,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.cdist(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.cdist(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore cdist implementation."""
+        return infinicore.cdist(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/reciprocal.py b/test/infinicore/ops/reciprocal.py
index 9d4ba0849..c4e6bfff8 100644
--- a/test/infinicore/ops/reciprocal.py
+++ b/test/infinicore/ops/reciprocal.py
@@ -89,9 +89,9 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.reciprocal(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.reciprocal(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        """InfiniCore reciprocal implementation."""
+        return infinicore.reciprocal(*args, **kwargs)
 
 
 def main():
diff --git a/test/infiniop/addcmul.py b/test/infiniop/addcmul.py
new file mode 100644
index 000000000..474b5c3f5
--- /dev/null
+++ b/test/infiniop/addcmul.py
@@ -0,0 +1,163 @@
+import torch
+import ctypes
+from ctypes import c_uint64, c_float
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration
+# ==============================================================================
+_TEST_CASES_ = [
+    # shape, input_stride, t1_stride, t2_stride
+    ((3, 3), None, None, None),
+    ((32, 512), None, None, None),
+    ((32, 512), (1024, 1), (1024, 1), (1024, 1)),
+    ((16, 32, 64), None, None, None),
+    ((8, 1, 1024), None, None, None),  # a potential broadcast-shape case
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 5e-3, "rtol": 5e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+}
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_INPUT = auto()
+
+_INPLACE = [Inplace.OUT_OF_PLACE, Inplace.INPLACE_INPUT]
+_VALUES = [1.0, 0.5, -2.0]  # exercise different value coefficients
+
+_TEST_CASES = [
+    test_case + (inplace_item, value)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+    for value in _VALUES
+]
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 100
+
+def test(
+    handle,
+    device,
+    shape,
+    input_stride=None,
+    t1_stride=None,
+    t2_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    value=1.0,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing Addcmul on {InfiniDeviceNames[device]} with shape:{shape} value:{value} dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    # Prepare the input tensors
+    input_tensor = TestTensor(shape, input_stride, dtype, device)
+    t1 = TestTensor(shape, t1_stride, dtype, device)
+    t2 = TestTensor(shape, t2_stride, dtype, device)
+
+    # Compute the reference with PyTorch:
+    # out = input + value * t1 * t2
+    ans = torch.addcmul(input_tensor.torch_tensor(), t1.torch_tensor(), t2.torch_tensor(), value=value)
+
+    if inplace == Inplace.INPLACE_INPUT:
+        out = input_tensor
+    else:
+        out = TestTensor(shape, None, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopOperatorDescriptor_t()
+    # Note: per the binding, the create call takes the scalar value
+    check_error(
+        LIBINFINIOP.infiniopCreateAddcmulDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            input_tensor.descriptor,
+            t1.descriptor,
+            t2.descriptor,
+            c_float(value),
+        )
+    )
+
+    # Destroy the temporary tensor descriptors so the kernel cannot
+    # reference them by mistake
+    for t in [input_tensor, t1, t2, out]:
+        t.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAddcmulWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, input_tensor.device)
+
+    def lib_addcmul():
+        check_error(
+            LIBINFINIOP.infiniopAddcmul(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                out.data(),
+                input_tensor.data(),
+                t1.data(),
+                t2.data(),
+                None,
+            )
+        )
+
+    lib_addcmul()
+
+    if sync is not None:
+        sync()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    if PROFILE:
+        profile_operation("PyTorch", lambda: torch.addcmul(input_tensor.torch_tensor(), t1.torch_tensor(), t2.torch_tensor(), value=value), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_addcmul(), device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyAddcmulDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mAddcmul tests passed!\033[0m")
\ No newline at end of file
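The reference line above pins the semantics $\text{out} = \text{input} + v\,(t_1 \odot t_2)$, elementwise. As a quick sanity check of the sign conventions, with $v=-2$ and elements input = 1, t1 = 2, t2 = 3 the result is $1 - 2\cdot 6 = -11$, which is what the `-2.0` entry in `_VALUES` exercises.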
diff --git a/test/infiniop/atanh.py b/test/infiniop/atanh.py
new file mode 100644
index 000000000..32c8eaa2a
--- /dev/null
+++ b/test/infiniop/atanh.py
@@ -0,0 +1,171 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+# Configuration
+# ==============================================================================
+_TEST_CASES_ = [
+    # shape, a_stride, y_stride
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((13, 4, 4), None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1)),
+    ((16, 5632), None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE_A = auto()
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# atanh typically supports floating point types
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-6, "rtol": 1e-6},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+def atanh_torch(y, a):
+    torch.atanh(a, out=y)
+
+def test(
+    handle,
+    device,
+    shape,
+    a_stride=None,
+    y_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    # Initialize input tensor
+    a = TestTensor(shape, a_stride, dtype, device)
+
+    # Crucial: clamp values to (-1, 1) to avoid NaN/Inf for atanh
+    with torch.no_grad():
+        a.torch_tensor().clamp_(-0.99, 0.99)
+        # Keep underlying data in sync for all devices (including CPU)
+        a.actual_tensor().copy_(a.torch_tensor())
+
+    if inplace == Inplace.INPLACE_A:
+        if a_stride != y_stride:
+            return
+        y = a
+    else:
+        y = TestTensor(shape, y_stride, dtype, device, mode="ones")
+
+    if y.is_broadcast():
+        return
+
+    print(
+        f"Testing Atanh on {InfiniDeviceNames[device]} with shape:{shape} a_stride:{a_stride} y_stride:{y_stride} "
+        f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}"
+    )
+
+    # Reference calculation
+    atanh_torch(y.torch_tensor(), a.torch_tensor())
+
+    if sync is not None:
+        sync()
+
+    # Create descriptor
+    descriptor = infiniopOperatorDescriptor_t()
+    check_error(
+        LIBINFINIOP.infiniopCreateAtanhDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            y.descriptor,
+            a.descriptor,
+        )
+    )
+
+    # Invalidate descriptors to ensure kernel uses its own internal state
+    for tensor in [a, y]:
+        tensor.destroy_desc()
+
+    # Workspace management
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetAtanhWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, y.device)
+
+    def lib_atanh():
+        check_error(
+            LIBINFINIOP.infiniopAtanh(
+                descriptor,
+                workspace.data(),
+                workspace.size(),
+                y.data(),
+                a.data(),
+                None,
+            )
+        )
+
+    # Run library function
+    lib_atanh()
+
+    # Validate results
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+
+    assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol)
+
+    # Profiling
+    if PROFILE:
+        profile_operation("PyTorch", lambda: atanh_torch(y.torch_tensor(), a.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_atanh(), device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyAtanhDescriptor(descriptor))
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mAtanh Test passed!\033[0m")
\ No newline at end of file
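The clamp to ±0.99 in the test exists because

$$ \operatorname{atanh}(x) = \tfrac{1}{2}\,\ln\!\frac{1+x}{1-x} $$

is defined only on $(-1, 1)$ and diverges at the endpoints, so unclamped random inputs would produce ±inf/NaN and poison the allclose comparison.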
diff --git a/test/infiniop/binary_cross_entropy_with_logits.py b/test/infiniop/binary_cross_entropy_with_logits.py
new file mode 100644
index 000000000..b09c95570
--- /dev/null
+++ b/test/infiniop/binary_cross_entropy_with_logits.py
@@ -0,0 +1,154 @@
+import torch
+import ctypes
+from ctypes import c_uint64, c_float, c_char_p
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+
+# ==============================================================================
+# Configuration
+# ==============================================================================
+# Test scenarios: (shape, has_weight, has_pos_weight, reduction)
+_TEST_CASES_DATA = [
+    ((4, 5), False, False, "none"),
+    ((8, 8), True, False, "sum"),
+    ((32, 512), False, True, "mean"),
+    ((16, 32, 64), True, True, "mean"),
+]
+
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.BF16, InfiniDtype.F32]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-2},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 5e-2},
+    InfiniDtype.F32: {"atol": 1e-5, "rtol": 1e-5},
+}
+
+_REDUCTIONS = ["none", "mean", "sum"]
+
+# Final test-case list
+_TEST_CASES = _TEST_CASES_DATA
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 100
+
+def test(
+    handle,
+    device,
+    shape,
+    has_weight=False,
+    has_pos_weight=False,
+    reduction="none",
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    print(
+        f"Testing BCEWithLogits on {InfiniDeviceNames[device]} shape:{shape} "
+        f"weight:{has_weight} pos_weight:{has_pos_weight} reduction:{reduction} dtype:{InfiniDtypeNames[dtype]}"
+    )
+
+    # 1. Prepare the input tensors
+    input_tensor = TestTensor(shape, None, dtype, device)
+    target = TestTensor(shape, None, dtype, device)
+
+    weight = TestTensor(shape, None, dtype, device) if has_weight else None
+    # pos_weight broadcasts over the last dimension; shape (C,)
+    pos_weight_shape = (shape[-1],)
+    pos_weight = TestTensor(pos_weight_shape, None, dtype, device) if has_pos_weight else None
+
+    # 2. Compute the reference with PyTorch
+    torch_input = input_tensor.torch_tensor()
+    torch_target = target.torch_tensor()
+    torch_weight = weight.torch_tensor() if has_weight else None
+    torch_pos_weight = pos_weight.torch_tensor() if has_pos_weight else None
+
+    ans = torch.nn.functional.binary_cross_entropy_with_logits(
+        torch_input,
+        torch_target,
+        weight=torch_weight,
+        pos_weight=torch_pos_weight,
+        reduction=reduction,
+    )
+
+    # 3. Prepare the output tensor (shape depends on reduction)
+    out_shape = () if reduction != "none" else shape
+    out = TestTensor(out_shape, None, dtype, device)
+
+    if sync is not None:
+        sync()
+
+    # 4. Create the descriptor and run
+    descriptor = infiniopOperatorDescriptor_t()
+
+    # Call through the C interface
+    check_error(
+        LIBINFINIOP.infiniopCreateBCEWithLogitsDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            input_tensor.descriptor,
+            target.descriptor,
+            weight.descriptor if has_weight else None,
+            pos_weight.descriptor if has_pos_weight else None,
+            c_char_p(reduction.encode("utf-8")),  # reduction mode
+        )
+    )
+
+    # Destroy the temporary descriptors
+    for t in [input_tensor, target, out]:
+        t.destroy_desc()
+    if weight is not None:
+        weight.destroy_desc()
+    if pos_weight is not None:
+        pos_weight.destroy_desc()
+
+    workspace_size = c_uint64(0)
+    check_error(LIBINFINIOP.infiniopGetBCEWithLogitsWorkspaceSize(descriptor, ctypes.byref(workspace_size)))
+    workspace = TestWorkspace(workspace_size.value, device)
+
+    def lib_op():
+        check_error(
+            LIBINFINIOP.infiniopBCEWithLogits(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                out.data(),
+                input_tensor.data(),
+                target.data(),
+                weight.data() if has_weight else None,
+                pos_weight.data() if has_pos_weight else None,
+                None,  # stream
+            )
+        )
+
+    lib_op()
+
+    if sync is not None:
+        sync()
+
+    # 5. Verify the result
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    if PROFILE:
+        profile_operation("PyTorch", lambda: torch.nn.functional.binary_cross_entropy_with_logits(
+            torch_input, torch_target, weight=torch_weight, pos_weight=torch_pos_weight, reduction=reduction
+        ), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lib_op, device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyBCEWithLogitsDescriptor(descriptor))
+
+if __name__ == "__main__":
+    args = get_args()
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+    print("\033[92mBCEWithLogits tests passed!\033[0m")
\ No newline at end of file
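For reference, the operator under test computes PyTorch's definition ($\sigma$ is the logistic sigmoid, $w$ the optional elementwise weight, $p$ the optional pos_weight broadcast along the last dimension):

$$ \ell_i = -\,w_i\big[\,p_i\,t_i\log\sigma(x_i) + (1-t_i)\log(1-\sigma(x_i))\big] $$

with reduction "none" returning $\ell$ elementwise and "mean"/"sum" collapsing it to a scalar, which is why the test allocates a 0-d output tensor for the reduced cases.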
+    # 2. Prepare the input tensors
+    x1 = TestTensor(x1_shape, x1_stride, dtype, device)
+    x2 = TestTensor(x2_shape, x2_stride, dtype, device)
+    out = TestTensor(out_shape, None, dtype, device)
+
+    # 3. Compute the reference answer with PyTorch
+    # (torch.cdist requires inputs that are at least 2-D)
+    ans = torch.cdist(x1.torch_tensor(), x2.torch_tensor(), p=p)
+
+    if sync is not None:
+        sync()
+
+    # 4. Create the operator descriptor
+    descriptor = infiniopOperatorDescriptor_t()
+    # Note: this assumes the C entry point is named infiniopCreateCdistDescriptor
+    check_error(
+        LIBINFINIOP.infiniopCreateCdistDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            out.descriptor,
+            x1.descriptor,
+            x2.descriptor,
+            c_double(p)  # p is passed as a double
+        )
+    )
+
+    # Destroy the temporary tensor descriptors so the kernel cannot reference
+    # them by mistake (same style as addcmul)
+    for t in [x1, x2, out]:
+        t.destroy_desc()
+
+    # 5. Workspace setup
+    workspace_size = c_uint64(0)
+    check_error(
+        LIBINFINIOP.infiniopGetCdistWorkspaceSize(
+            descriptor, ctypes.byref(workspace_size)
+        )
+    )
+    workspace = TestWorkspace(workspace_size.value, x1.device)
+
+    # 6. Define the execution function
+    def lib_cdist():
+        check_error(
+            LIBINFINIOP.infiniopCdist(
+                descriptor,
+                workspace.data(),
+                workspace_size.value,
+                out.data(),
+                x1.data(),
+                x2.data(),
+                None,  # stream
+            )
+        )
+
+    # 7. Run
+    lib_cdist()
+
+    if sync is not None:
+        sync()
+
+    # 8. Verify the result
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    assert torch.allclose(out.actual_tensor(), ans, atol=atol, rtol=rtol)
+
+    # 9. Profiling
+    if PROFILE:
+        profile_operation("PyTorch", lambda: torch.cdist(x1.torch_tensor(), x2.torch_tensor(), p=p), device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_cdist(), device, NUM_PRERUN, NUM_ITERATIONS)
+
+    check_error(LIBINFINIOP.infiniopDestroyCdistDescriptor(descriptor))
+
+
+if __name__ == "__main__":
+    args = get_args()
+
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mCdist tests passed!\033[0m")
\ No newline at end of file
diff --git a/test/infiniop/libinfiniop/op_register.py b/test/infiniop/libinfiniop/op_register.py
index 5b2974111..892facdfe 100644
--- a/test/infiniop/libinfiniop/op_register.py
+++ b/test/infiniop/libinfiniop/op_register.py
@@ -20,6 +20,36 @@ def register_lib(cls, lib):
     for op in cls.registry:
         op(lib)
 
+@OpRegister.operator
+def atanh_(lib):
+    lib.infiniopCreateAtanhDescriptor.restype = c_int32
+    lib.infiniopCreateAtanhDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    lib.infiniopGetAtanhWorkspaceSize.restype = c_int32
+    lib.infiniopGetAtanhWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    lib.infiniopAtanh.restype = c_int32
+    lib.infiniopAtanh.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # y_data
+        c_void_p,  # a_data
+        c_void_p,  # stream
+    ]
+
+    lib.infiniopDestroyAtanhDescriptor.restype = c_int32
+    lib.infiniopDestroyAtanhDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
 
 @OpRegister.operator
 def add_(lib):
@@ -54,6 +84,156 @@ def add_(lib):
         infiniopOperatorDescriptor_t,
     ]
 
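+# How these registrations are consumed (sketch): @OpRegister.operator collects
+# each function into OpRegister.registry, and register_lib(lib) replays them
+# against the loaded ctypes library, e.g. (illustrative call site):
+#
+#     OpRegister.register_lib(LIBINFINIOP)
+#
+# After that, every declared symbol is callable with checked argtypes.
+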
+@OpRegister.operator
+def addcmul_(lib):
+    lib.infiniopCreateAddcmulDescriptor.restype = c_int32
+    lib.infiniopCreateAddcmulDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr
+        infiniopTensorDescriptor_t,             # out_desc
+        infiniopTensorDescriptor_t,             # input_desc
+        infiniopTensorDescriptor_t,             # t1_desc
+        infiniopTensorDescriptor_t,             # t2_desc
+        c_float,                                # value (scalar coefficient)
+    ]
+
+    lib.infiniopGetAddcmulWorkspaceSize.restype = c_int32
+    lib.infiniopGetAddcmulWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        POINTER(c_size_t),             # size_ptr
+    ]
+
+    lib.infiniopAddcmul.restype = c_int32
+    lib.infiniopAddcmul.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # out_ptr
+        c_void_p,  # input_ptr
+        c_void_p,  # t1_ptr
+        c_void_p,  # t2_ptr
+        c_void_p,  # stream
+    ]
+
+    lib.infiniopDestroyAddcmulDescriptor.restype = c_int32
+    lib.infiniopDestroyAddcmulDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+    ]
+
+@OpRegister.operator
+def cdist_(lib):
+    # 1. Descriptor creation: takes the handle, the output descriptor,
+    #    the two input descriptors, and the norm degree p
+    lib.infiniopCreateCdistDescriptor.restype = c_int32
+    lib.infiniopCreateCdistDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr
+        infiniopTensorDescriptor_t,             # y_desc (output)
+        infiniopTensorDescriptor_t,             # x1_desc
+        infiniopTensorDescriptor_t,             # x2_desc
+        c_double,                               # p (norm degree)
+    ]
+
+    # 2. Workspace-size query
+    lib.infiniopGetCdistWorkspaceSize.restype = c_int32
+    lib.infiniopGetCdistWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        POINTER(c_size_t),             # size_ptr
+    ]
+
+    # 3. Execution
+    lib.infiniopCdist.restype = c_int32
+    lib.infiniopCdist.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # y_ptr
+        c_void_p,  # x1_ptr
+        c_void_p,  # x2_ptr
+        c_void_p,  # stream
+    ]
+
+    # 4. Descriptor destruction
+    lib.infiniopDestroyCdistDescriptor.restype = c_int32
+    lib.infiniopDestroyCdistDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+    ]
+
+@OpRegister.operator
+def binary_cross_entropy_with_logits_(lib):
+    # 1. Descriptor creation
+    lib.infiniopCreateBCEWithLogitsDescriptor.restype = c_int32
+    lib.infiniopCreateBCEWithLogitsDescriptor.argtypes = [
+        infiniopHandle_t,                       # handle
+        POINTER(infiniopOperatorDescriptor_t),  # desc_ptr
+        infiniopTensorDescriptor_t,             # out_desc
+        infiniopTensorDescriptor_t,             # input_desc (logits)
+        infiniopTensorDescriptor_t,             # target_desc
+        infiniopTensorDescriptor_t,             # weight_desc (optional; pass NULL if unused)
+        infiniopTensorDescriptor_t,             # pos_weight_desc (optional; pass NULL if unused)
+        c_int32,                                # reduction (0: none, 1: mean, 2: sum)
+    ]
+
+    # 2. Workspace-size query
+    lib.infiniopGetBCEWithLogitsWorkspaceSize.restype = c_int32
+    lib.infiniopGetBCEWithLogitsWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        POINTER(c_size_t),             # size_ptr
+    ]
+
+    # 3. Execution
+    lib.infiniopBCEWithLogits.restype = c_int32
+    lib.infiniopBCEWithLogits.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+        c_void_p,  # workspace
+        c_size_t,  # workspace_size
+        c_void_p,  # out_ptr
+        c_void_p,  # input_ptr (logits)
+        c_void_p,  # target_ptr
+        c_void_p,  # weight_ptr (optional)
+        c_void_p,  # pos_weight_ptr (optional)
+        c_void_p,  # stream
+    ]
+
+    # 4. Descriptor destruction
+    lib.infiniopDestroyBCEWithLogitsDescriptor.restype = c_int32
+    lib.infiniopDestroyBCEWithLogitsDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,  # descriptor
+    ]
+
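+# NOTE (illustrative): the two optional descriptors above accept None from
+# Python, which ctypes passes through as NULL, e.g.:
+#
+#     LIBINFINIOP.infiniopCreateBCEWithLogitsDescriptor(
+#         handle, byref(desc), out.descriptor, logits.descriptor,
+#         target.descriptor, None, None, 1)  # no weights, reduction = mean
+#
+# The C side is expected to treat a NULL descriptor as "argument absent".
+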
+@OpRegister.operator
+def reciprocal_(lib):
+    lib.infiniopCreateReciprocalDescriptor.restype = c_int32
+    lib.infiniopCreateReciprocalDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopOperatorDescriptor_t),
+        infiniopTensorDescriptor_t,  # output descriptor
+        infiniopTensorDescriptor_t,  # input descriptor
+    ]
+
+    # Workspace-size query
+    lib.infiniopGetReciprocalWorkspaceSize.restype = c_int32
+    lib.infiniopGetReciprocalWorkspaceSize.argtypes = [
+        infiniopOperatorDescriptor_t,
+        POINTER(c_size_t),
+    ]
+
+    # The trailing c_void_p is the stream (or other async handle); keep it
+    # consistent with the other operators
+    lib.infiniopReciprocal.restype = c_int32
+    lib.infiniopReciprocal.argtypes = [
+        infiniopOperatorDescriptor_t,
+        c_void_p,  # workspace pointer
+        c_size_t,  # workspace size
+        c_void_p,  # output data pointer
+        c_void_p,  # input data pointer
+        c_void_p,  # stream pointer (optional)
+    ]
+
+    # Descriptor destruction
+    lib.infiniopDestroyReciprocalDescriptor.restype = c_int32
+    lib.infiniopDestroyReciprocalDescriptor.argtypes = [
+        infiniopOperatorDescriptor_t,
+    ]
 
 @OpRegister.operator
 def attention_(lib):
diff --git a/test/infiniop/reciprocal.py b/test/infiniop/reciprocal.py
new file mode 100644
index 000000000..4742ffdb8
--- /dev/null
+++ b/test/infiniop/reciprocal.py
@@ -0,0 +1,165 @@
+import torch
+import ctypes
+from ctypes import c_uint64
+from libinfiniop import (
+    LIBINFINIOP,
+    TestTensor,
+    get_test_devices,
+    check_error,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    TestWorkspace,
+    InfiniDtype,
+    InfiniDtypeNames,
+    InfiniDeviceNames,
+    infiniopOperatorDescriptor_t,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration
+# ==============================================================================
+_TEST_CASES_ = [
+    # shape, input_stride, output_stride
+    ((13, 4), None, None),
+    ((13, 4), (10, 1), (10, 1)),
+    ((16, 5632), None, None),
+    ((16, 5632), (13312, 1), (13312, 1)),
+    ((13, 16, 2), (128, 4, 1), (64, 4, 1)),
+    ((4, 4, 5632), None, None),
+]
+
+class Inplace(Enum):
+    OUT_OF_PLACE = auto()
+    INPLACE = auto()
+
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE,
+]
+
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Reciprocal normally produces floating-point output; integer dtypes are
+# usually unsupported or special-cased
+_TENSOR_DTYPES = [InfiniDtype.F16, InfiniDtype.F32, InfiniDtype.BF16]
+
+_TOLERANCE_MAP = {
+    InfiniDtype.F16: {"atol": 1e-3, "rtol": 1e-3},
+    InfiniDtype.F32: {"atol": 1e-7, "rtol": 1e-7},
+    InfiniDtype.BF16: {"atol": 1e-2, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+def reciprocal(y, x):
+    torch.reciprocal(x, out=y)
+
+def test(
+    handle,
+    device,
+    shape,
+    in_stride=None,
+    out_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=InfiniDtype.F16,
+    sync=None,
+):
+    # Initialize the input 'x'. Note: TestTensor fills with random data, and
+    # values close to zero make 1/x very large; the tolerance map above is
+    # chosen with that in mind.
+    x = TestTensor(shape, in_stride, dtype, device)
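+    # If near-zero inputs ever cause spurious failures, one could clamp the
+    # magnitudes on the torch side before computing the reference (illustrative
+    # sketch only; this mutates the data and is not part of the test today):
+    #   t = x.torch_tensor()
+    #   t.abs_().add_(0.1)  # keep every value >= 0.1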
f"dtype:{InfiniDtypeNames[dtype]} inplace:{inplace}" + ) + + # Calculate ground truth using PyTorch + reciprocal(y.torch_tensor(), x.torch_tensor()) + + if sync is not None: + sync() + + # Create Descriptor + descriptor = infiniopOperatorDescriptor_t() + check_error( + LIBINFINIOP.infiniopCreateReciprocalDescriptor( + handle, + ctypes.byref(descriptor), + y.descriptor, + x.descriptor, + ) + ) + + # Invalidate descriptors as per framework requirement + for tensor in [x, y]: + tensor.destroy_desc() + + # Workspace allocation + workspace_size = c_uint64(0) + check_error( + LIBINFINIOP.infiniopGetReciprocalWorkspaceSize( + descriptor, ctypes.byref(workspace_size) + ) + ) + workspace = TestWorkspace(workspace_size.value, y.device) + + def lib_reciprocal(): + check_error( + LIBINFINIOP.infiniopReciprocal( + descriptor, + workspace.data(), + workspace.size(), + y.data(), + x.data(), + None, + ) + ) + + lib_reciprocal() + + # Verification + atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype) + if DEBUG: + debug(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + assert torch.allclose(y.actual_tensor(), y.torch_tensor(), atol=atol, rtol=rtol) + + # Profiling + if PROFILE: + profile_operation("PyTorch", lambda: reciprocal(y.torch_tensor(), x.torch_tensor()), device, NUM_PRERUN, NUM_ITERATIONS) + profile_operation(" lib", lambda: lib_reciprocal(), device, NUM_PRERUN, NUM_ITERATIONS) + + check_error(LIBINFINIOP.infiniopDestroyReciprocalDescriptor(descriptor)) + +if __name__ == "__main__": + args = get_args() + + DEBUG = args.debug + PROFILE = args.profile + NUM_PRERUN = args.num_prerun + NUM_ITERATIONS = args.num_iterations + + for device in get_test_devices(args): + test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES) + + print("\033[92mTest passed!\033[0m") \ No newline at end of file