From 11782aa33afeaf5bd2867da3dde42cf66bf0e87e Mon Sep 17 00:00:00 2001
From: lengmuzhaxi <2690497440@qq.com>
Date: Sat, 10 Jan 2026 20:43:42 +0800
Subject: [PATCH] Implement the softplus, broadcast_to, linear, huber_loss and
 softsign operators

---
 include/infinicore/ops/broadcast_to.hpp | 18 ++
 include/infinicore/ops/huber_loss.hpp | 21 ++
 include/infinicore/ops/linear.hpp | 1 +
 include/infinicore/ops/softplus.hpp | 20 ++
 include/infinicore/ops/softsign.hpp | 16 +
 include/infiniop.h | 5 +-
 include/infiniop/ops/broadcast_to.h | 24 ++
 include/infiniop/ops/huber_loss.h | 28 ++
 include/infiniop/ops/softplus.h | 48 +--
 include/infiniop/ops/softsign.h | 25 ++
 python/infinicore/__init__.py | 2 +
 python/infinicore/nn/functional/__init__.py | 6 +
 python/infinicore/nn/functional/huber_loss.py | 50 ++++
 python/infinicore/nn/functional/linear.py | 3 +
 python/infinicore/nn/functional/softplus.py | 13 +
 python/infinicore/nn/functional/softsign.py | 11 +
 python/infinicore/ops/broadcast_to.py | 11 +
 .../ops/broadcast_to/broadcast_to.cc | 29 ++
 .../ops/broadcast_to/broadcast_to_infiniop.cc | 63 ++++
 src/infinicore/ops/huber_loss/huber_loss.cc | 35 +++
 .../ops/huber_loss/huber_loss_infiniop.cc | 68 +++++
 src/infinicore/ops/softplus/softplus.cc | 34 +++
 .../ops/softplus/softplus_infiniop.cc | 49 ++++
 src/infinicore/ops/softsign/softsign.cc | 27 ++
 .../ops/softsign/softsign_infiniop.cc | 50 ++++
 src/infinicore/pybind11/ops.hpp | 13 +-
 src/infinicore/pybind11/ops/broadcast_to.hpp | 26 ++
 src/infinicore/pybind11/ops/huber_loss.hpp | 44 +++
 src/infinicore/pybind11/ops/softplus.hpp | 30 ++
 src/infinicore/pybind11/ops/softsign.hpp | 24 ++
 src/infiniop/ops/broadcast_to/broadcast_to.h | 48 +++
 .../ops/broadcast_to/cpu/broadcast_to_cpu.cc | 116 ++++++++
 .../ops/broadcast_to/cpu/broadcast_to_cpu.h | 8 +
 src/infiniop/ops/broadcast_to/cuda/kernel.cuh | 53 ++++
 src/infiniop/ops/broadcast_to/info.h | 97 +++++++
 .../broadcast_to/metax/broadcast_to_metax.h | 8 +
 .../metax/broadcast_to_metax.maca | 177 ++++++++++++
 .../broadcast_to/moore/broadcast_to_moore.h | 8 +
 .../broadcast_to/moore/broadcast_to_moore.mu | 122 ++++++++
 .../moore/broadcast_to_moore_kernel.h | 57 ++++
 .../nvidia/broadcast_to_nvidia.cu | 114 ++++++++
 .../nvidia/broadcast_to_nvidia.cuh | 8 +
 src/infiniop/ops/broadcast_to/operator.cc | 176 +++++++++++
 .../ops/huber_loss/cpu/huber_loss_cpu.cc | 140 +++++++++
 .../ops/huber_loss/cpu/huber_loss_cpu.h | 8 +
 src/infiniop/ops/huber_loss/cuda/kernel.cuh | 106 +++++++
 src/infiniop/ops/huber_loss/huber_loss.h | 49 ++++
 src/infiniop/ops/huber_loss/info.h | 79 +++++
 .../ops/huber_loss/metax/huber_loss_metax.h | 8 +
 .../huber_loss/metax/huber_loss_metax.maca | 273 ++++++++++++++++++
 .../ops/huber_loss/moore/huber_loss_moore.h | 8 +
 .../ops/huber_loss/moore/huber_loss_moore.mu | 153 ++++++++++
 .../moore/huber_loss_moore_kernel.h | 141 +++++++++
 .../huber_loss/nvidia/huber_loss_nvidia.cu | 119 ++++++++
 .../huber_loss/nvidia/huber_loss_nvidia.cuh | 8 +
 src/infiniop/ops/huber_loss/operator.cc | 180 ++++++++++++
 src/infiniop/ops/softplus/cpu/softplus_cpu.cc | 121 ++++++--
 src/infiniop/ops/softplus/cpu/softplus_cpu.h | 21 +-
 src/infiniop/ops/softplus/cuda/kernel.cuh | 60 +++-
 src/infiniop/ops/softplus/info.h | 106 +++++++
 .../ops/softplus/metax/softplus_metax.h | 4 +-
 .../ops/softplus/metax/softplus_metax.maca | 204 +++++++++++--
 .../ops/softplus/moore/softplus_moore.h | 8 +
 .../ops/softplus/moore/softplus_moore.mu | 181 ++++++++++++
 .../softplus/moore/softplus_moore_kernel.h | 64 ++++
 .../ops/softplus/nvidia/softplus_nvidia.cu | 176 +++++++++++--
 .../ops/softplus/nvidia/softplus_nvidia.cuh | 10 +-
 src/infiniop/ops/softplus/operator.cc | 41 ++-
 src/infiniop/ops/softplus/softplus.h | 49 ++++
 src/infiniop/ops/softsign/cpu/softsign_cpu.cc | 55 ++++
 src/infiniop/ops/softsign/cpu/softsign_cpu.h | 21 ++
 src/infiniop/ops/softsign/cuda/kernel.cuh | 42 +++
 .../ops/softsign/metax/softsign_metax.h | 8 +
 .../ops/softsign/metax/softsign_metax.maca | 95 ++++++
 .../ops/softsign/moore/softsign_moore.h | 8 +
 .../ops/softsign/moore/softsign_moore.mu | 69 +++++
 .../softsign/moore/softsign_moore_kernel.h | 141 +++++++++
 .../ops/softsign/nvidia/softsign_nvidia.cu | 56 ++++
 .../ops/softsign/nvidia/softsign_nvidia.cuh | 8 +
 src/infiniop/ops/softsign/operator.cc | 201 +++++++++++
 test/infinicore/ops/broadcast_to.py | 5 +-
 test/infinicore/ops/huber_loss.py | 5 +-
 test/infinicore/ops/softplus.py | 5 +-
 test/infinicore/ops/softsign.py | 5 +-
 84 files changed, 4681 insertions(+), 146 deletions(-)
 create mode 100644 include/infinicore/ops/broadcast_to.hpp
 create mode 100644 include/infinicore/ops/huber_loss.hpp
 create mode 100644 include/infinicore/ops/softplus.hpp
 create mode 100644 include/infinicore/ops/softsign.hpp
 create mode 100644 include/infiniop/ops/broadcast_to.h
 create mode 100644 include/infiniop/ops/huber_loss.h
 create mode 100644 include/infiniop/ops/softsign.h
 create mode 100644 python/infinicore/nn/functional/huber_loss.py
 create mode 100644 python/infinicore/nn/functional/softplus.py
 create mode 100644 python/infinicore/nn/functional/softsign.py
 create mode 100644 python/infinicore/ops/broadcast_to.py
 create mode 100644 src/infinicore/ops/broadcast_to/broadcast_to.cc
 create mode 100644 src/infinicore/ops/broadcast_to/broadcast_to_infiniop.cc
 create mode 100644 src/infinicore/ops/huber_loss/huber_loss.cc
 create mode 100644 src/infinicore/ops/huber_loss/huber_loss_infiniop.cc
 create mode 100644 src/infinicore/ops/softplus/softplus.cc
 create mode 100644 src/infinicore/ops/softplus/softplus_infiniop.cc
 create mode 100644 src/infinicore/ops/softsign/softsign.cc
 create mode 100644 src/infinicore/ops/softsign/softsign_infiniop.cc
 create mode 100644 src/infinicore/pybind11/ops/broadcast_to.hpp
 create mode 100644 src/infinicore/pybind11/ops/huber_loss.hpp
 create mode 100644 src/infinicore/pybind11/ops/softplus.hpp
 create mode 100644 src/infinicore/pybind11/ops/softsign.hpp
 create mode 100644 src/infiniop/ops/broadcast_to/broadcast_to.h
 create mode 100644 src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.cc
 create mode 100644 src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.h
 create mode 100644 src/infiniop/ops/broadcast_to/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/broadcast_to/info.h
 create mode 100644 src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.h
 create mode 100644 src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.maca
 create mode 100644 src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.h
 create mode 100644 src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.mu
 create mode 100644 src/infiniop/ops/broadcast_to/moore/broadcast_to_moore_kernel.h
 create mode 100644 src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cu
 create mode 100644 src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cuh
 create mode 100644 src/infiniop/ops/broadcast_to/operator.cc
 create mode 100644 src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.cc
 create mode 100644 src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.h
 create mode 100644 src/infiniop/ops/huber_loss/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/huber_loss/huber_loss.h
 create mode 100644 src/infiniop/ops/huber_loss/info.h
 create mode 100644 src/infiniop/ops/huber_loss/metax/huber_loss_metax.h
 create mode 100644 src/infiniop/ops/huber_loss/metax/huber_loss_metax.maca
 create mode 100644 src/infiniop/ops/huber_loss/moore/huber_loss_moore.h
 create mode 100644 src/infiniop/ops/huber_loss/moore/huber_loss_moore.mu
 create mode 100644 src/infiniop/ops/huber_loss/moore/huber_loss_moore_kernel.h
 create mode 100644 src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cu
 create mode 100644 src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cuh
 create mode 100644 src/infiniop/ops/huber_loss/operator.cc
 create mode 100644 src/infiniop/ops/softplus/info.h
 create mode 100644 src/infiniop/ops/softplus/moore/softplus_moore.h
 create mode 100644 src/infiniop/ops/softplus/moore/softplus_moore.mu
 create mode 100644 src/infiniop/ops/softplus/moore/softplus_moore_kernel.h
 create mode 100644 src/infiniop/ops/softplus/softplus.h
 create mode 100644 src/infiniop/ops/softsign/cpu/softsign_cpu.cc
 create mode 100644 src/infiniop/ops/softsign/cpu/softsign_cpu.h
 create mode 100644 src/infiniop/ops/softsign/cuda/kernel.cuh
 create mode 100644 src/infiniop/ops/softsign/metax/softsign_metax.h
 create mode 100644 src/infiniop/ops/softsign/metax/softsign_metax.maca
 create mode 100644 src/infiniop/ops/softsign/moore/softsign_moore.h
 create mode 100644 src/infiniop/ops/softsign/moore/softsign_moore.mu
 create mode 100644 src/infiniop/ops/softsign/moore/softsign_moore_kernel.h
 create mode 100644 src/infiniop/ops/softsign/nvidia/softsign_nvidia.cu
 create mode 100644 src/infiniop/ops/softsign/nvidia/softsign_nvidia.cuh
 create mode 100644 src/infiniop/ops/softsign/operator.cc

diff --git a/include/infinicore/ops/broadcast_to.hpp b/include/infinicore/ops/broadcast_to.hpp
new file mode 100644
index 000000000..a1667043c
--- /dev/null
+++ b/include/infinicore/ops/broadcast_to.hpp
@@ -0,0 +1,18 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+#include <vector>
+
+namespace infinicore::op {
+class BroadcastTo {
+public:
+    // Schema: Output(y), Input(x)
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor y, Tensor x);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+Tensor broadcast_to(Tensor x, const std::vector<size_t> &shape);
+void broadcast_to_(Tensor y, Tensor x);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/huber_loss.hpp b/include/infinicore/ops/huber_loss.hpp
new file mode 100644
index 000000000..6dac4db35
--- /dev/null
+++ b/include/infinicore/ops/huber_loss.hpp
@@ -0,0 +1,21 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class HuberLoss {
+public:
+    // Schema: output, input, target, delta, reduction
+    using schema = void (*)(Tensor, Tensor, Tensor, float, int64_t);
+
+    static void execute(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+
+// delta defaults to 1.0f; reduction defaults to 1 (MEAN)
+Tensor huber_loss(Tensor input, Tensor target, float delta = 1.0f, int64_t reduction = 1);
+void huber_loss_(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/linear.hpp b/include/infinicore/ops/linear.hpp
index d69842be3..56145f51f 100644
--- a/include/infinicore/ops/linear.hpp
+++ b/include/infinicore/ops/linear.hpp
@@ -3,6 +3,7 @@
 #include "common/op.hpp"
 #include <optional>
+
 namespace infinicore::op {
 
 Tensor linear(Tensor input, Tensor weight, std::optional<Tensor> bias);
diff --git a/include/infinicore/ops/softplus.hpp b/include/infinicore/ops/softplus.hpp
new file mode 100644
index 000000000..1e08b32dc
--- /dev/null
+++ b/include/infinicore/ops/softplus.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+
+class Softplus {
+public:
+    // The schema carries the extra float beta and float threshold parameters
+    using schema = void (*)(Tensor, Tensor, float, float);
+    static void execute(Tensor y, Tensor x, float beta, float threshold);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+// defaults: beta = 1.0, threshold = 20.0
+Tensor softplus(Tensor x, float beta = 1.0f, float threshold = 20.0f);
+
+void softplus_(Tensor y, Tensor x, float beta = 1.0f, float threshold = 20.0f);
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infinicore/ops/softsign.hpp b/include/infinicore/ops/softsign.hpp
new file mode 100644
index 000000000..eaaf1eb13
--- /dev/null
+++ b/include/infinicore/ops/softsign.hpp
@@ -0,0 +1,16 @@
+#pragma once
+
+#include "../device.hpp"
+#include "common/op.hpp"
+
+namespace infinicore::op {
+class Softsign {
+public:
+    using schema = void (*)(Tensor, Tensor);
+    static void execute(Tensor y, Tensor x);
+    static common::OpDispatcher<schema> &dispatcher();
+};
+// Functional interface that returns a new Tensor
+Tensor softsign(Tensor x);
+void softsign_(Tensor y, Tensor x);
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/include/infiniop.h b/include/infiniop.h
index c0a09fcb4..e9d850143 100644
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -27,13 +27,16 @@
 #include "infiniop/ops/sigmoid.h"
 #include "infiniop/ops/silu.h"
 #include "infiniop/ops/softmax.h"
-#include "infiniop/ops/softplus.h"
 #include "infiniop/ops/sub.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/ops/tanh.h"
 #include "infiniop/ops/topkrouter.h"
 #include "infiniop/ops/topksoftmax.h"
 #include "infiniop/ops/zeros.h"
+#include "infiniop/ops/broadcast_to.h"
+#include "infiniop/ops/softplus.h"
+#include "infiniop/ops/softsign.h"
+#include "infiniop/ops/huber_loss.h"
 #include "infiniop/tensor_descriptor.h"
 
 #endif // __INFINIOP_API_H__
diff --git a/include/infiniop/ops/broadcast_to.h b/include/infiniop/ops/broadcast_to.h
new file mode 100644
index 000000000..66612d7cc
--- /dev/null
+++ b/include/infiniop/ops/broadcast_to.h
@@ -0,0 +1,24 @@
+#ifndef __INFINIOP_BROADCAST_TO_API_H__
+#define __INFINIOP_BROADCAST_TO_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopBroadcastToDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateBroadcastToDescriptor(infiniopHandle_t handle,
+                                                                infiniopBroadcastToDescriptor_t *desc_ptr,
+                                                                infiniopTensorDescriptor_t y,
+                                                                infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetBroadcastToWorkspaceSize(infiniopBroadcastToDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopBroadcastTo(infiniopBroadcastToDescriptor_t desc,
+                                                void *workspace,
+                                                size_t workspace_size,
+                                                void *y,
+                                                const void *x,
+                                                void *stream);
+
+__C __export infiniStatus_t infiniopDestroyBroadcastToDescriptor(infiniopBroadcastToDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/include/infiniop/ops/huber_loss.h b/include/infiniop/ops/huber_loss.h
new file mode 100644
index 000000000..bb6a22ec6
--- /dev/null
+++ b/include/infiniop/ops/huber_loss.h
@@ -0,0 +1,28 @@
+#ifndef __INFINIOP_HUBER_LOSS_API_H__
+#define __INFINIOP_HUBER_LOSS_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopHuberLossDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateHuberLossDescriptor(infiniopHandle_t handle,
+                                                              infiniopHuberLossDescriptor_t *desc_ptr,
+                                                              infiniopTensorDescriptor_t output,
+                                                              infiniopTensorDescriptor_t input,
+                                                              infiniopTensorDescriptor_t target,
+                                                              float delta,
+                                                              int reduction);
+
+__C __export infiniStatus_t infiniopGetHuberLossWorkspaceSize(infiniopHuberLossDescriptor_t desc, size_t *size);
+
+__C __export infiniStatus_t infiniopHuberLoss(infiniopHuberLossDescriptor_t desc,
+                                              void *workspace,
+                                              size_t workspace_size,
+                                              void *output,
+                                              const void *input,
+                                              const void *target,
+                                              void *stream);
+
+__C __export infiniStatus_t infiniopDestroyHuberLossDescriptor(infiniopHuberLossDescriptor_t desc);
+
+#endif // __INFINIOP_HUBER_LOSS_API_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/softplus.h b/include/infiniop/ops/softplus.h
index 408452ddd..5863e6a6c 100644
--- a/include/infiniop/ops/softplus.h
+++ b/include/infiniop/ops/softplus.h
@@ -1,24 +1,38 @@
-#ifndef __INFINIOP_SOFTPLUS_API_H__
-#define __INFINIOP_SOFTPLUS_API_H__
+#ifndef __INFINIOP_OPS_SOFTPLUS_H__
+#define __INFINIOP_OPS_SOFTPLUS_H__
+#include "../tensor_descriptor.h"
 
-#include "../operator_descriptor.h"
-
-typedef struct InfiniopDescriptor *infiniopSoftplusDescriptor_t;
+#ifdef __cplusplus
+extern "C" {
+#endif
 
-__C __export infiniStatus_t infiniopCreateSoftplusDescriptor(infiniopHandle_t handle,
-                                                             infiniopSoftplusDescriptor_t *desc_ptr,
-                                                             infiniopTensorDescriptor_t y,
-                                                             infiniopTensorDescriptor_t x);
+typedef struct InfiniopSoftplusDescriptor *infiniopSoftplusDescriptor_t;
+__C __export infiniStatus_t infiniopCreateSoftplusDescriptor(
+    infiniopHandle_t handle,
+    infiniopSoftplusDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc,
+    float beta,
+    float threshold);
 
-__C __export infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t desc, size_t *size);
+__C __export infiniStatus_t infiniopGetSoftplusWorkspaceSize(
+    infiniopSoftplusDescriptor_t desc,
+    size_t *size);
 
-__C __export infiniStatus_t infiniopSoftplus(infiniopSoftplusDescriptor_t desc,
-                                             void *workspace,
-                                             size_t workspace_size,
-                                             void *y,
-                                             const void *x,
-                                             void *stream);
+__C __export infiniStatus_t infiniopSoftplus(
+    infiniopSoftplusDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream);
 
-__C __export infiniStatus_t infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc);
+__C __export infiniStatus_t infiniopDestroySoftplusDescriptor(
+    infiniopSoftplusDescriptor_t desc);
+#ifdef __cplusplus
+}
 #endif
+
+#endif // __INFINIOP_OPS_SOFTPLUS_H__
\ No newline at end of file
diff --git a/include/infiniop/ops/softsign.h b/include/infiniop/ops/softsign.h
new file mode 100644
index 000000000..6239fbbc6
--- /dev/null
+++ b/include/infiniop/ops/softsign.h
@@ -0,0 +1,25 @@
+
+#ifndef __INFINIOP_SOFTSIGN_API_H__
+#define __INFINIOP_SOFTSIGN_API_H__
+
+#include "../operator_descriptor.h"
+
+typedef struct InfiniopDescriptor *infiniopSoftsignDescriptor_t;
+
+__C __export infiniStatus_t infiniopCreateSoftsignDescriptor(infiniopHandle_t handle,
+                                                             infiniopSoftsignDescriptor_t *desc_ptr,
+                                                             infiniopTensorDescriptor_t y,
+                                                             infiniopTensorDescriptor_t x);
+
+__C __export infiniStatus_t infiniopGetSoftsignWorkspaceSize(infiniopSoftsignDescriptor_t desc, size_t *size);
+__C __export infiniStatus_t infiniopSoftsign(infiniopSoftsignDescriptor_t desc,
+                                             void *workspace,
+                                             size_t workspace_size,
+                                             void *y,
+                                             const void *x,
+                                             void *stream);
+
+__C __export infiniStatus_t infiniopDestroySoftsignDescriptor(infiniopSoftsignDescriptor_t desc);
+
+#endif
\ No newline at end of file
diff --git a/python/infinicore/__init__.py b/python/infinicore/__init__.py
index b7288f3ac..fa92854ab 100644
--- a/python/infinicore/__init__.py
+++ b/python/infinicore/__init__.py
@@ -51,6 +51,7 @@
 from infinicore.ops.rearrange import rearrange
 from infinicore.ops.squeeze import squeeze
 from infinicore.ops.unsqueeze import unsqueeze
+from infinicore.ops.broadcast_to import broadcast_to
 from infinicore.tensor import (
     Tensor,
     empty,
@@ -125,6 +126,7 @@
     "paged_attention",
     "paged_attention_prefill",
     "ones",
+    "broadcast_to",
     "strided_empty",
     "strided_from_blob",
     "zeros",
diff --git a/python/infinicore/nn/functional/__init__.py b/python/infinicore/nn/functional/__init__.py
index 255079790..c28c8a8aa 100644
--- a/python/infinicore/nn/functional/__init__.py
+++ b/python/infinicore/nn/functional/__init__.py
@@ -5,6 +5,9 @@
 from .rms_norm import rms_norm
 from .rope import RopeAlgo, rope
 from .silu import silu
+from .softplus import softplus
+from .softsign import softsign
+from .huber_loss import huber_loss
 from .swiglu import swiglu
 
 __all__ = [
@@ -17,4 +20,7 @@
     "embedding",
     "rope",
     "RopeAlgo",
+    "softplus",
+    "softsign",
+    "huber_loss",
 ]
diff --git a/python/infinicore/nn/functional/huber_loss.py b/python/infinicore/nn/functional/huber_loss.py
new file mode 100644
index 000000000..a9845e4b6
--- /dev/null
+++ b/python/infinicore/nn/functional/huber_loss.py
@@ -0,0 +1,50 @@
+from typing import Optional
+
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+_REDUCTION_MODES = {
+    "none": 0,
+    "mean": 1,
+    "sum": 2,
+}
+
+
+def huber_loss(
+    input: Tensor,
+    target: Tensor,
+    delta: float = 1.0,
+    reduction: str = "mean",
+    *,
+    out: Optional[Tensor] = None,
+) -> Tensor:
+    r"""Computes the Huber loss, which uses a squared term if the absolute
+    element-wise error falls below delta and a delta-scaled L1 term otherwise.
+    """
+
+    if not input.is_contiguous():
+        input = input.contiguous()
+    if not target.is_contiguous():
+        target = target.contiguous()
+
+    # Resolve the reduction mode
+    if reduction not in _REDUCTION_MODES:
+        raise ValueError(f"{reduction} is not a valid value for reduction")
+    reduction_val = _REDUCTION_MODES[reduction]
+
+    if out is not None:
+        _infinicore.huber_loss_(
+            out._underlying,
+            input._underlying,
+            target._underlying,
+            delta,
+            reduction_val,
+        )
+        return out
+
+    return Tensor(
+        _infinicore.huber_loss(
+            input._underlying,
+            target._underlying,
+            delta,
+            reduction_val,
+        )
+    )
\ No newline at end of file
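Note: the reduction mapping above follows the PyTorch convention ("none"=0, "mean"=1, "sum"=2). A minimal NumPy reference for the element-wise math that the backends below are expected to match (illustrative only, not part of this patch):

    import numpy as np

    def huber_loss_ref(input, target, delta=1.0, reduction="mean"):
        # Quadratic branch inside |error| <= delta, delta-scaled L1 branch outside.
        err = np.abs(input - target)
        loss = np.where(err <= delta, 0.5 * err * err, delta * (err - 0.5 * delta))
        if reduction == "none":
            return loss  # same shape as the input
        return loss.mean() if reduction == "mean" else loss.sum()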
diff --git a/python/infinicore/nn/functional/linear.py b/python/infinicore/nn/functional/linear.py
index 22ab9b1ef..f77953aae 100644
--- a/python/infinicore/nn/functional/linear.py
+++ b/python/infinicore/nn/functional/linear.py
@@ -1,6 +1,9 @@
 from infinicore.lib import _infinicore
 from infinicore.tensor import Tensor
 
+__all__ = ["linear"]
+
+
 def linear(input: Tensor, weight: Tensor, bias=None, *, out=None) -> Tensor:
     r"""Applies a linear transformation to the incoming data: y = xA^T + b."""
 
diff --git a/python/infinicore/nn/functional/softplus.py b/python/infinicore/nn/functional/softplus.py
new file mode 100644
index 000000000..d393e1646
--- /dev/null
+++ b/python/infinicore/nn/functional/softplus.py
@@ -0,0 +1,13 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def softplus(input, beta=1, threshold=20, *, out=None):
+    if out is None:
+        # Forward beta and threshold to the underlying C++ implementation
+        return Tensor(_infinicore.softplus(input._underlying, beta, threshold))
+
+    # Explicit-output variant: forward beta and threshold as well
+    _infinicore.softplus_(out._underlying, input._underlying, beta, threshold)
+
+    return out
\ No newline at end of file
diff --git a/python/infinicore/nn/functional/softsign.py b/python/infinicore/nn/functional/softsign.py
new file mode 100644
index 000000000..47a94a34e
--- /dev/null
+++ b/python/infinicore/nn/functional/softsign.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+def softsign(input, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.softsign(input._underlying))
+
+    _infinicore.softsign_(out._underlying, input._underlying)
+
+    return out
\ No newline at end of file
diff --git a/python/infinicore/ops/broadcast_to.py b/python/infinicore/ops/broadcast_to.py
new file mode 100644
index 000000000..203dc8498
--- /dev/null
+++ b/python/infinicore/ops/broadcast_to.py
@@ -0,0 +1,11 @@
+from infinicore.lib import _infinicore
+from infinicore.tensor import Tensor
+
+
+# The parameter is named 'size' to match the test harness, which calls
+# broadcast_to(...) with kwargs={"size": ...}.
+def broadcast_to(input, size, *, out=None):
+    if out is None:
+        return Tensor(_infinicore.broadcast_to(input._underlying, size))
+
+    _infinicore.broadcast_to_(out._underlying, input._underlying)
+
+    return out
\ No newline at end of file
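Note: the operator materializes the view that NumPy builds lazily, i.e. shapes are right-aligned and size-1 axes are read through stride 0. A quick model of the expected semantics (illustrative):

    import numpy as np

    x = np.arange(12, dtype=np.float32).reshape(3, 4)
    y = np.broadcast_to(x, (2, 3, 4))  # NumPy keeps this as a stride-0 view
    assert y.shape == (2, 3, 4)
    assert (y[0] == y[1]).all()        # both "copies" read the same input rows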
diff --git a/src/infinicore/ops/broadcast_to/broadcast_to.cc b/src/infinicore/ops/broadcast_to/broadcast_to.cc
new file mode 100644
index 000000000..a96123aa0
--- /dev/null
+++ b/src/infinicore/ops/broadcast_to/broadcast_to.cc
@@ -0,0 +1,29 @@
+#include "infinicore/ops/broadcast_to.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<BroadcastTo::schema> &BroadcastTo::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void BroadcastTo::execute(Tensor y, Tensor x) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+    infinicore::context::setDevice(y->device());
+    dispatcher().lookup(y->device().getType())(y, x);
+}
+
+Tensor broadcast_to(Tensor x, const std::vector<size_t> &shape) {
+    Shape target_shape(shape.begin(), shape.end());
+
+    auto y = Tensor::empty(target_shape, x->dtype(), x->device());
+    broadcast_to_(y, x);
+    return y;
+}
+
+void broadcast_to_(Tensor y, Tensor x) {
+    BroadcastTo::execute(y, x);
+}
+
+} // namespace infinicore::op
diff --git a/src/infinicore/ops/broadcast_to/broadcast_to_infiniop.cc b/src/infinicore/ops/broadcast_to/broadcast_to_infiniop.cc
new file mode 100644
index 000000000..c21c83797
--- /dev/null
+++ b/src/infinicore/ops/broadcast_to/broadcast_to_infiniop.cc
@@ -0,0 +1,63 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/broadcast_to.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::broadcast_to_impl::infiniop {
+
+// Descriptor cache
+thread_local common::OpCache<size_t, infiniopBroadcastToDescriptor_t> caches(
+    100, // capacity
+    [](infiniopBroadcastToDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyBroadcastToDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x) {
+    size_t seed = hash_combine(y, x);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopBroadcastToDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // Create the descriptor on a cache miss
+        INFINICORE_CHECK_ERROR(infiniopCreateBroadcastToDescriptor(
+            context::getInfiniopHandle(device),
+            &desc,
+            y->desc(),
+            x->desc()));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query the workspace size, then run the kernel
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetBroadcastToWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopBroadcastTo(
+        desc,
+        workspace->data(),
+        workspace_size,
+        y->data(),
+        x->data(),
+        context::getStream()));
+}
+
+// Register this implementation with the dispatcher
+static bool registered = []() {
+    BroadcastTo::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::broadcast_to_impl::infiniop
\ No newline at end of file
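Note: the thread_local caches above avoid re-creating an infiniop descriptor on every call; the call signature is hashed into a seed, and descriptors are evicted LRU-style once the capacity is exceeded. A Python sketch of that pattern, inferred from the usage here (the real OpCache lives in infinicore/ops/common/cache.hpp):

    from collections import OrderedDict

    class OpCacheSketch:
        """LRU map from seed hash to descriptor, destroying evicted entries."""

        def __init__(self, capacity=100, on_evict=lambda desc: None):
            self.capacity, self.on_evict = capacity, on_evict
            self.items = OrderedDict()

        def get(self, seed):
            if seed not in self.items:
                return None
            self.items.move_to_end(seed)  # mark as most recently used
            return self.items[seed]

        def put(self, seed, desc):
            self.items[seed] = desc
            self.items.move_to_end(seed)
            if len(self.items) > self.capacity:
                _, evicted = self.items.popitem(last=False)
                self.on_evict(evicted)  # mirrors infiniopDestroy*Descriptor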
diff --git a/src/infinicore/ops/huber_loss/huber_loss.cc b/src/infinicore/ops/huber_loss/huber_loss.cc
new file mode 100644
index 000000000..38e442c1d
--- /dev/null
+++ b/src/infinicore/ops/huber_loss/huber_loss.cc
@@ -0,0 +1,35 @@
+#include "infinicore/ops/huber_loss.hpp"
+
+namespace infinicore::op {
+
+// Dispatcher singleton
+common::OpDispatcher<HuberLoss::schema> &HuberLoss::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void HuberLoss::execute(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction) {
+    dispatcher().lookup(context::getDevice().getType())(output, input, target, delta, reduction);
+}
+
+// Functional interface
+Tensor huber_loss(Tensor input, Tensor target, float delta, int64_t reduction) {
+    Shape output_shape;
+    if (reduction == 0) { // none
+        // Huber loss is element-wise; with reduction='none' the output keeps the input shape
+        output_shape = input->shape();
+    } else {
+        output_shape = {}; // scalar
+    }
+
+    auto output = Tensor::empty(output_shape, input->dtype(), input->device());
+
+    huber_loss_(output, input, target, delta, reduction);
+    return output;
+}
+
+void huber_loss_(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction) {
+    HuberLoss::execute(output, input, target, delta, reduction);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/huber_loss/huber_loss_infiniop.cc b/src/infinicore/ops/huber_loss/huber_loss_infiniop.cc
new file mode 100644
index 000000000..9d04bfb30
--- /dev/null
+++ b/src/infinicore/ops/huber_loss/huber_loss_infiniop.cc
@@ -0,0 +1,68 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include "infinicore/ops/huber_loss.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::huber_loss_impl::infiniop {
+
+// Descriptor cache
+thread_local common::OpCache<size_t, infiniopHuberLossDescriptor_t> caches(
+    100, // capacity
+    [](infiniopHuberLossDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroyHuberLossDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor output, Tensor input, Tensor target, float delta, int64_t reduction) {
+    size_t seed = hash_combine(output, input, target, delta, reduction);
+
+    auto device_type = context::getDevice().getType();
+    auto device_index = context::getDevice().getIndex();
+
+    auto &cache = caches.getCache(device_type, device_index);
+
+    auto desc_opt = cache.get(seed);
+    infiniopHuberLossDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        // Create the descriptor on a cache miss
+        INFINICORE_CHECK_ERROR(infiniopCreateHuberLossDescriptor(
+            context::getInfiniopHandle(output->device()),
+            &desc,
+            output->desc(),
+            input->desc(),
+            target->desc(),
+            delta,
+            static_cast<int>(reduction)));
+
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query the workspace size, then run the kernel
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetHuberLossWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    INFINICORE_CHECK_ERROR(infiniopHuberLoss(
+        desc,
+        workspace->data(),
+        workspace_size,
+        output->data(),
+        input->data(),
+        target->data(),
+        context::getStream()));
+}
+
+static bool registered = []() {
+    HuberLoss::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::huber_loss_impl::infiniop
\ No newline at end of file
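Note: as a sanity check of the shape logic above (reduction 0 keeps the input shape, 1 and 2 reduce to a scalar), a worked numeric example with delta = 1.0 (illustrative):

    import numpy as np

    inp = np.array([0.0, 2.0], dtype=np.float32)
    tgt = np.array([0.5, 0.0], dtype=np.float32)
    # |err| = [0.5, 2.0]; 0.5 takes the quadratic branch: 0.5 * 0.5**2 = 0.125,
    # 2.0 takes the linear branch: 1.0 * (2.0 - 0.5) = 1.5
    err = np.abs(inp - tgt)
    loss = np.where(err <= 1.0, 0.5 * err * err, err - 0.5)
    assert np.allclose(loss, [0.125, 1.5])   # reduction = 0 ("none")
    assert np.isclose(loss.mean(), 0.8125)   # reduction = 1 ("mean")
    assert np.isclose(loss.sum(), 1.625)     # reduction = 2 ("sum")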
diff --git a/src/infinicore/ops/softplus/softplus.cc b/src/infinicore/ops/softplus/softplus.cc
new file mode 100644
index 000000000..c3e6c6e83
--- /dev/null
+++ b/src/infinicore/ops/softplus/softplus.cc
@@ -0,0 +1,34 @@
+#include "infinicore/ops/softplus.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Softplus::schema> &Softplus::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Softplus::execute(Tensor y, Tensor x, float beta, float threshold) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+    infinicore::context::setDevice(y->device());
+
+    // Forward beta and threshold to the backend implementation (infiniop wrapper)
+    dispatcher().lookup(y->device().getType())(y, x, beta, threshold);
+}
+
+Tensor softplus(Tensor x, float beta, float threshold) {
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    softplus_(y, x, beta, threshold);
+    return y;
+}
+
+void softplus_(Tensor y, Tensor x, float beta, float threshold) {
+    Softplus::execute(y, x, beta, threshold);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/softplus/softplus_infiniop.cc b/src/infinicore/ops/softplus/softplus_infiniop.cc
new file mode 100644
index 000000000..7a62f5ffa
--- /dev/null
+++ b/src/infinicore/ops/softplus/softplus_infiniop.cc
@@ -0,0 +1,49 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/softplus.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::softplus_impl::infiniop {
+thread_local common::OpCache<size_t, infiniopSoftplusDescriptor_t> caches(
+    100, // capacity
+    [](infiniopSoftplusDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroySoftplusDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+void calculate(Tensor y, Tensor x, float beta, float threshold) {
+    size_t seed = hash_combine(y, x, beta, threshold);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopSoftplusDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateSoftplusDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), x->desc(), beta, threshold));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query and allocate the workspace
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetSoftplusWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    // Run the computation
+    INFINICORE_CHECK_ERROR(infiniopSoftplus(
+        desc, workspace->data(), workspace_size,
+        y->data(), x->data(), context::getStream()));
+}
+static bool registered = []() {
+    Softplus::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::softplus_impl::infiniop
\ No newline at end of file
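Note: the beta/threshold plumbing follows the PyTorch convention; once beta*x exceeds the threshold, softplus falls back to the identity so exp() cannot overflow. A reference sketch of the math the kernels are expected to implement (illustrative):

    import numpy as np

    def softplus_ref(x, beta=1.0, threshold=20.0):
        z = beta * x
        # log1p(exp(z)) / beta below the threshold, plain x above it.
        return np.where(z > threshold, x,
                        np.log1p(np.exp(np.minimum(z, threshold))) / beta)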
diff --git a/src/infinicore/ops/softsign/softsign.cc b/src/infinicore/ops/softsign/softsign.cc
new file mode 100644
index 000000000..3a4d3c69e
--- /dev/null
+++ b/src/infinicore/ops/softsign/softsign.cc
@@ -0,0 +1,27 @@
+#include "infinicore/ops/softsign.hpp"
+#include "../../utils.hpp"
+
+namespace infinicore::op {
+
+common::OpDispatcher<Softsign::schema> &Softsign::dispatcher() {
+    static common::OpDispatcher<schema> dispatcher_;
+    return dispatcher_;
+};
+
+void Softsign::execute(Tensor y, Tensor x) {
+    INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
+    infinicore::context::setDevice(y->device());
+    dispatcher().lookup(y->device().getType())(y, x);
+}
+
+Tensor softsign(Tensor x) {
+    auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
+    softsign_(y, x);
+    return y;
+}
+
+void softsign_(Tensor y, Tensor x) {
+    Softsign::execute(y, x);
+}
+
+} // namespace infinicore::op
\ No newline at end of file
diff --git a/src/infinicore/ops/softsign/softsign_infiniop.cc b/src/infinicore/ops/softsign/softsign_infiniop.cc
new file mode 100644
index 000000000..c7ea317e1
--- /dev/null
+++ b/src/infinicore/ops/softsign/softsign_infiniop.cc
@@ -0,0 +1,50 @@
+#include "../../utils.hpp"
+#include "infinicore/common/hash.hpp"
+#include "infinicore/ops/softsign.hpp"
+#include "infinicore/ops/common/cache.hpp"
+#include <infiniop.h>
+
+namespace infinicore::op::softsign_impl::infiniop {
+thread_local common::OpCache<size_t, infiniopSoftsignDescriptor_t> caches(
+    100, // capacity
+    [](infiniopSoftsignDescriptor_t &desc) {
+        if (desc != nullptr) {
+            INFINICORE_CHECK_ERROR(infiniopDestroySoftsignDescriptor(desc));
+            desc = nullptr;
+        }
+    });
+
+void calculate(Tensor y, Tensor x) {
+    size_t seed = hash_combine(y, x);
+
+    auto device = context::getDevice();
+    auto &cache = caches.getCache(device);
+
+    auto desc_opt = cache.get(seed);
+    infiniopSoftsignDescriptor_t desc = nullptr;
+
+    if (!desc_opt) {
+        INFINICORE_CHECK_ERROR(infiniopCreateSoftsignDescriptor(
+            context::getInfiniopHandle(device), &desc,
+            y->desc(), x->desc()));
+        cache.put(seed, desc);
+    } else {
+        desc = *desc_opt;
+    }
+
+    // Query the workspace size
+    size_t workspace_size = 0;
+    INFINICORE_CHECK_ERROR(infiniopGetSoftsignWorkspaceSize(desc, &workspace_size));
+    std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
+
+    // Run the computation
+    INFINICORE_CHECK_ERROR(infiniopSoftsign(
+        desc, workspace->data(), workspace_size,
+        y->data(), x->data(), context::getStream()));
+}
+static bool registered = []() {
+    Softsign::dispatcher().registerAll(&calculate, false);
+    return true;
+}();
+
+} // namespace infinicore::op::softsign_impl::infiniop
\ No newline at end of file
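Note: for reference, softsign is y = x / (1 + |x|), a smooth squashing onto (-1, 1) that, unlike tanh, needs no exponential. Equivalent NumPy sketch (illustrative):

    import numpy as np

    def softsign_ref(x):
        return x / (1.0 + np.abs(x))

    assert np.allclose(softsign_ref(np.array([-4.0, 0.0, 4.0])), [-0.8, 0.0, 0.8])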
diff --git a/src/infinicore/pybind11/ops.hpp b/src/infinicore/pybind11/ops.hpp
index 431c3a37b..4caf293eb 100644
--- a/src/infinicore/pybind11/ops.hpp
+++ b/src/infinicore/pybind11/ops.hpp
@@ -7,7 +7,6 @@
 #include "ops/attention.hpp"
 #include "ops/causal_softmax.hpp"
 #include "ops/embedding.hpp"
-#include "ops/linear.hpp"
 #include "ops/matmul.hpp"
 #include "ops/mul.hpp"
 #include "ops/paged_attention.hpp"
@@ -18,7 +17,11 @@
 #include "ops/rope.hpp"
 #include "ops/silu.hpp"
 #include "ops/swiglu.hpp"
-
+#include "ops/softsign.hpp"
+#include "ops/softplus.hpp"
+#include "ops/broadcast_to.hpp"
+#include "ops/linear.hpp"
+#include "ops/huber_loss.hpp"
 namespace py = pybind11;
 
 namespace infinicore::ops {
@@ -29,7 +32,6 @@
     bind_attention(m);
     bind_causal_softmax(m);
     bind_random_sample(m);
-    bind_linear(m);
     bind_matmul(m);
     bind_mul(m);
     bind_paged_attention(m);
@@ -39,6 +41,11 @@
     bind_silu(m);
     bind_swiglu(m);
     bind_rope(m);
+    bind_broadcast_to(m);
+    bind_softplus(m);
+    bind_softsign(m);
+    bind_linear(m);
+    bind_huber_loss(m);
     bind_embedding(m);
 }
 
diff --git a/src/infinicore/pybind11/ops/broadcast_to.hpp b/src/infinicore/pybind11/ops/broadcast_to.hpp
new file mode 100644
index 000000000..ca2832270
--- /dev/null
+++ b/src/infinicore/pybind11/ops/broadcast_to.hpp
@@ -0,0 +1,26 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+
+#include "infinicore/ops/broadcast_to.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_broadcast_to(py::module &m) {
+    m.def("broadcast_to",
+          &op::broadcast_to,
+          py::arg("x"),
+          py::arg("shape"),
+          R"doc(Broadcast a tensor to the target shape.)doc");
+
+    m.def("broadcast_to_",
+          &op::broadcast_to_,
+          py::arg("y"),
+          py::arg("x"),
+          R"doc(Explicit-output broadcast; writes the result into y.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/huber_loss.hpp b/src/infinicore/pybind11/ops/huber_loss.hpp
new file mode 100644
index 000000000..507b33254
--- /dev/null
+++ b/src/infinicore/pybind11/ops/huber_loss.hpp
@@ -0,0 +1,44 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+#include "infinicore/ops/huber_loss.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_huber_loss(py::module &m) {
+    m.def("huber_loss",
+          [](const Tensor &input, const Tensor &target, float delta, int reduction) {
+              return op::huber_loss(input, target, delta, reduction);
+          },
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("delta") = 1.0f,
+          py::arg("reduction") = 1,
+          R"doc(Computes the Huber loss between input and target.
+
+          Args:
+              input (Tensor): Input tensor of arbitrary shape.
+              target (Tensor): Ground-truth values, same shape as input.
+              delta (float, optional): Threshold at which to switch between the delta-scaled L1 and the L2 loss. Default: 1.0.
+              reduction (int, optional): Reduction applied to the output: 0=none, 1=mean, 2=sum. Default: 1.
+          )doc");
+
+    // Bind the explicit-output interface (huber_loss_)
+    m.def("huber_loss_",
+          [](Tensor &output, const Tensor &input, const Tensor &target, float delta, int reduction) {
+              // Forward to the C++ implementation
+              op::huber_loss_(output, input, target, delta, reduction);
+          },
+          py::arg("output"),
+          py::arg("input"),
+          py::arg("target"),
+          py::arg("delta") = 1.0f,
+          py::arg("reduction") = 1,
+          R"doc(Explicit-output Huber loss; writes the result into the output tensor.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/softplus.hpp b/src/infinicore/pybind11/ops/softplus.hpp
new file mode 100644
index 000000000..cb7c6d568
--- /dev/null
+++ b/src/infinicore/pybind11/ops/softplus.hpp
@@ -0,0 +1,30 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/softplus.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_softplus(py::module &m) {
+    // Functional interface: returns a new Tensor
+    m.def("softplus",
+          &op::softplus,
+          py::arg("x"),
+          py::arg("beta") = 1.0f,
+          py::arg("threshold") = 20.0f,
+          R"doc(Computes the softplus function element-wise: y = 1/beta * log(1 + exp(beta * x)).)doc");
+
+    // Explicit-output interface: writes to the provided output Tensor
+    m.def("softplus_",
+          &op::softplus_,
+          py::arg("y"),
+          py::arg("x"),
+          py::arg("beta") = 1.0f,
+          py::arg("threshold") = 20.0f,
+          R"doc(Explicit-output softplus activation. Writes the result into y.)doc");
+}
+
+} // namespace infinicore::ops
\ No newline at end of file
diff --git a/src/infinicore/pybind11/ops/softsign.hpp b/src/infinicore/pybind11/ops/softsign.hpp
new file mode 100644
index 000000000..1191abe02
--- /dev/null
+++ b/src/infinicore/pybind11/ops/softsign.hpp
@@ -0,0 +1,24 @@
+#pragma once
+
+#include <pybind11/pybind11.h>
+
+#include "infinicore/ops/softsign.hpp"
+
+namespace py = pybind11;
+
+namespace infinicore::ops {
+
+inline void bind_softsign(py::module &m) {
+    m.def("softsign",
+          &op::softsign,
+          py::arg("x"),
+          R"doc(Softsign activation function.)doc");
+
+    m.def("softsign_",
+          &op::softsign_,
+          py::arg("y"),
+          py::arg("x"),
+          R"doc(Explicit-output softsign activation.)doc");
+}
+
+} // namespace infinicore::ops
diff --git a/src/infiniop/ops/broadcast_to/broadcast_to.h b/src/infiniop/ops/broadcast_to/broadcast_to.h
new file mode 100644
index 000000000..39e2c3090
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/broadcast_to.h
@@ -0,0 +1,48 @@
+#ifndef __BROADCAST_TO_H__
+#define __BROADCAST_TO_H__
+
+#include "../../operator.h"
+#include "info.h" // provides the BroadcastToInfo definition
+#include <vector>
+
+// Macro that generates the Descriptor class in each backend namespace
+#define DESCRIPTOR(NAMESPACE)                                                \
+    namespace op::broadcast_to::NAMESPACE {                                  \
+    class Descriptor final : public InfiniopDescriptor {                     \
+        struct Opaque;                                                       \
+        Opaque *_opaque;                                                     \
+        BroadcastToInfo _info;                                               \
+        size_t _workspace_size;                                              \
+                                                                             \
+        Descriptor(                                                          \
+            Opaque *opaque,                                                  \
+            BroadcastToInfo info,                                            \
+            size_t workspace_size,                                           \
+            infiniDevice_t device_type,                                      \
+            int device_id)                                                   \
+            : InfiniopDescriptor{device_type, device_id},                    \
+              _opaque(opaque),                                               \
+              _info(info),                                                   \
+              _workspace_size(workspace_size) {}                             \
+                                                                             \
+    public:                                                                  \
+        ~Descriptor();                                                       \
+                                                                             \
+        size_t workspaceSize() const { return _workspace_size; }             \
+                                                                             \
+        static infiniStatus_t create(                                        \
+            infiniopHandle_t handle,                                         \
+            Descriptor **desc_ptr,                                           \
+            infiniopTensorDescriptor_t out_desc,                             \
+            const std::vector<infiniopTensorDescriptor_t> &input_descs);     \
+                                                                             \
+        infiniStatus_t calculate(                                            \
+            void *workspace,                                                 \
+            size_t workspace_size,                                           \
+            void *output,                                                    \
+            const std::vector<const void *> &inputs,                         \
+            void *stream) const;                                             \
+    };                                                                       \
+    }
+
+#endif // __BROADCAST_TO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.cc b/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.cc
new file mode 100644
index 000000000..2c09fecc8
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.cc
@@ -0,0 +1,116 @@
+#include "broadcast_to_cpu.h"
+#include "../../../devices/cpu/common_cpu.h"
+#include <cstddef>
+#include <cstdint>
+
+#include "../../../../utils/custom_types.h"
+
+namespace op::broadcast_to::cpu {
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    const std::vector<infiniopTensorDescriptor_t> &input_descs) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto result = BroadcastToInfo::create(out_desc, input_descs);
+    CHECK_RESULT(result);
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0, // the CPU implementation needs no workspace
+        handle->device,
+        handle->device_id);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+template <typename T>
+void calculate_cpu_impl(
+    const BroadcastToInfo &info,
+    void *output,
+    const void *input) {
+
+    size_t count = info.count();
+    int ndim = info.ndim();
+
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
+
+    // Walk every output element in parallel
+#pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < count; ++i) {
+        size_t temp_idx = i;
+        size_t input_offset = 0;
+
+        // Index transform: output linear index -> coordinate -> input linear offset
+        for (int d = 0; d < ndim; ++d) {
+            size_t out_stride = info._out_strides[d];
+            size_t coord = temp_idx / out_stride;
+            temp_idx %= out_stride;
+            input_offset += coord * info._in_strides[d];
+        }
+
+        // Copy the element
+        out_ptr[i] = in_ptr[input_offset];
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const std::vector<const void *> &inputs,
+    void *stream) const {
+
+    if (inputs.size() != 1) {
+        return INFINI_STATUS_BAD_PARAM;
+    }
+    const void *input = inputs[0];
+    auto dtype = _info.dtype();
+
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_I64:
+        cpu::calculate_cpu_impl<int64_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_I32:
+        cpu::calculate_cpu_impl<int32_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_U8:
+        cpu::calculate_cpu_impl<uint8_t>(_info, output, input);
+        break;
+    case INFINI_DTYPE_I8:
+        cpu::calculate_cpu_impl<int8_t>(_info, output, input);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::broadcast_to::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.h b/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.h
new file mode 100644
index 000000000..2edb89442
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/cpu/broadcast_to_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __BROADCAST_TO_CPU_H__
+#define __BROADCAST_TO_CPU_H__
+
+#include "../broadcast_to.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __BROADCAST_TO_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/broadcast_to/cuda/kernel.cuh b/src/infiniop/ops/broadcast_to/cuda/kernel.cuh
new file mode 100644
index 000000000..637f5ea7c
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/cuda/kernel.cuh
@@ -0,0 +1,53 @@
+#ifndef __BROADCAST_TO_CUDA_CUH__
+#define __BROADCAST_TO_CUDA_CUH__
+
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <cuda_runtime.h>
+
+#include <cstddef>
+#include <cstdint>
+
+namespace op::broadcast_to::cuda {
+
+// Maximum rank; must match the value in BroadcastToInfo
+static constexpr int MAX_DIM = 8;
+struct BroadcastStrides {
+    int64_t out_strides[MAX_DIM];
+    int64_t in_strides[MAX_DIM];
+};
+template <typename T>
+__global__ void broadcast_kernel(
+    T *__restrict__ output,      // Output data pointer
+    const T *__restrict__ input, // Input data pointer
+    int ndim,
+    size_t count,                // Total elements in output
+    BroadcastStrides strides) {  // Strides passed by value
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < count) {
+        size_t temp_idx = idx;
+        size_t input_offset = 0;
+
+        // Decompose the flat index into coordinates and accumulate the input offset
+#pragma unroll
+        for (int i = 0; i < MAX_DIM; ++i) {
+            if (i >= ndim) break;
+
+            int64_t out_s = strides.out_strides[i];
+            int64_t in_s = strides.in_strides[i];
+            size_t coord = temp_idx / out_s;
+            temp_idx %= out_s;
+            input_offset += coord * in_s;
+        }
+
+        output[idx] = input[input_offset];
+    }
+}
+
+} // namespace op::broadcast_to::cuda
+
+#endif // __BROADCAST_TO_CUDA_CUH__
\ No newline at end of file
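Note: every thread above runs the same flat-index decomposition; divide by the output stride to get the coordinate in each dimension, then re-project through the (possibly zero) input stride. A Python model of one thread's address computation, assuming a contiguous row-major output (illustrative):

    def input_offset(idx, out_strides, in_strides):
        off = 0
        for out_s, in_s in zip(out_strides, in_strides):
            coord, idx = divmod(idx, out_s)  # coordinate along this dimension
            off += coord * in_s              # stride 0 pins broadcast dims to index 0
        return off

    # Output (2, 3) with strides (3, 1); input (1, 3) aligned to strides (0, 1):
    assert input_offset(4, (3, 1), (0, 1)) == 1  # output[1][1] reads input[0][1]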
diff --git a/src/infiniop/ops/broadcast_to/info.h b/src/infiniop/ops/broadcast_to/info.h
new file mode 100644
index 000000000..130480c6c
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/info.h
@@ -0,0 +1,97 @@
+#ifndef __BROADCAST_TO_INFO_H__
+#define __BROADCAST_TO_INFO_H__
+
+#include "../../../utils.h"
+#include "../../tensor.h"
+#include <vector>
+#include <algorithm> // for std::max
+
+namespace op::broadcast_to {
+
+class BroadcastToInfo {
+    BroadcastToInfo() = default;
+
+public:
+    static constexpr int MAX_DIM = 8; // maximum rank, so fixed-size arrays can be used
+
+    int _dtype;
+    int _ndim; // unified rank (equals the output rank)
+    size_t _count;
+
+    // Aligned layout information consumed by the kernels
+    int64_t _out_shape[MAX_DIM];
+    int64_t _out_strides[MAX_DIM];
+    int64_t _in_shape[MAX_DIM];   // input shape, right-aligned and padded with 1s
+    int64_t _in_strides[MAX_DIM]; // input strides with broadcast dims set to stride 0
+
+    int dtype() const { return _dtype; }
+    int ndim() const { return _ndim; }
+    size_t count() const { return _count; }
+
+    // Constructor
+    BroadcastToInfo(int dtype, int ndim, size_t count)
+        : _dtype(dtype), _ndim(ndim), _count(count) {}
+
+    static utils::Result<BroadcastToInfo> create(
+        infiniopTensorDescriptor_t out_desc,
+        const std::vector<infiniopTensorDescriptor_t> &input_descs) {
+
+        if (input_descs.size() != 1) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+        auto input_desc = input_descs[0];
+
+        if (out_desc->dtype() != input_desc->dtype()) {
+            return INFINI_STATUS_BAD_TENSOR_DTYPE;
+        }
+        if (out_desc->ndim() < input_desc->ndim()) {
+            return INFINI_STATUS_BAD_TENSOR_SHAPE;
+        }
+        if (out_desc->ndim() > MAX_DIM) {
+            return INFINI_STATUS_BAD_PARAM;
+        }
+
+        BroadcastToInfo info(out_desc->dtype(), out_desc->ndim(), 0);
+
+        // 1. Count the output elements and copy the output layout
+        size_t count = 1;
+        for (int i = 0; i < info._ndim; ++i) {
+            info._out_shape[i] = out_desc->shape()[i];
+            info._out_strides[i] = out_desc->strides()[i];
+            count *= out_desc->shape()[i];
+        }
+        info._count = count;
+
+        // 2. Rank alignment and broadcast-rule checks
+        // Rank difference: e.g. out(2,3,4) vs. in(3,4) -> offset = 1
+        int offset = info._ndim - input_desc->ndim();
+
+        for (int i = 0; i < info._ndim; ++i) {
+            // i indexes the output dims; in_i is the matching input dim
+            int in_i = i - offset;
+
+            int64_t out_dim = info._out_shape[i];
+            int64_t in_dim = 1;    // pad with 1 where the input has no dim
+            int64_t in_stride = 0; // stride 0 goes with the padded 1
+
+            if (in_i >= 0) {
+                // The input defines this dimension
+                in_dim = input_desc->shape()[in_i];
+                in_stride = input_desc->strides()[in_i];
+            }
+
+            // Broadcast rule: sizes must match, or the input size must be 1
+            if (in_dim != out_dim && in_dim != 1) {
+                return INFINI_STATUS_BAD_TENSOR_SHAPE;
+            }
+
+            // Record the aligned layout
+            info._in_shape[i] = in_dim;
+
+            if (in_dim == 1 && out_dim > 1) {
+                info._in_strides[i] = 0;
+            } else {
+                info._in_strides[i] = in_stride;
+            }
+        }
+
+        return utils::Result<BroadcastToInfo>(info);
+    }
+};
+
+} // namespace op::broadcast_to
+
+#endif // __BROADCAST_TO_INFO_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.h b/src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.h
new file mode 100644
index 000000000..119686978
--- /dev/null
+++ b/src/infiniop/ops/broadcast_to/metax/broadcast_to_metax.h
@@ -0,0 +1,8 @@
+#ifndef __BROADCAST_TO_METAX_API_H__
+#define __BROADCAST_TO_METAX_API_H__
+
+#include "../broadcast_to.h"
+
+DESCRIPTOR(metax)
+
+#endif // __BROADCAST_TO_METAX_API_H__
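Note: the create() routine above can be summarized as: right-align the input shape under the output shape, pad missing leading dims with size 1 / stride 0, and zero the stride wherever a size-1 dim is stretched. A compact model of that rule (illustrative):

    def align_strides(out_shape, in_shape, in_strides):
        offset = len(out_shape) - len(in_shape)
        aligned = []
        for i, out_dim in enumerate(out_shape):
            j = i - offset
            in_dim, in_s = (in_shape[j], in_strides[j]) if j >= 0 else (1, 0)
            if in_dim != out_dim and in_dim != 1:
                raise ValueError("shape is not broadcastable")
            aligned.append(0 if (in_dim == 1 and out_dim > 1) else in_s)
        return aligned

    # out (2, 3, 4) vs. in (3, 1) with strides (1, 1) -> [0, 1, 0]
    assert align_strides((2, 3, 4), (3, 1), (1, 1)) == [0, 1, 0]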
================================================================== +// 1. Kernel 定义 +// ================================================================== + +// 最大维度需与 BroadcastToInfo::MAX_DIM 保持一致 +static constexpr int MAX_DIM = 8; + +struct BroadcastStrides { + int64_t out_strides[MAX_DIM]; + int64_t in_strides[MAX_DIM]; +}; + +template +__global__ void broadcast_kernel( + T * __restrict__ output, // Output data pointer + const T * __restrict__ input, // Input data pointer + int ndim, + size_t count, // Total elements in output + BroadcastStrides strides) { // Strides passed by value + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + size_t temp_idx = idx; + size_t input_offset = 0; +#pragma unroll + for (int i = 0; i < MAX_DIM; ++i) { + if (i >= ndim) break; + + int64_t out_s = strides.out_strides[i]; + int64_t in_s = strides.in_strides[i]; + size_t coord = temp_idx / out_s; + temp_idx %= out_s; + input_offset += coord * in_s; + } + + output[idx] = input[input_offset]; + } +} + +// ================================================================== +// 2. Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const BroadcastToInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + + auto mc_stream = reinterpret_cast(stream); + + BroadcastStrides strides; + for (int i = 0; i < BroadcastToInfo::MAX_DIM; ++i) { + strides.out_strides[i] = info._out_strides[i]; + strides.in_strides[i] = info._in_strides[i]; + } + + size_t count = info.count(); + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + if (grid_size == 0) grid_size = 1; + + broadcast_kernel + <<>>( + out_ptr, + in_ptr, + info.ndim(), + count, + strides + ); +} + +// ================================================================== +// 3. 
Descriptor 实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + const std::vector &input_descs) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = BroadcastToInfo::create(out_desc, input_descs); + if (!info_result) { + return info_result.status(); + } + + size_t workspace_size = 0; // broadcast_to 不需要额外 workspace + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const std::vector &inputs, + void *stream) const { + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + const void *input = inputs[0]; + + auto dtype = _info.dtype(); + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel<__half>(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_U8: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I8: + launch_kernel(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::broadcast_to::metax diff --git a/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.h b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.h new file mode 100644 index 000000000..c0401398a --- /dev/null +++ b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.h @@ -0,0 +1,8 @@ +#ifndef __BROADCAST_TO_MOORE_H__ +#define __BROADCAST_TO_MOORE_H__ + +#include "../broadcast_to.h" + +DESCRIPTOR(moore) + +#endif // __BROADCAST_TO_MOORE_H__ diff --git a/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.mu b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.mu new file mode 100644 index 000000000..4574737d3 --- /dev/null +++ b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore.mu @@ -0,0 +1,122 @@ +#include "broadcast_to_moore.h" +#include "broadcast_to_moore_kernel.h" +#include "../../../devices/moore/moore_handle.h" +#include +#include +#include + +namespace op::broadcast_to::moore { + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const BroadcastToInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + + auto musa_stream = reinterpret_cast(stream); + + // 复制 strides 到 Kernel 定义的结构体中 + op::broadcast_to::moore::BroadcastStrides strides; + for (int i = 0; i < BroadcastToInfo::MAX_DIM; ++i) { + strides.out_strides[i] = info._out_strides[i]; + strides.in_strides[i] = info._in_strides[i]; + } + + size_t count = info.count(); + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + + 
op::broadcast_to::moore::broadcast_kernel + <<>>( + out_ptr, + in_ptr, + info.ndim(), + count, + strides + ); +} + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + const std::vector &input_descs) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = BroadcastToInfo::create(out_desc, input_descs); + if (!info_result) return info_result.status(); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const std::vector &inputs, + void *stream) const { + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + const void *input = inputs[0]; + + auto dtype = _info.dtype(); + + // 3. 根据数据类型分发 Kernel + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16>(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_U8: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I8: + launch_kernel(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::broadcast_to::moore \ No newline at end of file diff --git a/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore_kernel.h b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore_kernel.h new file mode 100644 index 000000000..1e9a401d7 --- /dev/null +++ b/src/infiniop/ops/broadcast_to/moore/broadcast_to_moore_kernel.h @@ -0,0 +1,57 @@ +#ifndef __BROADCAST_TO_MOORE_KERNEL_H__ +#define __BROADCAST_TO_MOORE_KERNEL_H__ + +#include +#include +#include + +#include +#include +#include + +namespace op::broadcast_to::moore { + +// 最大维度定义,需与 BroadcastToInfo 中的保持一致 +static constexpr int MAX_DIM = 8; + +struct BroadcastStrides { + int64_t out_strides[MAX_DIM]; + int64_t in_strides[MAX_DIM]; +}; + +template +__global__ void broadcast_kernel( + T * __restrict__ output, // Output data pointer + const T * __restrict__ input, // Input data pointer + int ndim, + size_t count, // Total elements in output + BroadcastStrides strides) { // Strides passed by value + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + size_t temp_idx = idx; + size_t input_offset = 0; + + // 坐标变换与偏移计算 + // 将扁平的 output index 转换为多维坐标,再根据 input strides 计算 input offset + #pragma unroll + for (int i = 0; i < MAX_DIM; ++i) { + if (i >= ndim) break; + + int64_t out_s = strides.out_strides[i]; + int64_t in_s = strides.in_strides[i]; + + // 计算当前维度的坐标 + size_t coord = temp_idx / out_s; + temp_idx %= out_s; + input_offset += coord * in_s; + } + + output[idx] = input[input_offset]; + } +} + +} // namespace op::broadcast_to::moore + +#endif // __BROADCAST_TO_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cu 
b/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cu new file mode 100644 index 000000000..4a1f0c63f --- /dev/null +++ b/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cu @@ -0,0 +1,114 @@ +#include "broadcast_to_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../handle.h" +#include +#include +#include + +namespace op::broadcast_to::nvidia { + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const BroadcastToInfo& info, + void *stream) { + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + + auto cuda_stream = reinterpret_cast(stream); + op::broadcast_to::cuda::BroadcastStrides strides; + for (int i = 0; i < BroadcastToInfo::MAX_DIM; ++i) { + strides.out_strides[i] = info._out_strides[i]; + strides.in_strides[i] = info._in_strides[i]; + } + size_t count = info.count(); + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + op::broadcast_to::cuda::broadcast_kernel + <<>>( + out_ptr, + in_ptr, + info.ndim(), + count, + strides + ); +} + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + const std::vector &input_descs) { + auto info_result = BroadcastToInfo::create(out_desc, input_descs); + if (!info_result) return info_result.status(); + size_t workspace_size = 0; + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const std::vector &inputs, + void *stream) const { + + if (inputs.size() != 1) { + return INFINI_STATUS_BAD_PARAM; + } + const void *input = inputs[0]; + + auto dtype = _info.dtype(); + + // 3. 
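+    // Editor's sketch (assumed element types, mirroring the metax version
+    // of this dispatch): each case below instantiates launch_kernel for
+    // one concrete type, e.g.
+    //   case INFINI_DTYPE_F32:
+    //       launch_kernel<float>(output, input, _info, stream); break;
+    //   case INFINI_DTYPE_I64:
+    //       launch_kernel<int64_t>(output, input, _info, stream); break;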
根据数据类型分发 Kernel + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I64: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I32: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_U8: + launch_kernel(output, input, _info, stream); + break; + case INFINI_DTYPE_I8: + launch_kernel(output, input, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::broadcast_to::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cuh b/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cuh new file mode 100644 index 000000000..62069ca51 --- /dev/null +++ b/src/infiniop/ops/broadcast_to/nvidia/broadcast_to_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __BROADCAST_TO_NVIDIA_CUH__ +#define __BROADCAST_TO_NVIDIA_CUH__ + +#include "../broadcast_to.h" + +DESCRIPTOR(nvidia) + +#endif // __BROADCAST_TO_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/broadcast_to/operator.cc b/src/infiniop/ops/broadcast_to/operator.cc new file mode 100644 index 000000000..61e6a9dea --- /dev/null +++ b/src/infiniop/ops/broadcast_to/operator.cc @@ -0,0 +1,176 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/broadcast_to.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/broadcast_to_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/broadcast_to_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/broadcast_to_metax.h" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/broadcast_to_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateBroadcastToDescriptor( + infiniopHandle_t handle, + infiniopBroadcastToDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t y_desc, + infiniopTensorDescriptor_t x_desc) { + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::broadcast_to::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + y_desc, \ + {x_desc}) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 
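+// Editor's sketch of one macro expansion (illustrative):
+// CREATE(INFINI_DEVICE_CPU, cpu) above becomes
+//   case INFINI_DEVICE_CPU:
+//       return op::broadcast_to::cpu::Descriptor::create(
+//           handle,
+//           reinterpret_cast<op::broadcast_to::cpu::Descriptor **>(desc_ptr),
+//           y_desc, {x_desc});
+// i.e. one switch arm per enabled backend, with the macro #undef'd after use.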
获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetBroadcastToWorkspaceSize(infiniopBroadcastToDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopBroadcastTo( + infiniopBroadcastToDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *y, + const void *x, + void *stream) { + + // 注意:{x} 用于构造 std::vector + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, y, {x}, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + +// ======================================================================= +// 4. 
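+// Editor's note: the `{x}` argument above braced-initializes the one-element
+// std::vector<const void *> that Descriptor::calculate expects, bridging the
+// single-pointer C ABI and the vector-based C++ interface.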
销毁算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopDestroyBroadcastToDescriptor(infiniopBroadcastToDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.cc b/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.cc new file mode 100644 index 000000000..55e6c709a --- /dev/null +++ b/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.cc @@ -0,0 +1,140 @@ +#include "huber_loss_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include + +#include "../../../../utils/custom_types.h" + +namespace op::huber_loss::cpu { + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + auto handle = reinterpret_cast(handle_); + auto result = HuberLossInfo::create(out_desc, input_desc, target_desc, delta, reduction); + CHECK_RESULT(result); + + *desc_ptr = new Descriptor( + new Opaque(), + result.take(), + 0, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +template +void calculate_cpu_impl( + const HuberLossInfo &info, + void *output, + const void *input, + const void *target) { + + size_t count = info.count(); + float delta = info.delta(); + int reduction = info.reduction(); + + auto out_ptr = reinterpret_cast(output); + auto in_ptr = reinterpret_cast(input); + // Huber Loss 中 target 是数值,类型与 input 一致 + auto tar_ptr = reinterpret_cast(target); + float half_delta = 0.5f * delta; + + if (reduction == 0) { // None + #pragma omp parallel for schedule(static) + for (size_t i = 0; i < count; ++i) { + float val = utils::cast(in_ptr[i]); + float tgt = utils::cast(tar_ptr[i]); + + float diff = val - tgt; + float abs_diff = std::abs(diff); + float loss = 0.0f; + + if (abs_diff < delta) { + // 0.5 * (x - y)^2 + loss = 0.5f * diff * diff; + } else { + // delta * (|x - y| - 0.5 * delta) + loss = delta * (abs_diff - half_delta); + } + + out_ptr[i] = utils::cast(loss); + } + } else { // Mean or Sum + double total_loss = 0.0; + + #pragma omp parallel for reduction(+:total_loss) schedule(static) + for (size_t i = 0; i < count; ++i) { + float val = utils::cast(in_ptr[i]); + float tgt = utils::cast(tar_ptr[i]); + + float diff = val - tgt; + float abs_diff = std::abs(diff); + float loss = 0.0f; + + if (abs_diff < delta) { + loss = 0.5f * diff * diff; + } else { + loss = delta * (abs_diff - half_delta); + } + + total_loss += static_cast(loss); + } + + if (reduction == 1) { // Mean + total_loss /= static_cast(count); + } + + out_ptr[0] = 
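+        // Worked example (editor's addition): with delta = 1.0,
+        // input = 2.5, target = 0.0 -> |diff| = 2.5 >= delta, so
+        // loss = 1.0 * (2.5 - 0.5) = 2.0; with input = 0.3 the quadratic
+        // branch gives 0.5 * 0.3^2 = 0.045. The mean/sum result is
+        // accumulated in double and cast back to T exactly once: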
utils::cast<T>(static_cast<float>(total_loss));
+    }
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    const void *target,
+    void *stream) const {
+
+    auto dtype = _info.dtype();
+    switch (dtype) {
+    case INFINI_DTYPE_F32:
+        cpu::calculate_cpu_impl<float>(_info, output, input, target);
+        break;
+    case INFINI_DTYPE_F64:
+        cpu::calculate_cpu_impl<double>(_info, output, input, target);
+        break;
+    case INFINI_DTYPE_F16:
+        cpu::calculate_cpu_impl<fp16_t>(_info, output, input, target);
+        break;
+    case INFINI_DTYPE_BF16:
+        cpu::calculate_cpu_impl<bf16_t>(_info, output, input, target);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::huber_loss::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.h b/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.h
new file mode 100644
index 000000000..1a0b45978
--- /dev/null
+++ b/src/infiniop/ops/huber_loss/cpu/huber_loss_cpu.h
@@ -0,0 +1,8 @@
+#ifndef __HUBER_LOSS_CPU_H__
+#define __HUBER_LOSS_CPU_H__
+
+#include "../huber_loss.h"
+
+DESCRIPTOR(cpu)
+
+#endif // __HUBER_LOSS_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/huber_loss/cuda/kernel.cuh b/src/infiniop/ops/huber_loss/cuda/kernel.cuh
new file mode 100644
index 000000000..ec48d9a9e
--- /dev/null
+++ b/src/infiniop/ops/huber_loss/cuda/kernel.cuh
@@ -0,0 +1,106 @@
+#ifndef __HUBER_LOSS_CUDA_CUH__
+#define __HUBER_LOSS_CUDA_CUH__
+
+#include
+#include
+#include
+#include
+#include
+
+namespace op::huber_loss::cuda {
+
+__device__ __forceinline__ float warpReduceSum(float val) {
+    unsigned int mask = 0xffffffff;
+    for (int offset = warpSize / 2; offset > 0; offset /= 2)
+        val += __shfl_down_sync(mask, val, offset);
+    return val;
+}
+
+__device__ __forceinline__ float blockReduceSum(float val) {
+    static __shared__ float shared[32];
+    int lane = threadIdx.x % warpSize;
+    int wid = threadIdx.x / warpSize;
+
+    val = warpReduceSum(val);
+    if (lane == 0) shared[wid] = val;
+    __syncthreads();
+
+    val = (threadIdx.x < blockDim.x / warpSize) ?
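+    // Editor's note: with blockDim.x = 256 and warpSize = 32 there are 8
+    // warps, so only shared[0..7] hold valid partial sums after the first
+    // warpReduceSum; the ternary below zero-fills the remaining lanes of
+    // warp 0 before the final warp-level reduction.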
shared[lane] : 0.0f; + if (wid == 0) val = warpReduceSum(val); + return val; +} + +struct HuberLossFunctor { + float delta; + float half_delta; + + __host__ __device__ HuberLossFunctor(float delta_val) + : delta(delta_val), half_delta(0.5f * delta_val) {} + + __device__ __forceinline__ float compute(float input_val, float target_val) const { + float diff = input_val - target_val; + float abs_diff = std::abs(diff); + + if (abs_diff < delta) { + return 0.5f * diff * diff; + } else { + return delta * (abs_diff - half_delta); + } + } +}; + +template +__global__ void huber_loss_kernel( + T * __restrict__ output, + const T * __restrict__ input, + const T * __restrict__ target, + size_t count, + HuberLossFunctor functor) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + float in_val = static_cast(input[idx]); + float tg_val = static_cast(target[idx]); + + float loss = functor.compute(in_val, tg_val); + + output[idx] = static_cast(loss); + } +} + +template +__global__ void huber_loss_reduce_kernel( + float * output, + const T * __restrict__ input, + const T * __restrict__ target, + size_t count, + HuberLossFunctor functor, + float scale +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + float local_sum = 0.0f; + + for (size_t i = idx; i < count; i += stride) { + float in_val = static_cast(input[i]); + float tg_val = static_cast(target[i]); + + local_sum += functor.compute(in_val, tg_val); + } + + float block_sum = blockReduceSum(local_sum); + + if (threadIdx.x == 0) { + atomicAdd(output, block_sum * scale); + } +} + +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = static_cast(*src); +} + +} // namespace op::huber_loss::cuda + +#endif // __HUBER_LOSS_CUDA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/huber_loss.h b/src/infiniop/ops/huber_loss/huber_loss.h new file mode 100644 index 000000000..be602ca1d --- /dev/null +++ b/src/infiniop/ops/huber_loss/huber_loss.h @@ -0,0 +1,49 @@ +#ifndef __HUBER_LOSS_H__ +#define __HUBER_LOSS_H__ + +#include "../../operator.h" +#include "info.h" // 引用对应的 HuberLossInfo 定义 +#define DESCRIPTOR(NAMESPACE) \ + namespace op::huber_loss::NAMESPACE { \ + class Descriptor final : public InfiniopDescriptor { \ + struct Opaque; \ + Opaque *_opaque; \ + HuberLossInfo _info; \ + size_t _workspace_size; \ + \ + Descriptor( \ + Opaque *opaque, \ + HuberLossInfo info, \ + size_t workspace_size, \ + infiniDevice_t device_type, \ + int device_id) \ + : InfiniopDescriptor{device_type, device_id}, \ + _opaque(opaque), \ + _info(info), \ + _workspace_size(workspace_size) {} \ + \ + public: \ + ~Descriptor(); \ + \ + size_t workspaceSize() const { return _workspace_size; } \ + \ + static infiniStatus_t create( \ + infiniopHandle_t handle, \ + Descriptor **desc_ptr, \ + infiniopTensorDescriptor_t out_desc, \ + infiniopTensorDescriptor_t input_desc, \ + infiniopTensorDescriptor_t target_desc, \ + float delta, \ + int reduction); \ + \ + infiniStatus_t calculate( \ + void *workspace, \ + size_t workspace_size, \ + void *output, \ + const void *input, \ + const void *target, \ + void *stream) const; \ + }; \ + } + +#endif // __HUBER_LOSS_H__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/info.h b/src/infiniop/ops/huber_loss/info.h new file mode 100644 index 000000000..fcfe09b72 --- /dev/null +++ b/src/infiniop/ops/huber_loss/info.h @@ -0,0 +1,79 @@ +#ifndef __HUBER_LOSS_INFO_H__ +#define __HUBER_LOSS_INFO_H__ + +#include 
"../../../utils.h" +#include "../../tensor.h" + +namespace op::huber_loss { + +class HuberLossInfo { + HuberLossInfo() = default; + +public: + int _dtype; + float _delta; + int _reduction; + size_t _count; + + int dtype() const { return _dtype; } + float delta() const { return _delta; } + int reduction() const { return _reduction; } + size_t count() const { return _count; } + + HuberLossInfo(int dtype, float delta, int reduction, size_t count) + : _dtype(dtype), _delta(delta), _reduction(reduction), _count(count) {} + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + if (input_desc->ndim() != target_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + size_t total_count = input_desc->numel(); + if (target_desc->numel() != total_count) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + for (size_t i = 0; i < input_desc->ndim(); ++i) { + if (input_desc->shape()[i] != target_desc->shape()[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + if (input_desc->dtype() != target_desc->dtype() || + input_desc->dtype() != out_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + if (reduction == 0) { + if (out_desc->ndim() != input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + for (size_t i = 0; i < out_desc->ndim(); ++i) { + if (out_desc->shape()[i] != input_desc->shape()[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + } else { + if (out_desc->numel() != 1) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + } + + return utils::Result(HuberLossInfo{ + input_desc->dtype(), + delta, + reduction, + total_count + }); + } +}; + +} // namespace op::huber_loss + +#endif // __HUBER_LOSS_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/metax/huber_loss_metax.h b/src/infiniop/ops/huber_loss/metax/huber_loss_metax.h new file mode 100644 index 000000000..7091e9bef --- /dev/null +++ b/src/infiniop/ops/huber_loss/metax/huber_loss_metax.h @@ -0,0 +1,8 @@ +#ifndef __HUBER_LOSS_METAX_API_H__ +#define __HUBER_LOSS_METAX_API_H__ + +#include "../huber_loss.h" + +DESCRIPTOR(metax) + +#endif // __HUBER_LOSS_METAX_API_H__ diff --git a/src/infiniop/ops/huber_loss/metax/huber_loss_metax.maca b/src/infiniop/ops/huber_loss/metax/huber_loss_metax.maca new file mode 100644 index 000000000..81aad96f3 --- /dev/null +++ b/src/infiniop/ops/huber_loss/metax/huber_loss_metax.maca @@ -0,0 +1,273 @@ +#include "huber_loss_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" +#include + +#include +#include +#include +#include + +#include +#include +using nv_bfloat16 = __maca_bfloat16; +using nv_bfloat162 = __maca_bfloat162; + +namespace op::huber_loss::metax { + +// ================================================================== +// 1. Functor: 核心数学逻辑 +// ================================================================== +struct HuberLossFunctor { + float delta; + float half_delta; + + __host__ __device__ HuberLossFunctor(float delta_val) + : delta(delta_val), half_delta(0.5f * delta_val) {} + + __device__ __forceinline__ float compute(float input_val, float target_val) const { + float diff = input_val - target_val; + float abs_diff = fabsf(diff); + + if (abs_diff < delta) { + return 0.5f * diff * diff; + } else { + return delta * (abs_diff - half_delta); + } + } +}; + +// ================================================================== +// 2. 
Kernel 定义 +// ================================================================== + +// ------------------------------------------------------------------ +// Kernel 1: Elementwise 模式 (Reduction = None) +// 输出形状同 input/target,长度为 count +// ------------------------------------------------------------------ +template +__global__ void huber_loss_kernel( + T * __restrict__ output, // [count] + const T * __restrict__ input, // [count] + const T * __restrict__ target, // [count] + size_t count, // 元素个数 + HuberLossFunctor functor) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + float in_val = static_cast(input[idx]); + float tg_val = static_cast(target[idx]); + + float loss = functor.compute(in_val, tg_val); + + output[idx] = static_cast(loss); + } +} + +// ------------------------------------------------------------------ +// Kernel 2: Reduction 模式 (Mean / Sum) +// ------------------------------------------------------------------ +template +__global__ void huber_loss_reduce_kernel( + float * output, // [1] Accumulator (Float) + const T * __restrict__ input, // [count] + const T * __restrict__ target, // [count] + size_t count, // 元素个数 + HuberLossFunctor functor, + float scale // Mean: 1/count, Sum: 1.0 +) { + __shared__ volatile float shared_mem[256]; + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + float local_sum = 0.0f; + + // 1. Grid-Stride Loop 计算本线程负责的 loss 累加 + for (size_t i = idx; i < count; i += stride) { + float in_val = static_cast(input[i]); + float tg_val = static_cast(target[i]); + + local_sum += functor.compute(in_val, tg_val); + } + + // 2. 写入 shared memory + unsigned int tid = threadIdx.x; + if (tid < 256) { + shared_mem[tid] = local_sum; + } + __syncthreads(); + + // 3. Block 内树形归约 (Unrolled Reduction) + if (tid < 128) { shared_mem[tid] += shared_mem[tid + 128]; } __syncthreads(); + if (tid < 64) { shared_mem[tid] += shared_mem[tid + 64]; } __syncthreads(); + if (tid < 32) { shared_mem[tid] += shared_mem[tid + 32]; } __syncthreads(); + if (tid < 16) { shared_mem[tid] += shared_mem[tid + 16]; } __syncthreads(); + if (tid < 8) { shared_mem[tid] += shared_mem[tid + 8]; } __syncthreads(); + if (tid < 4) { shared_mem[tid] += shared_mem[tid + 4]; } __syncthreads(); + if (tid < 2) { shared_mem[tid] += shared_mem[tid + 2]; } __syncthreads(); + if (tid < 1) { shared_mem[tid] += shared_mem[tid + 1]; } __syncthreads(); + + // 4. Block 级结果累加到全局 output + if (tid == 0) { + float block_sum = shared_mem[0]; + atomicAdd(output, block_sum * scale); + } +} + +// ------------------------------------------------------------------ +// Kernel 3: 类型转换 (Float -> T) +// ------------------------------------------------------------------ +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = static_cast(*src); +} + +// ================================================================== +// 3. 
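+// Editor's note on the unrolled reduction above: the active-thread count
+// halves each step (128, 64, ..., 1), so shared_mem[0] ends up holding the
+// sum of all 256 per-thread partials; the __syncthreads() after every step
+// is conservative but keeps the volatile shared-memory traffic ordered.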
Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const void *target, + void* workspace, + const HuberLossInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto tar_ptr = reinterpret_cast(target); + auto out_ptr = reinterpret_cast(output); + + auto mc_stream = reinterpret_cast(stream); + + size_t count = info.count(); + int reduction = info.reduction(); + + HuberLossFunctor functor(info.delta()); + + // -------------------------------------------------------------- + // Mode 1: Elementwise (Reduction = None) + // -------------------------------------------------------------- + if (reduction == 0) { + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + if (grid_size == 0) grid_size = 1; + + huber_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, count, functor + ); + } + // -------------------------------------------------------------- + // Mode 2: Reduction (Mean / Sum) + // -------------------------------------------------------------- + else { + // workspace 用作浮点累加器 + float* acc_ptr = reinterpret_cast(workspace); + mcMemsetAsync(acc_ptr, 0, sizeof(float), mc_stream); + + float scale = (reduction == 1) + ? (1.0f / static_cast(count)) // reduction == 1: mean + : 1.0f; // reduction == 2: sum + + size_t block_size = 256; + size_t grid_size = std::min( + (count + block_size - 1) / block_size, + static_cast(1024) + ); + if (grid_size == 0) grid_size = 1; + + huber_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, count, functor, scale + ); + + // 将 float 标量结果转换回目标类型 T + cast_float_to_t + <<<1, 1, 0, mc_stream>>>(out_ptr, acc_ptr); + } +} + +// ================================================================== +// 4. 
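+// Editor's summary of the reduction path above: (1) mcMemsetAsync zeroes the
+// 4-byte float accumulator in workspace, (2) huber_loss_reduce_kernel
+// atomically adds each block's scaled partial sum, (3) cast_float_to_t turns
+// the float scalar into T with a single-thread kernel; all three are
+// enqueued on one stream, so no extra host synchronization is needed.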
Descriptor Implementation +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + } +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = HuberLossInfo::create( + out_desc, input_desc, target_desc, delta, reduction + ); + if (!info_result) { + return info_result.status(); + } + + // reduction 模式下需要一个 float workspace(用于 atomicAdd 累加) + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel<__half>(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, target, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::huber_loss::metax diff --git a/src/infiniop/ops/huber_loss/moore/huber_loss_moore.h b/src/infiniop/ops/huber_loss/moore/huber_loss_moore.h new file mode 100644 index 000000000..a3a2b4dde --- /dev/null +++ b/src/infiniop/ops/huber_loss/moore/huber_loss_moore.h @@ -0,0 +1,8 @@ +#ifndef __HUBER_LOSS_MOORE_H__ +#define __HUBER_LOSS_MOORE_H__ + +#include "../huber_loss.h" + +DESCRIPTOR(moore) + +#endif // __HUBER_LOSS_MOORE_H__ diff --git a/src/infiniop/ops/huber_loss/moore/huber_loss_moore.mu b/src/infiniop/ops/huber_loss/moore/huber_loss_moore.mu new file mode 100644 index 000000000..7d56380ba --- /dev/null +++ b/src/infiniop/ops/huber_loss/moore/huber_loss_moore.mu @@ -0,0 +1,153 @@ +#include "huber_loss_moore.h" +#include "huber_loss_moore_kernel.h" +#include "../../../devices/moore/moore_handle.h" +#include +#include + +namespace op::huber_loss::moore { + +template +static inline bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +// ================================================================== +// Kernel Launch Logic +// ================================================================== +template +void launch_kernel( + void *output, + const void *input, + const void *target, + void* workspace, + const HuberLossInfo& info, + void *stream) { + + // 1. 准备指针 + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + // Huber Loss 中 Target 类型与 Input 一致 + auto tar_ptr = reinterpret_cast(target); + + auto musa_stream = reinterpret_cast(stream); + + // 2. 
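+    // Editor's note: the reduce kernels cap grid_size at 1024 and use a
+    // grid-stride loop, so each thread may fold several elements into
+    // local_sum before the block-level reduction; this bounds the number
+    // of atomicAdd calls on the single float accumulator.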
准备参数 + size_t count = info.count(); + int reduction = info.reduction(); + + op::huber_loss::moore::HuberLossFunctor functor(info.delta()); + + // ------------------------------------------ + // 模式 1: Elementwise (Reduction = None) + // ------------------------------------------ + if (reduction == 0) { + // 每个线程处理一个元素 + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + + op::huber_loss::moore::huber_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, count, functor + ); + } + // ------------------------------------------ + // 模式 2: Reduction (Mean / Sum) + // ------------------------------------------ + else { + // 使用 workspace 作为临时的 float 累加器 + float* acc_ptr = reinterpret_cast(workspace); + musaMemsetAsync(acc_ptr, 0, sizeof(float), musa_stream); + + // 1=Mean, 2=Sum + float scale = (reduction == 1) ? (1.0f / static_cast(count)) : 1.0f; + + size_t block_size = 256; + size_t grid_size = std::min((count + block_size - 1) / block_size, static_cast(1024)); + + op::huber_loss::moore::huber_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, count, functor, scale + ); + + // 将 float 累加结果转回 T 写入 output + op::huber_loss::moore::cast_float_to_t + <<<1, 1, 0, musa_stream>>>(out_ptr, acc_ptr); + } +} + +// ================================================================== +// Descriptor 实现 +// ================================================================== +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle_, + Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + auto handle = reinterpret_cast(handle_); + + auto info_result = HuberLossInfo::create(out_desc, input_desc, target_desc, delta, reduction); + if (!info_result) return info_result.status(); + + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor( + new Opaque(), + info_result.take(), + workspace_size, + handle->device, + handle->device_id + ); + + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + // 检查 workspace 是否够用 + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel<__mt_bfloat16>(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, input, target, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::huber_loss::moore \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/moore/huber_loss_moore_kernel.h b/src/infiniop/ops/huber_loss/moore/huber_loss_moore_kernel.h new file mode 100644 index 000000000..d1e96879c --- /dev/null +++ b/src/infiniop/ops/huber_loss/moore/huber_loss_moore_kernel.h @@ -0,0 +1,141 @@ +#ifndef __HUBER_LOSS_MOORE_KERNEL_H__ +#define __HUBER_LOSS_MOORE_KERNEL_H__ + +#include +#include 
+#include +#include +#include +#include + +namespace op::huber_loss::moore { +template +__device__ __forceinline__ float to_float(T val) { + if constexpr (std::is_same_v) { + return __half2float(val); + } else if constexpr (std::is_same_v) { + return __bfloat162float(val); + } else { + return static_cast(val); + } +} + +template +__device__ __forceinline__ T from_float(float val) { + if constexpr (std::is_same_v) { + return __float2half(val); + } else if constexpr (std::is_same_v) { + return __float2bfloat16(val); + } else { + return static_cast(val); + } +} +__device__ __forceinline__ float warpReduceSum(float val) { + unsigned int mask = 0xffffffff; + for (int offset = warpSize / 2; offset > 0; offset /= 2) + val += __shfl_down_sync(mask, val, offset); + return val; +} + +__device__ __forceinline__ float blockReduceSum(float val) { + static __shared__ float shared[32]; // Max 1024 threads / 32 warps + int lane = threadIdx.x % warpSize; + int wid = threadIdx.x / warpSize; + + val = warpReduceSum(val); + if (lane == 0) shared[wid] = val; + __syncthreads(); + val = (threadIdx.x < blockDim.x / warpSize) ? shared[lane] : 0.0f; + if (wid == 0) val = warpReduceSum(val); + return val; +} + +struct HuberLossFunctor { + float delta; + float half_delta; // 预计算 0.5 * delta + + __host__ __device__ HuberLossFunctor(float delta_val) + : delta(delta_val), half_delta(0.5f * delta_val) {} + + // Huber Loss 计算: + // if |x - y| < delta: 0.5 * (x - y)^2 + // else: delta * (|x - y| - 0.5 * delta) + __device__ __forceinline__ float compute(float input_val, float target_val) const { + float diff = input_val - target_val; + float abs_diff = std::abs(diff); + + if (abs_diff < delta) { + return 0.5f * diff * diff; + } else { + return delta * (abs_diff - half_delta); + } + } +}; + +// ================================================================== +// Kernel 1: Reduction = None (Element-wise output) +// ================================================================== +template +__global__ void huber_loss_kernel( + T * __restrict__ output, // [N] + const T * __restrict__ input, // [N] + const T * __restrict__ target, // [N] + size_t count, // Total elements (numel) + HuberLossFunctor functor) { + + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + + if (idx < count) { + float in_val = to_float(input[idx]); + float tg_val = to_float(target[idx]); + + float loss = functor.compute(in_val, tg_val); + + output[idx] = from_float(loss); + } +} + +// ================================================================== +// Kernel 2: Reduction = Mean / Sum (Scalar output) +// ================================================================== +template +__global__ void huber_loss_reduce_kernel( + float * output, // [1] Accumulator (Float) + const T * __restrict__ input, // [N] + const T * __restrict__ target, // [N] + size_t count, // Total elements + HuberLossFunctor functor, + float scale // Mean模式传 1/N, Sum模式传 1.0 +) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + size_t stride = blockDim.x * gridDim.x; + float local_sum = 0.0f; + + // Grid-Stride Loop over all elements + for (size_t i = idx; i < count; i += stride) { + float in_val = to_float(input[i]); + float tg_val = to_float(target[i]); + + local_sum += functor.compute(in_val, tg_val); + } + + // Block Reduction + float block_sum = blockReduceSum(local_sum); + + // Global Atomic Add (Reduce to scalar) + if (threadIdx.x == 0) { + atomicAdd(output, block_sum * scale); + } +} + +// ================================================================== +// 
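+// Editor's recap of HuberLossFunctor::compute above:
+//   loss(x, y) = 0.5 * (x - y)^2                  if |x - y| < delta
+//              = delta * (|x - y| - 0.5 * delta)  otherwise
+// The two branches meet with equal value and slope at |x - y| = delta,
+// which is what makes the Huber loss once-differentiable everywhere.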
Helper: Cast float result to T (used for scalar output) +// ================================================================== +template +__global__ void cast_float_to_t(T* output, const float* src) { + *output = from_float(*src); +} + +} // namespace op::huber_loss::moore + +#endif // __HUBER_LOSS_MOORE_KERNEL_H__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cu b/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cu new file mode 100644 index 000000000..4f1d9bd59 --- /dev/null +++ b/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cu @@ -0,0 +1,119 @@ +#include "huber_loss_nvidia.cuh" +#include "../cuda/kernel.cuh" +#include "../../../handle.h" +#include +#include + +namespace op::huber_loss::nvidia { +template +static inline bool is_aligned(const void *ptr, size_t alignment) { + return reinterpret_cast(ptr) % alignment == 0; +} + +template +void launch_kernel( + void *output, + const void *input, + const void *target, + void* workspace, + const HuberLossInfo& info, + void *stream) { + + auto in_ptr = reinterpret_cast(input); + auto out_ptr = reinterpret_cast(output); + auto tar_ptr = reinterpret_cast(target); + + auto cuda_stream = reinterpret_cast(stream); + + size_t count = info.count(); + int reduction = info.reduction(); + + op::huber_loss::cuda::HuberLossFunctor functor(info.delta()); + + if (reduction == 0) { + size_t block_size = 256; + size_t grid_size = (count + block_size - 1) / block_size; + + op::huber_loss::cuda::huber_loss_kernel + <<>>( + out_ptr, in_ptr, tar_ptr, count, functor + ); + } + else { + float* acc_ptr = reinterpret_cast(workspace); + cudaMemsetAsync(acc_ptr, 0, sizeof(float), cuda_stream); + float scale = (reduction == 1) ? (1.0f / static_cast(count)) : 1.0f; + + size_t block_size = 256; + size_t grid_size = std::min((count + block_size - 1) / block_size, static_cast(1024)); + + op::huber_loss::cuda::huber_loss_reduce_kernel + <<>>( + acc_ptr, in_ptr, tar_ptr, count, functor, scale + ); + op::huber_loss::cuda::cast_float_to_t + <<<1, 1, 0, cuda_stream>>>(out_ptr, acc_ptr); + } +} + +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) delete _opaque; +} + +infiniStatus_t Descriptor::create( + infiniopHandle_t handle, Descriptor **desc_ptr, + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + infiniopTensorDescriptor_t target_desc, + float delta, + int reduction) { + + auto info_result = HuberLossInfo::create(out_desc, input_desc, target_desc, delta, reduction); + if (!info_result) return info_result.status(); + size_t workspace_size = 0; + if (reduction != 0) { + workspace_size = sizeof(float); + } + + *desc_ptr = new Descriptor(new Opaque(), info_result.take(), workspace_size, handle->device, handle->device_id); + return INFINI_STATUS_SUCCESS; +} + +infiniStatus_t Descriptor::calculate( + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + void *stream) const { + + auto dtype = _info.dtype(); + int reduction = _info.reduction(); + + if (reduction != 0 && workspace_size < sizeof(float)) { + return INFINI_STATUS_INSUFFICIENT_WORKSPACE; + } + + switch (dtype) { + case INFINI_DTYPE_F16: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_BF16: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F32: + launch_kernel(output, input, target, workspace, _info, stream); + break; + case INFINI_DTYPE_F64: + launch_kernel(output, 
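+        // (editor's note: even for double inputs the reduce path
+        //  accumulates into a float workspace scalar, so mean/sum results
+        //  in F64 mode are limited to float precision)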
input, target, workspace, _info, stream); + break; + default: + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + + return INFINI_STATUS_SUCCESS; +} + +} // namespace op::huber_loss::nvidia \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cuh b/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cuh new file mode 100644 index 000000000..90f76b2b1 --- /dev/null +++ b/src/infiniop/ops/huber_loss/nvidia/huber_loss_nvidia.cuh @@ -0,0 +1,8 @@ +#ifndef __HUBER_LOSS_NVIDIA_CUH__ +#define __HUBER_LOSS_NVIDIA_CUH__ + +#include "../huber_loss.h" + +DESCRIPTOR(nvidia) + +#endif // __HUBER_LOSS_NVIDIA_CUH__ \ No newline at end of file diff --git a/src/infiniop/ops/huber_loss/operator.cc b/src/infiniop/ops/huber_loss/operator.cc new file mode 100644 index 000000000..7db651f19 --- /dev/null +++ b/src/infiniop/ops/huber_loss/operator.cc @@ -0,0 +1,180 @@ +#include "../../operator.h" +#include "../../handle.h" +#include "infiniop/ops/huber_loss.h" + +// --- 后端实现头文件 --- +#ifdef ENABLE_CPU_API +#include "cpu/huber_loss_cpu.h" +#endif +#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API) +#include "nvidia/huber_loss_nvidia.cuh" +#endif + +#ifdef ENABLE_METAX_API +#include "metax/huber_loss_metax.h" +#endif + +#ifdef ENABLE_MOORE_API +#include "moore/huber_loss_moore.h" +#endif + +extern "C" { + +// ======================================================================= +// 1. 创建算子描述符 +// ======================================================================= +__C infiniStatus_t infiniopCreateHuberLossDescriptor( + infiniopHandle_t handle, + infiniopHuberLossDescriptor_t *desc_ptr, + infiniopTensorDescriptor_t output, + infiniopTensorDescriptor_t input, + infiniopTensorDescriptor_t target, + float delta, + int reduction) { + + #define CREATE(CASE, NAMESPACE) \ + case CASE: \ + return op::huber_loss::NAMESPACE::Descriptor::create( \ + handle, \ + reinterpret_cast(desc_ptr), \ + output, \ + input, \ + target, \ + delta, \ + reduction) + + switch (handle->device) { + #ifdef ENABLE_CPU_API + CREATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CREATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CREATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CREATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CREATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CREATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CREATE +} + +// ======================================================================= +// 2. 
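+// Editor's usage sketch (hypothetical handles, error handling elided):
+//   infiniopHuberLossDescriptor_t desc;
+//   infiniopCreateHuberLossDescriptor(handle, &desc, out_d, in_d, tgt_d,
+//                                     /*delta=*/1.0f, /*reduction=*/1);
+//   size_t ws_size = 0;
+//   infiniopGetHuberLossWorkspaceSize(desc, &ws_size);
+//   infiniopHuberLoss(desc, ws_ptr, ws_size, out, in, tgt, stream);
+//   infiniopDestroyHuberLossDescriptor(desc);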
获取 Workspace 大小 +// ======================================================================= +__C infiniStatus_t infiniopGetHuberLossWorkspaceSize(infiniopHuberLossDescriptor_t desc, size_t *size) { + + #define GET(CASE, NAMESPACE) \ + case CASE: \ + *size = reinterpret_cast(desc)->workspaceSize(); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + GET(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + GET(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + GET(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + GET(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + GET(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + GET(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef GET +} + +// ======================================================================= +// 3. 执行计算 (Calculate) +// ======================================================================= +__C infiniStatus_t infiniopHuberLoss( + infiniopHuberLossDescriptor_t desc, + void *workspace, + size_t workspace_size, + void *output, + const void *input, + const void *target, + void *stream) { + + #define CALCULATE(CASE, NAMESPACE) \ + case CASE: \ + return reinterpret_cast(desc) \ + ->calculate(workspace, workspace_size, output, input, target, stream) + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + CALCULATE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + CALCULATE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + CALCULATE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + CALCULATE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + CALCULATE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef CALCULATE +} + +__C infiniStatus_t infiniopDestroyHuberLossDescriptor(infiniopHuberLossDescriptor_t desc) { + + #define DELETE(CASE, NAMESPACE) \ + case CASE: \ + delete reinterpret_cast(desc); \ + return INFINI_STATUS_SUCCESS + + switch (desc->device_type) { + #ifdef ENABLE_CPU_API + DELETE(INFINI_DEVICE_CPU, cpu); + #endif + #ifdef ENABLE_NVIDIA_API + DELETE(INFINI_DEVICE_NVIDIA, nvidia); + #endif + #ifdef ENABLE_ILUVATAR_API + DELETE(INFINI_DEVICE_ILUVATAR, nvidia); + #endif + #ifdef ENABLE_QY_API + DELETE(INFINI_DEVICE_QY, nvidia); + #endif + #ifdef ENABLE_METAX_API + DELETE(INFINI_DEVICE_METAX, metax); + #endif + #ifdef ENABLE_MOORE_API + DELETE(INFINI_DEVICE_MOORE, moore); + #endif + default: + return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; + } + #undef DELETE +} + +} // extern "C" \ No newline at end of file diff --git a/src/infiniop/ops/softplus/cpu/softplus_cpu.cc b/src/infiniop/ops/softplus/cpu/softplus_cpu.cc index 4272bc37d..e41ab4e47 100644 --- a/src/infiniop/ops/softplus/cpu/softplus_cpu.cc +++ b/src/infiniop/ops/softplus/cpu/softplus_cpu.cc @@ -1,52 +1,135 @@ #include "softplus_cpu.h" +#include "../../../devices/cpu/common_cpu.h" +#include +#include +#include +#include +#include + +#include "../../../../utils/custom_types.h" namespace op::softplus::cpu { -Descriptor::~Descriptor() = default; +struct Descriptor::Opaque {}; + +Descriptor::~Descriptor() { + if (_opaque) { + delete _opaque; + _opaque = nullptr; + } +} infiniStatus_t Descriptor::create( infiniopHandle_t handle_, Descriptor **desc_ptr, infiniopTensorDescriptor_t out_desc, - std::vector 
<infiniopTensorDescriptor_t> input_desc_vec) {
+    infiniopTensorDescriptor_t input_desc,
+    float beta,
+    float threshold) {
 
     auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
+    auto result = SoftplusInfo::create(out_desc, input_desc, beta, threshold);
+
+    if (!result) {
+        return result.status();
+    }
+
+    // 2. Create the descriptor
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+// -----------------------------------------------------------------------
+// Core computation: supports strided (non-contiguous) memory
+// -----------------------------------------------------------------------
+template <typename T>
+void calculate_cpu_impl(
+    const SoftplusInfo &info,
+    void *output,
+    const void *input) {
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
+    size_t num_elements = info.num_elements();
+    float beta = info.beta();
+    float threshold = info.threshold();
+
+    // Fetch the memory-layout info (relies on the updated SoftplusInfo)
+    bool is_contiguous = info.is_contiguous();
+    int ndim = info.ndim();
+    const auto& shape = info.shape();
+    const auto& strides = info.strides();
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
+    auto out_ptr = reinterpret_cast<T *>(output);
+    auto in_ptr = reinterpret_cast<const T *>(input);
-    CHECK_SAME_SHAPE(y_shape, x_shape);
+    #pragma omp parallel for schedule(static)
+    for (size_t i = 0; i < num_elements; ++i) {
+        // 1. Compute the input offset
+        size_t input_offset = i; // defaults to the linear index
-    // create CPU elementwise descriptor
-    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+        if (!is_contiguous) {
+            // Non-contiguous memory needs a coordinate transform:
+            // linear index -> coordinates -> physical offset
+            input_offset = 0;
+            size_t temp_idx = i;
+            for (int d = ndim - 1; d >= 0; --d) {
+                size_t dim_size = shape[d];
+                size_t coord = temp_idx % dim_size;
+                temp_idx /= dim_size;
+                input_offset += coord * strides[d];
+            }
+        }
+        using CalcType = std::conditional_t<std::is_same_v<T, double>, double, float>;
+
+        CalcType x = utils::cast<CalcType>(in_ptr[input_offset]);
+        CalcType b = static_cast<CalcType>(beta);
+        CalcType t = static_cast<CalcType>(threshold);
+
+        CalcType bx = b * x;
+        CalcType result;
-    return INFINI_STATUS_SUCCESS;
+        // 3. Compute softplus
+        if (bx > t) {
+            result = x;
+        } else {
+            result = std::log1p(std::exp(bx)) / b;
+        }
+        out_ptr[i] = utils::cast<T>(result);
+    }
 }
 
 infiniStatus_t Descriptor::calculate(
     void *workspace,
     size_t workspace_size,
     void *output,
-    std::vector<const void *> inputs,
+    const void *input,
     void *stream) const {
+
+    auto dtype = _info.dtype();
 
-    switch (_dtype) {
-    case INFINI_DTYPE_F16:
-        return _device_info->calculate<SoftplusOp, fp16_t>(_info, output, inputs, stream);
+    switch (dtype) {
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<SoftplusOp, float>(_info, output, inputs, stream);
+        calculate_cpu_impl<float>(_info, output, input);
+        break;
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<SoftplusOp, double>(_info, output, inputs, stream);
+        calculate_cpu_impl<double>(_info, output, input);
+        break;
+    case INFINI_DTYPE_F16:
+        calculate_cpu_impl<fp16_t>(_info, output, input);
+        break;
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<SoftplusOp, bf16_t>(_info, output, inputs, stream);
+        calculate_cpu_impl<bf16_t>(_info, output, input);
+        break;
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::softplus::cpu
+
+} // namespace op::softplus::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/cpu/softplus_cpu.h b/src/infiniop/ops/softplus/cpu/softplus_cpu.h
index ce00fe1a3..9ab0b7ed1 100644
--- a/src/infiniop/ops/softplus/cpu/softplus_cpu.h
+++ b/src/infiniop/ops/softplus/cpu/softplus_cpu.h
@@ -1,23 +1,8 @@
 #ifndef __SOFTPLUS_CPU_H__
 #define __SOFTPLUS_CPU_H__
 
-#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include "../softplus.h"
 
-ELEMENTWISE_DESCRIPTOR(softplus, cpu)
+DESCRIPTOR(cpu)
 
-namespace op::softplus::cpu {
-typedef struct SoftplusOp {
-public:
-    static constexpr size_t num_inputs = 1;
-    template <typename T>
-    T operator()(const T &x) const {
-        if (x > T(20)) {
-            return x;
-        } else {
-            return std::log(T(1) + std::exp(x));
-        }
-    }
-} SoftplusOp;
-} // namespace op::softplus::cpu
-
-#endif // __SOFTPLUS_CPU_H__
+#endif // __SOFTPLUS_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/cuda/kernel.cuh b/src/infiniop/ops/softplus/cuda/kernel.cuh
index 33cef97db..c045dbabf 100644
--- a/src/infiniop/ops/softplus/cuda/kernel.cuh
+++ b/src/infiniop/ops/softplus/cuda/kernel.cuh
@@ -1,34 +1,66 @@
 #ifndef __SOFTPLUS_CUDA_H__
 #define __SOFTPLUS_CUDA_H__
 
+#include
+#include
+#include
+#include
+
 namespace op::softplus::cuda {
+
 typedef struct SoftplusOp {
 public:
     static constexpr size_t num_inputs = 1;
     template <typename T>
-    __device__ __forceinline__ T operator()(const T &x) const {
+    __device__ __forceinline__ T operator()(const T &x, float beta, float threshold) const {
+
         if constexpr (std::is_same_v<T, half>) {
-            // promote to float for stability, then cast back
             float xf = __half2float(x);
-            float out = (xf > 20.0f) ? xf : log1pf(expf(xf));
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta;
             return __float2half(out);
-        } else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
+        }
+        else if constexpr (std::is_same_v<T, cuda_bfloat16>) {
             float xf = __bfloat162float(x);
-            float out = (xf > 20.0f) ? xf : log1pf(expf(xf));
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta;
             return __float2bfloat16(out);
-        } else if constexpr (std::is_same_v<T, half2>) {
-            // process as two lanes
+        }
+        else if constexpr (std::is_same_v<T, half2>) {
             float2 xf = __half22float2(x);
-            xf.x = (xf.x > 20.0f) ? xf.x : log1pf(expf(xf.x));
-            xf.y = (xf.y > 20.0f) ? xf.y : log1pf(expf(xf.y));
-            return __floats2half2_rn(xf.x, xf.y);
-        } else {
-            // default: float, double, etc.
-            return (x > T(20)) ?
x : log1p(exp(x)); + float2 out; + + float bx_x = beta * xf.x; + out.x = (bx_x > threshold) ? xf.x : log1pf(expf(bx_x)) / beta; + + float bx_y = beta * xf.y; + out.y = (bx_y > threshold) ? xf.y : log1pf(expf(bx_y)) / beta; + + return __floats2half2_rn(out.x, out.y); + } + else { + using CalcType = std::conditional_t, double, float>; + + CalcType x_val = static_cast(x); + CalcType b_val = static_cast(beta); + CalcType t_val = static_cast(threshold); + + CalcType bx = b_val * x_val; + + if (bx > t_val) { + return static_cast(x_val); + } else { + if constexpr (std::is_same_v) { + return static_cast(::log1p(::exp(bx)) / b_val); + } else { + return static_cast(::log1pf(::expf(bx)) / b_val); + } + } } } } SoftplusOp; + } // namespace op::softplus::cuda -#endif // __SOFTPLUS_CUDA_H__ +#endif // __SOFTPLUS_CUDA_H__ \ No newline at end of file diff --git a/src/infiniop/ops/softplus/info.h b/src/infiniop/ops/softplus/info.h new file mode 100644 index 000000000..2eddf2864 --- /dev/null +++ b/src/infiniop/ops/softplus/info.h @@ -0,0 +1,106 @@ +#ifndef __SOFTPLUS_INFO_H__ +#define __SOFTPLUS_INFO_H__ + +#include "../../../utils.h" +#include "../../tensor.h" +#include +#include // for std::equal + +namespace op::softplus { + +class SoftplusInfo { + SoftplusInfo() = default; + +public: + int _dtype; // 数据类型 + float _beta; // 缩放参数 + float _threshold; // 阈值参数 + size_t _num_elements; // 元素总数 + + // [新增] 内存布局信息 + bool _is_contiguous; // 是否内存连续 + std::vector _shape; // 形状 + std::vector _strides; // 步长 + + int dtype() const { return _dtype; } + float beta() const { return _beta; } + float threshold() const { return _threshold; } + size_t num_elements() const { return _num_elements; } + + // [新增] Getters + bool is_contiguous() const { return _is_contiguous; } + const std::vector& shape() const { return _shape; } + const std::vector& strides() const { return _strides; } + int ndim() const { return _shape.size(); } + + // 构造函数 + SoftplusInfo(int dtype, float beta, float threshold, size_t num_elements, + bool is_contiguous, std::vector shape, std::vector strides) + : _dtype(dtype), _beta(beta), _threshold(threshold), _num_elements(num_elements), + _is_contiguous(is_contiguous), _shape(std::move(shape)), _strides(std::move(strides)) {} + + static utils::Result create( + infiniopTensorDescriptor_t out_desc, + infiniopTensorDescriptor_t input_desc, + float beta, + float threshold) { + + if (out_desc->dtype() != input_desc->dtype()) { + return INFINI_STATUS_BAD_TENSOR_DTYPE; + } + if (out_desc->ndim() != input_desc->ndim()) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + + // 1. 检查形状一致性并计算元素总数 + // 注意:这里假设 shape() 返回的是支持下标访问的容器 (如 vector) + auto out_shape = out_desc->shape(); + auto in_shape = input_desc->shape(); + size_t num_elements = 1; + + for (size_t i = 0; i < input_desc->ndim(); ++i) { + if (out_shape[i] != in_shape[i]) { + return INFINI_STATUS_BAD_TENSOR_SHAPE; + } + num_elements *= in_shape[i]; + } + + // 2. [关键修复] 提取形状和步长 + // input_desc->shape() 返回 vector,必须用迭代器初始化,不能用指针加法 + auto in_strides = input_desc->strides(); + + // 使用迭代器区间构造,同时自动处理从 uint64 到 int64 的隐式转换 + std::vector shape(in_shape.begin(), in_shape.end()); + std::vector strides(in_strides.begin(), in_strides.end()); + + int ndim = shape.size(); + + // 3. 
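+        // Editor's worked example: shape [2, 3] with strides [3, 1] passes
+        // the check below (expected strides built right-to-left: 1, then
+        // 3); a transposed view with strides [1, 2] fails at d = 1 and
+        // takes the strided kernel path instead.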
检查连续性 + // 从后往前检查:stride[i] == stride[i+1] * shape[i+1] + bool is_contiguous = true; + int64_t expected_stride = 1; + for (int i = ndim - 1; i >= 0; --i) { + if (shape[i] > 1) { + if (strides[i] != expected_stride) { + is_contiguous = false; + break; + } + expected_stride *= shape[i]; + } + } + + return utils::Result(SoftplusInfo{ + input_desc->dtype(), // _dtype + beta, // _beta + threshold, // _threshold + num_elements, // _num_elements + is_contiguous, // _is_contiguous + shape, // _shape + strides // _strides + }); + } +}; + +} // namespace op::softplus + +#endif // __SOFTPLUS_INFO_H__ \ No newline at end of file diff --git a/src/infiniop/ops/softplus/metax/softplus_metax.h b/src/infiniop/ops/softplus/metax/softplus_metax.h index 8da2b4d76..c0d1c7a13 100644 --- a/src/infiniop/ops/softplus/metax/softplus_metax.h +++ b/src/infiniop/ops/softplus/metax/softplus_metax.h @@ -1,8 +1,8 @@ #ifndef __SOFTPLUS_METAX_API_H__ #define __SOFTPLUS_METAX_API_H__ -#include "../../../elementwise/metax/elementwise_metax_api.h" +#include "../softplus.h" -ELEMENTWISE_DESCRIPTOR(softplus, metax) +DESCRIPTOR(metax) #endif // __SOFTPLUS_METAX_API_H__ diff --git a/src/infiniop/ops/softplus/metax/softplus_metax.maca b/src/infiniop/ops/softplus/metax/softplus_metax.maca index 5744f8c04..16cbd6295 100644 --- a/src/infiniop/ops/softplus/metax/softplus_metax.maca +++ b/src/infiniop/ops/softplus/metax/softplus_metax.maca @@ -1,32 +1,186 @@ #include "softplus_metax.h" +#include "../../../devices/metax/metax_common.h" +#include "../../../devices/metax/metax_handle.h" -#include "../../../elementwise/metax/elementwise_metax.h" +#include +#include +#include -#include "../cuda/kernel.cuh" +#include +#include +#include +#include + +using nv_bfloat16 = __maca_bfloat16; +using nv_bfloat162 = __maca_bfloat162; namespace op::softplus::metax { -Descriptor::~Descriptor() = default; +struct SoftplusOp { + static constexpr size_t num_inputs = 1; + + template + __device__ __forceinline__ T operator()(const T &x, float beta, float threshold) const { + if constexpr (std::is_same_v) { + float xf = __half2float(x); + float bx = beta * xf; + float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta; + return __float2half(out); + } else if constexpr (std::is_same_v) { + float xf = __bfloat162float(x); + float bx = beta * xf; + float out = (bx > threshold) ? 
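+            // (editor's note: softplus(x) = log(1 + exp(beta * x)) / beta;
+            //  once beta * x > threshold the result is within rounding of
+            //  x itself, so returning x directly avoids expf() overflow)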
diff --git a/src/infiniop/ops/softplus/metax/softplus_metax.h b/src/infiniop/ops/softplus/metax/softplus_metax.h
index 8da2b4d76..c0d1c7a13 100644
--- a/src/infiniop/ops/softplus/metax/softplus_metax.h
+++ b/src/infiniop/ops/softplus/metax/softplus_metax.h
@@ -1,8 +1,8 @@
 #ifndef __SOFTPLUS_METAX_API_H__
 #define __SOFTPLUS_METAX_API_H__
 
-#include "../../../elementwise/metax/elementwise_metax_api.h"
+#include "../softplus.h"
 
-ELEMENTWISE_DESCRIPTOR(softplus, metax)
+DESCRIPTOR(metax)
 
 #endif // __SOFTPLUS_METAX_API_H__
diff --git a/src/infiniop/ops/softplus/metax/softplus_metax.maca b/src/infiniop/ops/softplus/metax/softplus_metax.maca
index 5744f8c04..16cbd6295 100644
--- a/src/infiniop/ops/softplus/metax/softplus_metax.maca
+++ b/src/infiniop/ops/softplus/metax/softplus_metax.maca
@@ -1,32 +1,186 @@
 #include "softplus_metax.h"
+#include "../../../devices/metax/metax_common.h"
+#include "../../../devices/metax/metax_handle.h"
 
-#include "../../../elementwise/metax/elementwise_metax.h"
+#include
+#include
+#include
 
-#include "../cuda/kernel.cuh"
+#include
+#include
+#include
+#include
+
+using nv_bfloat16 = __maca_bfloat16;
+using nv_bfloat162 = __maca_bfloat162;
 
 namespace op::softplus::metax {
 
-Descriptor::~Descriptor() = default;
+struct SoftplusOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, float beta, float threshold) const {
+        if constexpr (std::is_same_v<T, __half>) {
+            float xf = __half2float(x);
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta;
+            return __float2half(out);
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            float xf = __bfloat162float(x);
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : log1pf(expf(bx)) / beta;
+            return __float2bfloat16(out);
+        } else {
+            using CalcType = std::conditional_t<std::is_same_v<T, double>, double, float>;
+
+            CalcType x_val = static_cast<CalcType>(x);
+            CalcType b_val = static_cast<CalcType>(beta);
+            CalcType t_val = static_cast<CalcType>(threshold);
+
+            CalcType bx = b_val * x_val;
+
+            if (bx > t_val) {
+                return static_cast<T>(x_val);
+            } else {
+                if constexpr (std::is_same_v<T, double>) {
+                    return static_cast<T>(::log1p(::exp(bx)) / b_val);
+                } else {
+                    return static_cast<T>(::log1pf(::expf(bx)) / b_val);
+                }
+            }
+        }
+    }
+};
+
+static constexpr int MAX_DIMS = 8;
+
+struct TensorMetadata {
+    int ndim;
+    int64_t shape[MAX_DIMS];
+    int64_t strides[MAX_DIMS];
+};
+
+template <typename T>
+__global__ void softplus_kernel_contiguous(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        SoftplusOp functor;
+        output[idx] = functor(input[idx], beta, threshold);
+    }
+}
+
+template <typename T>
+__global__ void softplus_kernel_strided(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold,
+    TensorMetadata meta) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        size_t input_offset = 0;
+        size_t temp_idx = idx;
+
+#pragma unroll
+        for (int d = meta.ndim - 1; d >= 0; --d) {
+            size_t dim_size = meta.shape[d];
+            size_t coord = temp_idx % dim_size;
+            temp_idx /= dim_size;
+            input_offset += coord * meta.strides[d];
+        }
+
+        SoftplusOp functor;
+        output[idx] = functor(input[input_offset], beta, threshold);
+    }
+}
+
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const SoftplusInfo &info,
+    void *stream) {
+
+    size_t n = info.num_elements();
+    auto mc_stream = reinterpret_cast<mcStream_t>(stream);
+
+    dim3 block(256);
+    dim3 grid((n + block.x - 1) / block.x);
+    if (grid.x == 0) grid.x = 1;
+
+    if (info.is_contiguous()) {
+        softplus_kernel_contiguous<T><<<grid, block, 0, mc_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold());
+    } else {
+        TensorMetadata meta;
+        meta.ndim = info.ndim();
+
+        const auto &shape_vec = info.shape();
+        const auto &stride_vec = info.strides();
+
+        for (int i = 0; i < meta.ndim && i < MAX_DIMS; ++i) {
+            meta.shape[i] = shape_vec[i];
+            meta.strides[i] = stride_vec[i];
+        }
+
+        softplus_kernel_strided<T><<<grid, block, 0, mc_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold(),
+            meta);
+    }
+}
+
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
 
 infiniStatus_t Descriptor::create(
     infiniopHandle_t handle_,
     Descriptor **desc_ptr,
     infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+    infiniopTensorDescriptor_t input_desc,
+    float beta,
+    float threshold) {
 
     auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
+    auto result = SoftplusInfo::create(out_desc, input_desc, beta, threshold);
+    if (!result) {
+        return result.status();
+    }
 
-    // create METAX elementwise descriptor
-    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id
+    );
 
     return INFINI_STATUS_SUCCESS;
 }
@@ -35,26 +189,30 @@ infiniStatus_t Descriptor::calculate(
     void *workspace,
     size_t workspace_size,
     void *output,
-    std::vector<const void *> inputs,
+    const void *input,
     void *stream) const {
 
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
+    (void)workspace;
+    (void)workspace_size;
 
-    switch (_dtype) {
+    switch (_info.dtype()) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::SoftplusOp, half>(_info, workspace, output, inputs, stream);
+        launch_kernel<__half>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::SoftplusOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+        launch_kernel<nv_bfloat16>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::SoftplusOp, float>(_info, workspace, output, inputs, stream);
+        launch_kernel<float>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::SoftplusOp, double>(_info, workspace, output, inputs, stream);
+        launch_kernel<double>(output, input, _info, stream);
+        break;
     default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
 
     return INFINI_STATUS_SUCCESS;
 }
+
 } // namespace op::softplus::metax
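The strided kernels above recover multi-dimensional coordinates from a flat output index and re-project them through the input strides. A host-side sketch of the same transform (plain C++; names are illustrative):

#include <cassert>
#include <cstdint>

// Map a flat row-major index to an element offset under arbitrary strides,
// exactly as softplus_kernel_strided does per thread.
static int64_t stridedOffset(int64_t idx, int ndim,
                             const int64_t *shape, const int64_t *strides) {
    int64_t offset = 0;
    for (int d = ndim - 1; d >= 0; --d) {
        int64_t coord = idx % shape[d]; // coordinate along dim d
        idx /= shape[d];
        offset += coord * strides[d];   // re-project through input strides
    }
    return offset;
}

int main() {
    // A 2x3 view with strides {1, 2}, i.e. a transposed 3x2 buffer.
    const int64_t shape[] = {2, 3}, strides[] = {1, 2};
    assert(stridedOffset(0, 2, shape, strides) == 0); // (0,0)
    assert(stridedOffset(1, 2, shape, strides) == 2); // (0,1)
    assert(stridedOffset(4, 2, shape, strides) == 3); // (1,1) -> 1*1 + 1*2
}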
diff --git a/src/infiniop/ops/softplus/moore/softplus_moore.h b/src/infiniop/ops/softplus/moore/softplus_moore.h
new file mode 100644
index 000000000..fce1bd895
--- /dev/null
+++ b/src/infiniop/ops/softplus/moore/softplus_moore.h
@@ -0,0 +1,8 @@
+#ifndef __SOFTPLUS_MOORE_H__
+#define __SOFTPLUS_MOORE_H__
+
+#include "../softplus.h"
+
+DESCRIPTOR(moore)
+
+#endif // __SOFTPLUS_MOORE_H__
diff --git a/src/infiniop/ops/softplus/moore/softplus_moore.mu b/src/infiniop/ops/softplus/moore/softplus_moore.mu
new file mode 100644
index 000000000..275637045
--- /dev/null
+++ b/src/infiniop/ops/softplus/moore/softplus_moore.mu
@@ -0,0 +1,181 @@
+#include "softplus_moore.h"
+#include "softplus_moore_kernel.h"
+#include "../../../handle.h"
+#include <musa_runtime.h>
+#include <cstdint>
+#include "../../../devices/moore/moore_handle.h"
+
+namespace op::softplus::moore {
+
+static constexpr int MAX_DIMS = 8;
+
+struct TensorMetadata {
+    int ndim;
+    int64_t shape[MAX_DIMS];
+    int64_t strides[MAX_DIMS];
+};
+
+// ==================================================================
+// Kernel 1: contiguous-memory path
+// ==================================================================
+template <typename T>
+__global__ void softplus_kernel_contiguous(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        op::softplus::moore::SoftplusOp functor;
+        output[idx] = functor(input[idx], beta, threshold);
+    }
+}
+
+// ==================================================================
+// Kernel 2: non-contiguous (strided) path
+// ==================================================================
+template <typename T>
+__global__ void softplus_kernel_strided(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold,
+    TensorMetadata meta) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        size_t input_offset = 0;
+        size_t temp_idx = idx;
+
+        #pragma unroll
+        for (int d = meta.ndim - 1; d >= 0; --d) {
+            size_t dim_size = meta.shape[d];
+            size_t coord = temp_idx % dim_size;
+            temp_idx /= dim_size;
+            input_offset += coord * meta.strides[d];
+        }
+
+        op::softplus::moore::SoftplusOp functor;
+        output[idx] = functor(input[input_offset], beta, threshold);
+    }
+}
+
+// ==================================================================
+// Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const SoftplusInfo &info,
+    void *stream) {
+
+    size_t n = info.num_elements();
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    dim3 block(256);
+    dim3 grid((n + block.x - 1) / block.x);
+    if (grid.x == 0) grid.x = 1;
+
+    if (info.is_contiguous()) {
+        softplus_kernel_contiguous<T><<<grid, block, 0, musa_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold()
+        );
+    }
+    else {
+        TensorMetadata meta;
+        meta.ndim = info.ndim();
+
+        const auto& shape_vec = info.shape();
+        const auto& stride_vec = info.strides();
+
+        for (int i = 0; i < meta.ndim && i < MAX_DIMS; ++i) {
+            meta.shape[i] = shape_vec[i];
+            meta.strides[i] = stride_vec[i];
+        }
+
+        softplus_kernel_strided<T><<<grid, block, 0, musa_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold(),
+            meta
+        );
+    }
+}
+
+// ==================================================================
+// Descriptor Implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    infiniopTensorDescriptor_t input_desc,
+    float beta,
+    float threshold) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+
+    auto result = SoftplusInfo::create(out_desc, input_desc, beta, threshold);
+    if (!result) {
+        return result.status();
+    }
+
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id
+    );
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    const void *input,
+    void *stream) const {
+
+    switch (_info.dtype()) {
+    case INFINI_DTYPE_F16:
+        launch_kernel<half>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_BF16:
+        launch_kernel<__mt_bfloat16>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F32:
+        launch_kernel<float>(output, input, _info, stream);
+        break;
+    case INFINI_DTYPE_F64:
+        launch_kernel<double>(output, input, _info, stream);
+        break;
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::softplus::moore
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/moore/softplus_moore_kernel.h b/src/infiniop/ops/softplus/moore/softplus_moore_kernel.h
new file mode 100644
index 000000000..e1492702e
--- /dev/null
+++ b/src/infiniop/ops/softplus/moore/softplus_moore_kernel.h
@@ -0,0 +1,64 @@
+#ifndef __SOFTPLUS_MOORE_KERNEL_H__
+#define __SOFTPLUS_MOORE_KERNEL_H__
+
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+namespace op::softplus::moore {
+
+struct SoftplusOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x, float beta, float threshold) const {
+
+        // 1. Half (FP16)
+        if constexpr (std::is_same_v<T, half>) {
+            float xf = __half2float(x);
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : ::log1pf(::expf(bx)) / beta;
+            return __float2half(out);
+        }
+        // 2. BFloat16 (BF16)
+        else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+            float xf = __bfloat162float(x);
+            float bx = beta * xf;
+            float out = (bx > threshold) ? xf : ::log1pf(::expf(bx)) / beta;
+            return __float2bfloat16(out);
+        }
+        // 3. Half2 (FP16 Vector)
+        else if constexpr (std::is_same_v<T, half2>) {
+            float2 xf = __half22float2(x);
+            float2 out;
+            float bx_x = beta * xf.x;
+            out.x = (bx_x > threshold) ? xf.x : ::log1pf(::expf(bx_x)) / beta;
+            float bx_y = beta * xf.y;
+            out.y = (bx_y > threshold) ? xf.y : ::log1pf(::expf(bx_y)) / beta;
+            return __floats2half2_rn(out.x, out.y);
+        }
+        // 4. Float / Double
+        else {
+            using CalcType = std::conditional_t<std::is_same_v<T, double>, double, float>;
+            CalcType x_val = static_cast<CalcType>(x);
+            CalcType b_val = static_cast<CalcType>(beta);
+            CalcType t_val = static_cast<CalcType>(threshold);
+            CalcType bx = b_val * x_val;
+
+            if (bx > t_val) {
+                return static_cast<T>(x_val);
+            } else {
+                if constexpr (std::is_same_v<T, double>) {
+                    return static_cast<T>(::log1p(::exp(bx)) / b_val);
+                } else {
+                    return static_cast<T>(::log1pf(::expf(bx)) / b_val);
+                }
+            }
+        }
+    }
+};
+
+} // namespace op::softplus::moore
+
+#endif // __SOFTPLUS_MOORE_KERNEL_H__
\ No newline at end of file
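Every backend implements the same piecewise definition: softplus(x) = log1p(exp(beta*x)) / beta, passing x through unchanged once beta*x exceeds the threshold (PyTorch's default threshold is 20), which avoids exp overflow while matching the linear asymptote. A scalar reference sketch (plain C++; the helper name is illustrative):

#include <cassert>
#include <cmath>

// Reference softplus with PyTorch-style linearization above the threshold.
static float softplusRef(float x, float beta = 1.0f, float threshold = 20.0f) {
    float bx = beta * x;
    return (bx > threshold) ? x : std::log1p(std::exp(bx)) / beta;
}

int main() {
    // Above the threshold the pass-through branch is taken; the exact value
    // differs from x only by about exp(-25), far below fp32 resolution.
    assert(std::fabs(softplusRef(25.0f) - 25.0f) < 1e-6f);
    // Near zero, softplus(0) == log(2).
    assert(std::fabs(softplusRef(0.0f) - 0.6931472f) < 1e-6f);
}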
diff --git a/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cu b/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cu
index 392e2dc18..460407cc8 100644
--- a/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cu
+++ b/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cu
@@ -1,31 +1,152 @@
-#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
-
-#include "../cuda/kernel.cuh"
 #include "softplus_nvidia.cuh"
-
+#include "../cuda/kernel.cuh"
+#include "../../../handle.h"
+#include <cuda_runtime.h>
+#include <cstdint>
+#include "../../../devices/nvidia/nvidia_common.cuh"
 namespace op::softplus::nvidia {
 
-Descriptor::~Descriptor() = default;
+// Maximum supported rank; metadata is passed to the kernel by value.
+static constexpr int MAX_DIMS = 8;
+
+struct TensorMetadata {
+    int ndim;
+    int64_t shape[MAX_DIMS];
+    int64_t strides[MAX_DIMS];
+};
+
+// ==================================================================
+// Kernel 1: contiguous-memory path (fast path)
+// ==================================================================
+template <typename T>
+__global__ void softplus_kernel_contiguous(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        op::softplus::cuda::SoftplusOp functor;
+        output[idx] = functor(input[idx], beta, threshold);
+    }
+}
+
+// ==================================================================
+// Kernel 2: non-contiguous (strided) path
+// ==================================================================
+template <typename T>
+__global__ void softplus_kernel_strided(
+    T *output,
+    const T *input,
+    size_t n,
+    float beta,
+    float threshold,
+    TensorMetadata meta) { // metadata passed by value
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        size_t input_offset = 0;
+        size_t temp_idx = idx;
+
+        // Coordinate transform: linear index -> coordinate -> strided offset
+        #pragma unroll
+        for (int d = meta.ndim - 1; d >= 0; --d) {
+            size_t dim_size = meta.shape[d];
+            size_t coord = temp_idx % dim_size;
+            temp_idx /= dim_size;
+            input_offset += coord * meta.strides[d];
+        }
+
+        op::softplus::cuda::SoftplusOp functor;
+        output[idx] = functor(input[input_offset], beta, threshold);
+    }
+}
+
+// ==================================================================
+// Launch Logic
+// ==================================================================
+template <typename T>
+void launch_kernel(
+    void *output,
+    const void *input,
+    const SoftplusInfo &info,
+    void *stream) {
+
+    size_t n = info.num_elements();
+    auto cuda_stream = reinterpret_cast<cudaStream_t>(stream);
+
+    dim3 block(256);
+    dim3 grid((n + block.x - 1) / block.x);
+    if (grid.x == 0) grid.x = 1;
+
+    if (info.is_contiguous()) {
+        softplus_kernel_contiguous<T><<<grid, block, 0, cuda_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold()
+        );
+    }
+    else {
+        // Prepare metadata
+        TensorMetadata meta;
+        meta.ndim = info.ndim();
+
+        const auto& shape_vec = info.shape();
+        const auto& stride_vec = info.strides();
+
+        for (int i = 0; i < meta.ndim && i < MAX_DIMS; ++i) {
+            meta.shape[i] = shape_vec[i];
+            meta.strides[i] = stride_vec[i];
+        }
+
+        softplus_kernel_strided<T><<<grid, block, 0, cuda_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            n,
+            info.beta(),
+            info.threshold(),
+            meta
+        );
+    }
+}
+
+// ==================================================================
+// Descriptor Implementation
+// ==================================================================
+struct Descriptor::Opaque {};
+
+Descriptor::~Descriptor() {
+    if (_opaque) {
+        delete _opaque;
+        _opaque = nullptr;
+    }
+}
 
 infiniStatus_t Descriptor::create(
     infiniopHandle_t handle_,
     Descriptor **desc_ptr,
     infiniopTensorDescriptor_t out_desc,
-    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+    infiniopTensorDescriptor_t input_desc,
+    float beta,
+    float threshold) {
 
     auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
-    auto dtype = out_desc->dtype();
-
-    const auto &x_desc = input_desc_vec.at(0);
-    const auto &y_shape = out_desc->shape();
-    const auto &x_shape = x_desc->shape();
-    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16);
-
-    CHECK_SAME_SHAPE(y_shape, x_shape);
+    auto result = SoftplusInfo::create(out_desc, input_desc, beta, threshold);
+    if (!result) {
+        return result.status();
+    }
 
-    // create CUDA elementwise descriptor
-    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+    *desc_ptr = new Descriptor(
+        new Opaque(),
+        result.take(),
+        0,
+        handle->device,
+        handle->device_id
+    );
 
     return INFINI_STATUS_SUCCESS;
 }
@@ -34,26 +155,27 @@ infiniStatus_t Descriptor::calculate(
     void *workspace,
     size_t workspace_size,
     void *output,
-    std::vector<const void *> inputs,
+    const void *input,
     void *stream) const {
 
-    if (workspace_size < _workspace_size) {
-        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
-    }
-
-    switch (_dtype) {
+    switch (_info.dtype()) {
     case INFINI_DTYPE_F16:
-        return _device_info->calculate<256, cuda::SoftplusOp, half>(_info, workspace, output, inputs, stream);
+        launch_kernel<half>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_BF16:
-        return _device_info->calculate<256, cuda::SoftplusOp, cuda_bfloat16>(_info, workspace, output, inputs, stream);
+        launch_kernel<cuda_bfloat16>(output, input, _info, stream);
+        break;
     case INFINI_DTYPE_F32:
-        return _device_info->calculate<256, cuda::SoftplusOp, float>(_info, workspace, output, inputs, stream);
+        launch_kernel<float>(output, input, _info, stream);
+        break;
    case INFINI_DTYPE_F64:
-        return _device_info->calculate<256, cuda::SoftplusOp, double>(_info, workspace, output, inputs, stream);
+        launch_kernel<double>(output, input, _info, stream);
+        break;
     default:
         return INFINI_STATUS_BAD_TENSOR_DTYPE;
     }
 
     return INFINI_STATUS_SUCCESS;
 }
-} // namespace op::softplus::nvidia
+
+} // namespace op::softplus::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cuh b/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cuh
index ef0261eac..d87fc6631 100644
--- a/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cuh
+++ b/src/infiniop/ops/softplus/nvidia/softplus_nvidia.cuh
@@ -1,8 +1,8 @@
-#ifndef __SOFTPLUS_CUDA_API_H__
-#define __SOFTPLUS_CUDA_API_H__
+#ifndef __SOFTPLUS_NVIDIA_CUH__
+#define __SOFTPLUS_NVIDIA_CUH__
 
-#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+#include "../softplus.h"
 
-ELEMENTWISE_DESCRIPTOR(softplus, nvidia)
+DESCRIPTOR(nvidia)
 
-#endif // __SOFTPLUS_CUDA_API_H__
+#endif // __SOFTPLUS_NVIDIA_CUH__
\ No newline at end of file
diff --git a/src/infiniop/ops/softplus/operator.cc b/src/infiniop/ops/softplus/operator.cc
index 7d09bad6c..146989c70 100644
--- a/src/infiniop/ops/softplus/operator.cc
+++ b/src/infiniop/ops/softplus/operator.cc
@@ -14,12 +14,25 @@
 #ifdef ENABLE_KUNLUN_API
 #include "kunlun/softplus_kunlun.h"
 #endif
+// Moore backend header
+#ifdef ENABLE_MOORE_API
+#include "moore/softplus_moore.h"
+#endif
+
+// =======================================================================
+// [Key fix] Define the descriptor struct.
+// InfiniopSoftplusDescriptor must be defined here, inheriting from
+// InfiniopDescriptor, so the compiler can resolve desc->device_type.
+// =======================================================================
+struct InfiniopSoftplusDescriptor : public InfiniopDescriptor {};
 
 __C infiniStatus_t infiniopCreateSoftplusDescriptor(
     infiniopHandle_t handle,
     infiniopSoftplusDescriptor_t *desc_ptr,
     infiniopTensorDescriptor_t y_desc,
-    infiniopTensorDescriptor_t x_desc) {
+    infiniopTensorDescriptor_t x_desc,
+    float beta,
+    float threshold) {
 
 #define CREATE(CASE, NAMESPACE) \
     case CASE: \
@@ -27,7 +40,9 @@ __C infiniStatus_t infiniopCreateSoftplusDescriptor(
             handle, \
             reinterpret_cast<op::softplus::NAMESPACE::Descriptor **>(desc_ptr), \
             y_desc, \
-            {x_desc})
+            {x_desc}, \
+            beta, \
+            threshold)
 
     switch (handle->device) {
 
@@ -48,6 +63,10 @@ __C infiniStatus_t infiniopCreateSoftplusDescriptor(
 #endif
 #ifdef ENABLE_KUNLUN_API
     CREATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+// Moore branch
+#ifdef ENABLE_MOORE_API
+    CREATE(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -81,6 +100,10 @@ __C infiniStatus_t infiniopGetSoftplusWorkspaceSize(infiniopSoftplusDescriptor_t
 #endif
 #ifdef ENABLE_KUNLUN_API
     GET(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+// Moore branch
+#ifdef ENABLE_MOORE_API
+    GET(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -98,8 +121,8 @@ __C infiniStatus_t infiniopSoftplus(
     const void *x,
     void *stream) {
 
-#define CALCULATE(CASE, NAMESPACE) \
-    case CASE: \
+#define CALCULATE(CASE, NAMESPACE)  \
+    case CASE:                      \
         return reinterpret_cast<const op::softplus::NAMESPACE::Descriptor *>(desc) \
             ->calculate(workspace, workspace_size, y, {x}, stream)
 
@@ -122,6 +145,10 @@ __C infiniStatus_t infiniopSoftplus(
 #endif
 #ifdef ENABLE_KUNLUN_API
     CALCULATE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+// Moore branch
+#ifdef ENABLE_MOORE_API
+    CALCULATE(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -157,10 +184,14 @@ infiniopDestroySoftplusDescriptor(infiniopSoftplusDescriptor_t desc) {
 #endif
 #ifdef ENABLE_KUNLUN_API
     DELETE(INFINI_DEVICE_KUNLUN, kunlun);
+#endif
+// Moore branch
+#ifdef ENABLE_MOORE_API
+    DELETE(INFINI_DEVICE_MOORE, moore);
 #endif
     default:
         return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
     }
 
 #undef DELETE
-}
+}
\ No newline at end of file
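With beta and threshold now threaded through the C API, a typical caller follows the create/query/run/destroy sequence below. This is a minimal sketch: the CHECK macro and runSoftplus helper are illustrative, and the handle, descriptors, buffers, and stream are assumed to come from the usual infiniop setup calls.

#include <infiniop.h>

// Hypothetical helper: bail out on any non-success status (illustrative only).
#define CHECK(call) do { if ((call) != INFINI_STATUS_SUCCESS) return -1; } while (0)

int runSoftplus(infiniopHandle_t handle,
                infiniopTensorDescriptor_t y_desc,
                infiniopTensorDescriptor_t x_desc,
                void *y, const void *x, void *stream) {
    infiniopSoftplusDescriptor_t desc;
    CHECK(infiniopCreateSoftplusDescriptor(handle, &desc, y_desc, x_desc,
                                           /*beta=*/1.0f, /*threshold=*/20.0f));

    size_t workspace_size = 0;
    CHECK(infiniopGetSoftplusWorkspaceSize(desc, &workspace_size)); // 0 for these backends

    CHECK(infiniopSoftplus(desc, /*workspace=*/nullptr, workspace_size, y, x, stream));
    CHECK(infiniopDestroySoftplusDescriptor(desc));
    return 0;
}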
diff --git a/src/infiniop/ops/softplus/softplus.h b/src/infiniop/ops/softplus/softplus.h
new file mode 100644
index 000000000..37b25baa2
--- /dev/null
+++ b/src/infiniop/ops/softplus/softplus.h
@@ -0,0 +1,49 @@
+#ifndef __SOFTPLUS_H__
+#define __SOFTPLUS_H__
+
+#include "../../operator.h"
+#include "info.h" // pulls in the matching SoftplusInfo definition
+
+// Macro that generates the Descriptor class inside each backend namespace.
+#define DESCRIPTOR(NAMESPACE)                                      \
+    namespace op::softplus::NAMESPACE {                            \
+    class Descriptor final : public InfiniopDescriptor {           \
+        struct Opaque;                                             \
+        Opaque *_opaque;                                           \
+        SoftplusInfo _info;                                        \
+        size_t _workspace_size;                                    \
+                                                                   \
+        Descriptor(                                                \
+            Opaque *opaque,                                        \
+            SoftplusInfo info,                                     \
+            size_t workspace_size,                                 \
+            infiniDevice_t device_type,                            \
+            int device_id)                                         \
+            : InfiniopDescriptor{device_type, device_id},          \
+              _opaque(opaque),                                     \
+              _info(info),                                         \
+              _workspace_size(workspace_size) {}                   \
+                                                                   \
+    public:                                                        \
+        ~Descriptor();                                             \
+                                                                   \
+        size_t workspaceSize() const { return _workspace_size; }   \
+                                                                   \
+        static infiniStatus_t create(                              \
+            infiniopHandle_t handle,                               \
+            Descriptor **desc_ptr,                                 \
+            infiniopTensorDescriptor_t out_desc,                   \
+            infiniopTensorDescriptor_t input_desc,                 \
+            float beta,                                            \
+            float threshold);                                      \
+                                                                   \
+        infiniStatus_t calculate(                                  \
+            void *workspace,                                       \
+            size_t workspace_size,                                 \
+            void *output,                                          \
+            const void *input,                                     \
+            void *stream) const;                                   \
+    };                                                             \
+    }
+
+#endif // __SOFTPLUS_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/cpu/softsign_cpu.cc b/src/infiniop/ops/softsign/cpu/softsign_cpu.cc
new file mode 100644
index 000000000..bf1a5f757
--- /dev/null
+++ b/src/infiniop/ops/softsign/cpu/softsign_cpu.cc
@@ -0,0 +1,55 @@
+#include "softsign_cpu.h"
+
+namespace op::softsign::cpu {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Softsign takes a single input.
+    const auto &x_desc = input_desc_vec.at(0);
+
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64, INFINI_DTYPE_BF16, INFINI_DTYPE_I32, INFINI_DTYPE_I64);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // Dispatch on dtype and evaluate SoftsignOp elementwise.
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<SoftsignOp, fp16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<SoftsignOp, float>(_info, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<SoftsignOp, double>(_info, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<SoftsignOp, bf16_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I32:
+        return _device_info->calculate<SoftsignOp, int32_t>(_info, output, inputs, stream);
+    case INFINI_DTYPE_I64:
+        return _device_info->calculate<SoftsignOp, int64_t>(_info, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::softsign::cpu
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/cpu/softsign_cpu.h b/src/infiniop/ops/softsign/cpu/softsign_cpu.h
new file mode 100644
index 000000000..02ada11a3
--- /dev/null
+++ b/src/infiniop/ops/softsign/cpu/softsign_cpu.h
@@ -0,0 +1,21 @@
+#ifndef __SOFTSIGN_CPU_H__
+#define __SOFTSIGN_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+#include <cmath>
+
+ELEMENTWISE_DESCRIPTOR(softsign, cpu)
+
+namespace op::softsign::cpu {
+typedef struct SoftsignOp {
+public:
+    static constexpr size_t num_inputs = 1;
+
+    // y = x / (1 + |x|)
+    template <typename T>
+    T operator()(const T &x) const {
+        return x / (static_cast<T>(1) + std::abs(x));
+    }
+} SoftsignOp;
+} // namespace op::softsign::cpu
+
+#endif // __SOFTSIGN_CPU_H__
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/cuda/kernel.cuh b/src/infiniop/ops/softsign/cuda/kernel.cuh
new file mode 100644
index 000000000..bc22a2f51
--- /dev/null
+++ b/src/infiniop/ops/softsign/cuda/kernel.cuh
@@ -0,0 +1,42 @@
+#ifndef __SOFTSIGN_CUDA_H__
+#define __SOFTSIGN_CUDA_H__
+
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cmath>
+#include <type_traits>
+
+namespace op::softsign::cuda {
+
+struct SoftsignOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, half2>) {
+            const half2 one = __float2half2_rn(1.0f);
+            const half2 abs_x = __habs2(x);
+            const half2 denom = __hadd2(one, abs_x);
+            return __h2div(x, denom);
+        } else if constexpr (std::is_same_v<T, half>) {
+#if __CUDA_ARCH__ >= 530
+            const half one = __float2half(1.0f);
+            return __hdiv(x, __hadd(one, __habs(x)));
+#else
+            return static_cast<half>(static_cast<float>(x) / (1.0f + fabsf(static_cast<float>(x))));
+#endif
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            // Avoid __habs which is for fp16. Use manual abs or operators to keep bf16 precision.
+            const T abs_x = (x >= T(0.0f)) ? x : -x;
+            return x / (T(1.0f) + abs_x);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return x / (1.0f + fabsf(x));
+        } else {
+            return x / (static_cast<T>(1) + std::abs(x));
+        }
+    }
+};
+
+} // namespace op::softsign::cuda
+
+#endif // __SOFTSIGN_CUDA_H__
\ No newline at end of file
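Softsign, y = x / (1 + |x|), is bounded in (-1, 1) and strictly increasing; the per-dtype branches above all implement this one formula, differing only in the arithmetic type and intrinsics used. A scalar reference sketch (plain C++; the helper name is illustrative):

#include <cassert>
#include <cmath>

// Reference softsign: a smooth, bounded squashing function with no exp/log.
static float softsignRef(float x) { return x / (1.0f + std::fabs(x)); }

int main() {
    assert(softsignRef(0.0f) == 0.0f);
    assert(std::fabs(softsignRef(1.0f) - 0.5f) < 1e-7f);   // 1 / (1 + 1)
    assert(std::fabs(softsignRef(-3.0f) + 0.75f) < 1e-7f); // -3 / (1 + 3)
    assert(std::fabs(softsignRef(1e6f)) < 1.0f);           // never reaches +/-1
}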
diff --git a/src/infiniop/ops/softsign/metax/softsign_metax.h b/src/infiniop/ops/softsign/metax/softsign_metax.h
new file mode 100644
index 000000000..6d4f57a72
--- /dev/null
+++ b/src/infiniop/ops/softsign/metax/softsign_metax.h
@@ -0,0 +1,8 @@
+#ifndef __SOFTSIGN_METAX_API_H__
+#define __SOFTSIGN_METAX_API_H__
+
+#include "../../../elementwise/metax/elementwise_metax_api.h"
+
+ELEMENTWISE_DESCRIPTOR(softsign, metax)
+
+#endif // __SOFTSIGN_METAX_API_H__
diff --git a/src/infiniop/ops/softsign/metax/softsign_metax.maca b/src/infiniop/ops/softsign/metax/softsign_metax.maca
new file mode 100644
index 000000000..0b16252cf
--- /dev/null
+++ b/src/infiniop/ops/softsign/metax/softsign_metax.maca
@@ -0,0 +1,95 @@
+#include "../../../elementwise/metax/elementwise_metax.h"
+#include "softsign_metax.h"
+
+#include
+#include
+#include
+#include
+
+namespace op::softsign::metax {
+
+using nv_bfloat16 = __maca_bfloat16;
+
+struct SoftsignOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        if constexpr (std::is_same_v<T, __half>) {
+            float xf = __half2float(x);
+            float res = xf / (1.0f + fabsf(xf));
+            return __float2half(res);
+        } else if constexpr (std::is_same_v<T, nv_bfloat16>) {
+            T zero = T(0.0f);
+            T one = T(1.0f);
+            T abs_x = (x >= zero) ? x : -x;
+            return x / (one + abs_x);
+        } else if constexpr (std::is_same_v<T, float>) {
+            return x / (1.0f + fabsf(x));
+        } else if constexpr (std::is_same_v<T, double>) {
+            return x / (1.0 + std::fabs(x));
+        } else {
+            using std::abs;
+            return x / (static_cast<T>(1) + abs(x));
+        }
+    }
+};
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::metax::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_F16,
+                INFINI_DTYPE_F32,
+                INFINI_DTYPE_BF16,
+                INFINI_DTYPE_F64);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    CREATE_ELEMENTWISE_METAX_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, SoftsignOp, __half>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, SoftsignOp, nv_bfloat16>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, SoftsignOp, float>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, SoftsignOp, double>(
+            _info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::softsign::metax
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/moore/softsign_moore.h b/src/infiniop/ops/softsign/moore/softsign_moore.h
new file mode 100644
index 000000000..68652b6ba
--- /dev/null
+++ b/src/infiniop/ops/softsign/moore/softsign_moore.h
@@ -0,0 +1,8 @@
+#ifndef __SOFTSIGN_MOORE_API_H__
+#define __SOFTSIGN_MOORE_API_H__
+
+#include "../../../elementwise/moore/elementwise_moore_api.h"
+
+ELEMENTWISE_DESCRIPTOR(softsign, moore)
+
+#endif // __SOFTSIGN_MOORE_API_H__
diff --git a/src/infiniop/ops/softsign/moore/softsign_moore.mu b/src/infiniop/ops/softsign/moore/softsign_moore.mu
new file mode 100644
index 000000000..99fa85a1b
--- /dev/null
+++ b/src/infiniop/ops/softsign/moore/softsign_moore.mu
@@ -0,0 +1,69 @@
+#include "softsign_moore.h"
+#include "softsign_moore_kernel.h"
+
+#include "../../../elementwise/moore/elementwise_moore.h"
+#include
+#include
+
+namespace op::softsign::moore {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Softsign is a unary operator; only index 0 is used.
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+
+    CHECK_DTYPE(dtype,
+                INFINI_DTYPE_F16,
+                INFINI_DTYPE_F32,
+                INFINI_DTYPE_BF16,
+                INFINI_DTYPE_F64);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+
+    // create Moore elementwise descriptor
+    CREATE_ELEMENTWISE_MOORE_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, SoftsignOp, half>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, SoftsignOp, __mt_bfloat16>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, SoftsignOp, float>(
+            _info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, SoftsignOp, double>(
+            _info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+} // namespace op::softsign::moore
diff --git a/src/infiniop/ops/softsign/moore/softsign_moore_kernel.h b/src/infiniop/ops/softsign/moore/softsign_moore_kernel.h
new file mode 100644
index 000000000..d42a81107
--- /dev/null
+++ b/src/infiniop/ops/softsign/moore/softsign_moore_kernel.h
@@ -0,0 +1,141 @@
+#ifndef __SOFTSIGN_MOORE_KERNEL_H__
+#define __SOFTSIGN_MOORE_KERNEL_H__
+
+#include <musa_fp16.h>
+#include <musa_bf16.h>
+#include <cmath>
+#include <cstdint>
+#include <type_traits>
+
+namespace op::softsign::moore {
+
+// ================================================================
+// Type-conversion helpers
+// ================================================================
+template <typename T>
+__device__ __forceinline__ float to_float(T v) {
+    if constexpr (std::is_same_v<T, half>) {
+        return __half2float(v);
+    } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+        return __bfloat162float(v);
+    } else {
+        return static_cast<float>(v);
+    }
+}
+
+template <typename T>
+__device__ __forceinline__ T from_float(float v) {
+    if constexpr (std::is_same_v<T, half>) {
+        return __float2half(v);
+    } else if constexpr (std::is_same_v<T, __mt_bfloat16>) {
+        return __float2bfloat16(v);
+    } else {
+        return static_cast<T>(v);
+    }
+}
+
+// ================================================================
+// Softsign Functor
+// y = x / (1 + |x|)
+// ================================================================
+struct SoftsignOp {
+    static constexpr size_t num_inputs = 1;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &x) const {
+        float xf = to_float(x);
+        float res = xf / (1.0f + fabsf(xf));
+        return from_float<T>(res);
+    }
+};
+
+// ================================================================
+// TensorMetadata
+// ================================================================
+static constexpr int MAX_DIMS = 8;
+
+struct TensorMetadata {
+    int ndim;
+    int64_t shape[MAX_DIMS];
+    int64_t strides[MAX_DIMS];
+};
+
+// ================================================================
+// Kernel 1: contiguous memory
+// ================================================================
+template <typename T>
+__global__ void softsign_kernel_contiguous(T* output, const T* input, size_t n) {
+
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        SoftsignOp functor;
+        output[idx] = functor(input[idx]);
+    }
+}
+
+// ================================================================
+// Kernel 2: non-contiguous (strided) memory
+// ================================================================
+template <typename T>
+__global__ void softsign_kernel_strided(
+    T* output,
+    const T* input,
+    size_t n,
+    TensorMetadata meta
+) {
+    size_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (idx < n) {
+        size_t offset = 0;
+        size_t t = idx;
+
+#pragma unroll
+        for (int d = meta.ndim - 1; d >= 0; --d) {
+            size_t dim_size = meta.shape[d];
+            size_t coord = t % dim_size;
+            t /= dim_size;
+            offset += coord * meta.strides[d];
+        }
+
+        SoftsignOp functor;
+        output[idx] = functor(input[offset]);
+    }
+}
+
+// ================================================================
+// Launch Kernel
+// ================================================================
+template <typename T>
+void launch_kernel(
+    void* output,
+    const void* input,
+    const TensorMetadata& meta,
+    size_t numel,
+    bool is_contiguous,
+    void* stream
+) {
+    auto musa_stream = reinterpret_cast<musaStream_t>(stream);
+
+    dim3 block(256);
+    dim3 grid((numel + block.x - 1) / block.x);
+    if (grid.x == 0) grid.x = 1;
+
+    if (is_contiguous) {
+        softsign_kernel_contiguous<T><<<grid, block, 0, musa_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            numel
+        );
+    } else {
+        softsign_kernel_strided<T><<<grid, block, 0, musa_stream>>>(
+            reinterpret_cast<T *>(output),
+            reinterpret_cast<const T *>(input),
+            numel,
+            meta
+        );
+    }
+}
+
+} // namespace op::softsign::moore
+
+#endif // __SOFTSIGN_MOORE_KERNEL_H__
diff --git a/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cu b/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cu
new file mode 100644
index 000000000..0a5632877
--- /dev/null
+++ b/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cu
@@ -0,0 +1,56 @@
+#include "../../../elementwise/nvidia/elementwise_nvidia.cuh"
+
+#include "../cuda/kernel.cuh"
+#include "softsign_nvidia.cuh"
+
+namespace op::softsign::nvidia {
+
+Descriptor::~Descriptor() = default;
+
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::nvidia::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    // Softsign is a unary operator; only index 0 is used.
+    const auto &x_desc = input_desc_vec.at(0);
+    const auto &y_shape = out_desc->shape();
+    const auto &x_shape = x_desc->shape();
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_BF16, INFINI_DTYPE_F64);
+    CHECK_SAME_SHAPE(y_shape, x_shape);
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    switch (_dtype) {
+    case INFINI_DTYPE_F16:
+        return _device_info->calculate<256, cuda::SoftsignOp, half>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_BF16:
+        return _device_info->calculate<256, cuda::SoftsignOp, nv_bfloat16>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F32:
+        return _device_info->calculate<256, cuda::SoftsignOp, float>(_info, workspace, output, inputs, stream);
+    case INFINI_DTYPE_F64:
+        return _device_info->calculate<256, cuda::SoftsignOp, double>(_info, workspace, output, inputs, stream);
+    default:
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    return INFINI_STATUS_SUCCESS;
+}
+} // namespace op::softsign::nvidia
\ No newline at end of file
diff --git a/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cuh b/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cuh
new file mode 100644
index 000000000..b5b3ad7c8
--- /dev/null
+++ b/src/infiniop/ops/softsign/nvidia/softsign_nvidia.cuh
@@ -0,0 +1,8 @@
+#ifndef __SOFTSIGN_CUDA_API_H__
+#define __SOFTSIGN_CUDA_API_H__
+
+#include "../../../elementwise/nvidia/elementwise_nvidia_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(softsign, nvidia)
+
+#endif // __SOFTSIGN_CUDA_API_H__
\ No newline at end of file
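All launch helpers in this patch size the grid with the usual ceiling division; note that n == 0 would yield a zero-block launch, which is invalid, hence the clamp used on the softplus paths. A host-side sketch of the arithmetic (plain C++; the helper name is illustrative):

#include <cassert>
#include <cstddef>

// Ceiling division used for grid sizing: one block per 256 elements.
static size_t gridSize(size_t n, size_t block = 256) {
    size_t grid = (n + block - 1) / block;
    return grid == 0 ? 1 : grid; // clamp: a 0-block launch is invalid
}

int main() {
    assert(gridSize(1) == 1);
    assert(gridSize(256) == 1);
    assert(gridSize(257) == 2);
    assert(gridSize(0) == 1); // empty tensor still gets a trivial launch
}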
diff --git a/src/infiniop/ops/softsign/operator.cc b/src/infiniop/ops/softsign/operator.cc
new file mode 100644
index 000000000..feadccadb
--- /dev/null
+++ b/src/infiniop/ops/softsign/operator.cc
@@ -0,0 +1,201 @@
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/softsign.h" // must include the public header that declares this API
+
+#ifdef ENABLE_CPU_API
+#include "cpu/softsign_cpu.h"
+#endif
+#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_ILUVATAR_API) || defined(ENABLE_QY_API)
+#include "nvidia/softsign_nvidia.cuh"
+#endif
+#ifdef ENABLE_METAX_API
+#include "metax/softsign_metax.h"
+#endif
+#ifdef ENABLE_MOORE_API
+#include "moore/softsign_moore.h"
+#endif
+
+// -----------------------------------------------------------------------------
+// Struct Definition
+// -----------------------------------------------------------------------------
+
+// Fix: the device tag uses infiniDevice_t.
+struct InfiniopSoftsignDescriptor {
+    infiniDevice_t device_type;
+};
+
+// -----------------------------------------------------------------------------
+// Create Descriptor
+// -----------------------------------------------------------------------------
+
+__C infiniStatus_t infiniopCreateSoftsignDescriptor(
+    infiniopHandle_t handle,
+    infiniopSoftsignDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t y_desc,
+    infiniopTensorDescriptor_t x_desc) {
+
+// The {x_desc} braces build the single-element input vector.
+#define CREATE(CASE, NAMESPACE)                                                  \
+    case CASE: {                                                                 \
+        auto status = op::softsign::NAMESPACE::Descriptor::create(               \
+            handle,                                                              \
+            reinterpret_cast<op::softsign::NAMESPACE::Descriptor **>(desc_ptr),  \
+            y_desc,                                                              \
+            {x_desc});                                                           \
+        if (status == INFINI_STATUS_SUCCESS) {                                   \
+            (*desc_ptr)->device_type = CASE;                                     \
+        }                                                                        \
+        return status;                                                           \
+    }
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CREATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CREATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CREATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CREATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CREATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+// -----------------------------------------------------------------------------
+// Get Workspace Size
+// -----------------------------------------------------------------------------
+
+__C infiniStatus_t infiniopGetSoftsignWorkspaceSize(infiniopSoftsignDescriptor_t desc, size_t *size) {
+
+#define GET(CASE, NAMESPACE)                                                             \
+    case CASE:                                                                           \
+        *size = reinterpret_cast<op::softsign::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        GET(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        GET(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        GET(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        GET(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        GET(INFINI_DEVICE_MOORE, moore);
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+// -----------------------------------------------------------------------------
+// Execute (Calculate)
+// -----------------------------------------------------------------------------
+
+__C infiniStatus_t infiniopSoftsign(
+    infiniopSoftsignDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *y,
+    const void *x,
+    void *stream) {
+
+// The {x} braces build the single-element input vector.
+#define CALCULATE(CASE, NAMESPACE)                                                 \
+    case CASE:                                                                     \
+        return reinterpret_cast<const op::softsign::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, y, {x}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        CALCULATE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        CALCULATE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        CALCULATE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        CALCULATE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+// -----------------------------------------------------------------------------
+// Destroy Descriptor
+// -----------------------------------------------------------------------------
+
+__C infiniStatus_t
+infiniopDestroySoftsignDescriptor(infiniopSoftsignDescriptor_t desc) {
+
+#define DELETE(CASE, NAMESPACE)                                             \
+    case CASE:                                                              \
+        delete reinterpret_cast<op::softsign::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_NVIDIA_API
+        DELETE(INFINI_DEVICE_NVIDIA, nvidia);
+#endif
+#ifdef ENABLE_ILUVATAR_API
+        DELETE(INFINI_DEVICE_ILUVATAR, nvidia);
+#endif
+#ifdef ENABLE_QY_API
+        DELETE(INFINI_DEVICE_QY, nvidia);
+#endif
+#ifdef ENABLE_METAX_API
+        DELETE(INFINI_DEVICE_METAX, metax);
+#endif
+#ifdef ENABLE_MOORE_API
+        DELETE(INFINI_DEVICE_MOORE, moore);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
diff --git a/test/infinicore/ops/broadcast_to.py b/test/infinicore/ops/broadcast_to.py
index 0813cf14f..ff129c0fb 100644
--- a/test/infinicore/ops/broadcast_to.py
+++ b/test/infinicore/ops/broadcast_to.py
@@ -64,9 +64,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.broadcast_to(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.broadcast_to(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.broadcast_to(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/huber_loss.py b/test/infinicore/ops/huber_loss.py
index 592695993..8670b7b04 100644
--- a/test/infinicore/ops/huber_loss.py
+++ b/test/infinicore/ops/huber_loss.py
@@ -66,9 +66,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.huber_loss(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.huber_loss(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.huber_loss(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/softplus.py b/test/infinicore/ops/softplus.py
index c01c6d632..a74a60974 100644
--- a/test/infinicore/ops/softplus.py
+++ b/test/infinicore/ops/softplus.py
@@ -76,9 +76,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.softplus(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.softplus(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.softplus(*args, **kwargs)
 
 
 def main():
diff --git a/test/infinicore/ops/softsign.py b/test/infinicore/ops/softsign.py
index c482e72e5..b55d47539 100644
--- a/test/infinicore/ops/softsign.py
+++ b/test/infinicore/ops/softsign.py
@@ -68,9 +68,8 @@ def get_test_cases(self):
     def torch_operator(self, *args, **kwargs):
         return torch.nn.functional.softsign(*args, **kwargs)
 
-    # def infinicore_operator(self, *args, **kwargs):
-    #     """InfiniCore implementation (operator not yet available)."""
-    #     return infinicore.nn.functional.softsign(*args, **kwargs)
+    def infinicore_operator(self, *args, **kwargs):
+        return infinicore.nn.functional.softsign(*args, **kwargs)
 
 
 def main():